absee 0.1.0.0 → 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/absee.rb +102 -17
- metadata +2 -2
data/lib/absee.rb
CHANGED
@@ -9,8 +9,17 @@
|
|
9
9
|
#
|
10
10
|
# MIT license 2012
|
11
11
|
|
12
|
-
|
12
|
+
class ABSee
|
13
13
|
|
14
|
+
#variables
|
15
|
+
@traceA = []
|
16
|
+
@traceG = []
|
17
|
+
@traceC = []
|
18
|
+
@traceT = []
|
19
|
+
@calledSequence = [] # singular name, matching what get_calledSequence returns and processAB assigns
|
20
|
+
@peakIndexes = []
|
21
|
+
@qualityScores = []
|
22
|
+
|
14
23
|
#opens the ABIF sequencing / chromatogram file
|
15
24
|
#checks for ABIF file type
|
16
25
|
#major ABIF versions greater than 1 are not supported
|
@@ -21,7 +30,7 @@ module Absee
|
|
21
30
|
#
|
22
31
|
#== Returns:
|
23
32
|
# nil; the parsed trace data for A, C, G, T, the called sequence, the quality scores, and the peak indexes are stored on the instance and read through the get_* accessors
|
24
|
-
def
|
33
|
+
def read(filename)
|
25
34
|
#opens ab1 as a File object
|
26
35
|
abFile = open(filename)
|
27
36
|
byteArray = ""
|
@@ -32,12 +41,54 @@ module Absee
|
|
32
41
|
abFile.read(4, byteArray)
|
33
42
|
#ABIF file indicator
|
34
43
|
if byteArray == "ABIF"
|
35
|
-
|
44
|
+
processAB(abFile)
|
36
45
|
else
|
37
|
-
|
46
|
+
raise "file not recognized as ABIF"
|
38
47
|
end
|
39
48
|
end
|
40
49
|
|
50
|
+
|
51
|
+
|
52
|
+
##accessors
|
53
|
+
#== Returns:
|
54
|
+
# an array with the trace data for adenine
|
55
|
+
def get_traceA()
|
56
|
+
return @traceA
|
57
|
+
end
|
58
|
+
#== Returns:
|
59
|
+
# an array with the trace data for guanine
|
60
|
+
def get_traceG()
|
61
|
+
return @traceG
|
62
|
+
end
|
63
|
+
#== Returns:
|
64
|
+
# an array with the trace data for thymine
|
65
|
+
def get_traceT()
|
66
|
+
return @traceT
|
67
|
+
end
|
68
|
+
#== Returns:
|
69
|
+
# an array with the trace data for cytosine
|
70
|
+
def get_traceC()
|
71
|
+
return @traceC
|
72
|
+
end
|
73
|
+
#== Returns:
|
74
|
+
# an array with the Basecalled sequence
|
75
|
+
def get_calledSequence()
|
76
|
+
return @calledSequence
|
77
|
+
end
|
78
|
+
#== Returns:
|
79
|
+
# an array with the Basecalled quality scores
|
80
|
+
def get_qualityScores()
|
81
|
+
return @qualityScores
|
82
|
+
end
|
83
|
+
#== Returns:
|
84
|
+
# an array with the peak indexes
|
85
|
+
def get_peakIndexes()
|
86
|
+
return @peakIndexes
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
private
|
91
|
+
|
41
92
|
#process the opened ABIF filestream, and calls subsequent methods to extract the data
|
42
93
|
#
|
43
94
|
#== Parameters:
|
@@ -46,13 +97,13 @@ module Absee
|
|
46
97
|
#== Returns:
|
47
98
|
#nil; the results are stored in @traceA, @traceC, @traceG, @traceT, @calledSequence, @qualityScores, and @peakIndexes
|
48
99
|
#read returns the results of this method
|
49
|
-
def
|
100
|
+
def processAB(filestream)
|
50
101
|
#// here, we can read the ABIF header information
|
51
102
|
version = readUnsignedByte_2(4, filestream)
|
52
103
|
#// major versions greater than 1 are not supported
|
53
104
|
#// Applied Biosystems rules
|
54
105
|
if (version / 100 > 1)
|
55
|
-
|
106
|
+
raise "ABIF version #{version} not supported (only major versions up to 1 are supported)" # the guard rejects version / 100 > 1, i.e. major versions greater than 1
|
56
107
|
end
|
57
108
|
#// we just read ABIF, so we don't need more information than that
|
58
109
|
numElements = readUnsignedByte_4(18, filestream)
|
@@ -61,10 +112,19 @@ module Absee
|
|
61
112
|
numSamples, numBases = gatherInformation(directory, numElements)
|
62
113
|
samples_a, samples_c, samples_g, samples_t = getSamples(filestream, directory, numElements, numSamples)
|
63
114
|
called_sequence = getCalledSequence(filestream, directory, numElements, numBases)
|
64
|
-
|
65
|
-
|
115
|
+
quality_scores = getQualityScores(filestream, directory, numElements, numBases)
|
116
|
+
peak_indexes = getPeakIndexes(filestream, directory, numElements, numBases)
|
117
|
+
##return samples_a, samples_c, samples_g, samples_t, called_sequence, peak_indexes, quality_scores
|
118
|
+
@traceA = samples_a
|
119
|
+
@traceC = samples_c
|
120
|
+
@traceG = samples_g
|
121
|
+
@traceT = samples_t
|
122
|
+
@calledSequence = called_sequence
|
123
|
+
@qualityScores = quality_scores
|
124
|
+
@peakIndexes = peak_indexes
|
125
|
+
nil
|
66
126
|
end
|
67
|
-
|
127
|
+
|
68
128
|
#reads 2 unsigned bytes and orders by most significant byte first
|
69
129
|
#
|
70
130
|
#== Parameters:
|
@@ -73,7 +133,7 @@ module Absee
|
|
73
133
|
#
|
74
134
|
#== Returns:
|
75
135
|
#an int ordered by most significant byte first
|
76
|
-
def
|
136
|
+
def readUnsignedByte_2(offset, filestream)
|
77
137
|
#// most significant byte first
|
78
138
|
#// |byte0|byte1| <= |unsigned int|
|
79
139
|
byteArray = ""
|
@@ -90,7 +150,7 @@ module Absee
|
|
90
150
|
#
|
91
151
|
#== Returns:
|
92
152
|
#an int ordered by most significant byte first
|
93
|
-
def
|
153
|
+
def readUnsignedByte_4(offset, filestream)
|
94
154
|
byteArray = ""
|
95
155
|
filestream.seek(offset, IO::SEEK_SET)
|
96
156
|
byteArray = filestream.read(4, byteArray)
|
@@ -109,7 +169,7 @@ module Absee
|
|
109
169
|
#== Returns:
|
110
170
|
#an array of arrays, each with information from the directory
|
111
171
|
#[name, tag number, element type, element size, number of elements, data size, data offset]
|
112
|
-
def
|
172
|
+
def readDirectoryEntry(filestream, dataOffset, numElements)
|
113
173
|
filestream.seek(dataOffset, IO::SEEK_SET)
|
114
174
|
byteArray = ""
|
115
175
|
filestream.read(28*numElements, byteArray)
|
@@ -163,7 +223,7 @@ module Absee
|
|
163
223
|
#
|
164
224
|
#== Returns:
|
165
225
|
#the element from the array
|
166
|
-
def
|
226
|
+
def get(array, element)
|
167
227
|
if element == "name"
|
168
228
|
return array[0]
|
169
229
|
elsif element == "tag_number"
|
@@ -191,7 +251,7 @@ module Absee
|
|
191
251
|
#
|
192
252
|
#== Returns:
|
193
253
|
#number of samples and number of bases contained in this ABIF file
|
194
|
-
def
|
254
|
+
def gatherInformation(directory, numElements)
|
195
255
|
numSamples = 0
|
196
256
|
numBases = 0
|
197
257
|
|
@@ -218,7 +278,7 @@ module Absee
|
|
218
278
|
#
|
219
279
|
#== Returns:
|
220
280
|
#four arrays with trace data in the order ACGT
|
221
|
-
def
|
281
|
+
def getSamples(filestream, directory, numElements, numSamples)
|
222
282
|
samples_a = []
|
223
283
|
samples_c = []
|
224
284
|
samples_g = []
|
@@ -268,7 +328,7 @@ module Absee
|
|
268
328
|
#
|
269
329
|
#== Returns:
|
270
330
|
#an array with the called sequence
|
271
|
-
def
|
331
|
+
def getCalledSequence(filestream, directory, numElements, numBases)
|
272
332
|
calledSequence = []
|
273
333
|
(0..numElements-1).each do |i|
|
274
334
|
if (get(directory[i], "name") == "PBAS") && (get(directory[i], "tag_number") == 2)
|
@@ -283,6 +343,31 @@ module Absee
|
|
283
343
|
return calledSequence
|
284
344
|
end
|
285
345
|
|
346
|
+
#extracts the quality score associated with the called sequence
|
347
|
+
#
|
348
|
+
#== Parameters:
|
349
|
+
#filestream:: an open File
|
350
|
+
#directory:: an array of arrays generated by readDirectoryEntry
|
351
|
+
#numElements:: an int indicating the number of elements in this ABIF file
|
352
|
+
#numBases:: an int calculated by gatherInformation
|
353
|
+
#
|
354
|
+
#== Returns:
|
355
|
+
#an array with the quality scores
|
356
|
+
def getQualityScores(filestream, directory, numElements, numBases)
|
357
|
+
qualityScore = []
|
358
|
+
(0..numElements-1).each do |i|
|
359
|
+
if (get(directory[i], "name") == "PCON") && (get(directory[i], "tag_number") == 2)
|
360
|
+
byteArray_seq = ""
|
361
|
+
filestream.seek(get(directory[i], "data_offset"))
|
362
|
+
filestream.read(numBases,byteArray_seq)
|
363
|
+
(0..numBases-1).each do |j|
|
364
|
+
qualityScore[j] = byteArray_seq.getbyte(j)
|
365
|
+
end
|
366
|
+
end
|
367
|
+
end
|
368
|
+
return qualityScore
|
369
|
+
end
|
370
|
+
|
286
371
|
#extracts the peak indexes for the called bases
|
287
372
|
#
|
288
373
|
#== Parameters:
|
@@ -293,7 +378,7 @@ module Absee
|
|
293
378
|
#
|
294
379
|
#== Returns:
|
295
380
|
#an array with the indexes of the peaks
|
296
|
-
def
|
381
|
+
def getPeakIndexes(filestream, directory, numElements, numBases)
|
297
382
|
peakIndexes = []
|
298
383
|
(0..numElements-1).each do |i|
|
299
384
|
if (get(directory[i], "name") == "PLOC") && (get(directory[i], "tag_number") == 2)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: absee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: '1.0'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -12,7 +12,7 @@ cert_chain: []
|
|
12
12
|
date: 2012-11-14 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: .ab1 reader / ABIF reader; extracts the peak indexes, called sequence,
|
15
|
-
and ACGT values from sequencing files
|
15
|
+
quality scores, and ACGT values from sequencing files
|
16
16
|
email: jencheng@ginkgobioworks.com
|
17
17
|
executables: []
|
18
18
|
extensions: []
|