absee 0.1.0.0 → 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/absee.rb +102 -17
- metadata +2 -2
data/lib/absee.rb
CHANGED
@@ -9,8 +9,17 @@
|
|
9
9
|
#
|
10
10
|
# MIT license 2012
|
11
11
|
|
12
|
-
|
12
|
+
class ABSee
|
13
13
|
|
14
|
+
#variables
|
15
|
+
# Gives each new object its own empty result arrays, so the get_* accessors
# return [] instead of nil until read has filled them in.
# This lives in initialize rather than the class body because assignments made
# directly in the class body create variables on the ABSee class object itself,
# which the instance-level accessors never see.
def initialize
  @traceA = []
  @traceG = []
  @traceC = []
  @traceT = []
  # singular name: this is the variable processAB assigns and get_calledSequence returns
  @calledSequence = []
  @peakIndexes = []
  @qualityScores = []
end
|
22
|
+
|
14
23
|
#opens the ABIF sequencing / chromatogram file
|
15
24
|
#checks for ABIF file type
|
16
25
|
#major ABIF versions greater than 1 are not supported
|
@@ -21,7 +30,7 @@ module Absee
|
|
21
30
|
#
|
22
31
|
#== Returns:
|
23
32
|
# nil on success; the parsed traces, called sequence, quality scores, and peak indexes are stored on the object and read back through the get_* accessors
|
24
|
-
def
|
33
|
+
def read(filename)
|
25
34
|
#opens ab1 as a File object
|
26
35
|
abFile = open(filename)
|
27
36
|
byteArray = ""
|
@@ -32,12 +41,54 @@ module Absee
|
|
32
41
|
abFile.read(4, byteArray)
|
33
42
|
#ABIF file indicator
|
34
43
|
if byteArray == "ABIF"
|
35
|
-
|
44
|
+
processAB(abFile)
|
36
45
|
else
|
37
|
-
|
46
|
+
raise "file not recognized as ABIF"
|
38
47
|
end
|
39
48
|
end
|
40
49
|
|
50
|
+
|
51
|
+
|
52
|
+
##accessors
|
53
|
+
#== Returns:
|
54
|
+
# an array with the trace data for adenine
|
55
|
+
def get_traceA
  # Hands back the object currently held in @traceA; no copy is made.
  @traceA
end
|
58
|
+
#== Returns:
|
59
|
+
# an array with the trace data for guanine
|
60
|
+
def get_traceG
  # Hands back the object currently held in @traceG; no copy is made.
  @traceG
end
|
63
|
+
#== Returns:
|
64
|
+
# an array with the trace data for thymine
|
65
|
+
def get_traceT
  # Hands back the object currently held in @traceT; no copy is made.
  @traceT
end
|
68
|
+
#== Returns:
|
69
|
+
# an array with the trace data for cytosine
|
70
|
+
def get_traceC
  # Hands back the object currently held in @traceC; no copy is made.
  @traceC
end
|
73
|
+
#== Returns:
|
74
|
+
# an array with the Basecalled sequence
|
75
|
+
def get_calledSequence
  # Hands back the object currently held in @calledSequence; no copy is made.
  @calledSequence
end
|
78
|
+
#== Returns:
|
79
|
+
# an array with the Basecalled quality scores
|
80
|
+
def get_qualityScores
  # Hands back the object currently held in @qualityScores; no copy is made.
  @qualityScores
end
|
83
|
+
#== Returns:
|
84
|
+
# an array with the peak indexes
|
85
|
+
def get_peakIndexes
  # Hands back the object currently held in @peakIndexes; no copy is made.
  @peakIndexes
end
|
88
|
+
|
89
|
+
|
90
|
+
private
|
91
|
+
|
41
92
|
#process the opened ABIF filestream, and calls subsequent methods to extract the data
|
42
93
|
#
|
43
94
|
#== Parameters:
|
@@ -46,13 +97,13 @@ module Absee
|
|
46
97
|
#== Returns:
|
47
98
|
#nil; the trace data for A, C, G, T, the called sequence, the quality scores, and the peak indexes are stored in instance variables
|
48
99
|
#read returns the results of this method
|
49
|
-
def
|
100
|
+
def processAB(filestream)
|
50
101
|
#// here, we can read the ABIF header information
|
51
102
|
version = readUnsignedByte_2(4, filestream)
|
52
103
|
#// major versions greater than 1 are not supported
|
53
104
|
#// Applied Biosystems rules
|
54
105
|
if (version / 100 > 1)
|
55
|
-
|
106
|
+
raise "ABIF version #{version} not supported (only supported for version less than 1)"
|
56
107
|
end
|
57
108
|
#// we just read ABIF, so we don't need more information than that
|
58
109
|
numElements = readUnsignedByte_4(18, filestream)
|
@@ -61,10 +112,19 @@ module Absee
|
|
61
112
|
numSamples, numBases = gatherInformation(directory, numElements)
|
62
113
|
samples_a, samples_c, samples_g, samples_t = getSamples(filestream, directory, numElements, numSamples)
|
63
114
|
called_sequence = getCalledSequence(filestream, directory, numElements, numBases)
|
64
|
-
|
65
|
-
|
115
|
+
quality_scores = getQualityScores(filestream, directory, numElements, numBases)
|
116
|
+
peak_indexes = getPeakIndexes(filestream, directory, numElements, numBases)
|
117
|
+
##return samples_a, samples_c, samples_g, samples_t, called_sequence, peak_indexes, quality_scores
|
118
|
+
@traceA = samples_a
|
119
|
+
@traceC = samples_c
|
120
|
+
@traceG = samples_g
|
121
|
+
@traceT = samples_t
|
122
|
+
@calledSequence = called_sequence
|
123
|
+
@qualityScores = quality_scores
|
124
|
+
@peakIndexes = peak_indexes
|
125
|
+
nil
|
66
126
|
end
|
67
|
-
|
127
|
+
|
68
128
|
#reads 2 unsigned bytes and orders by most significant byte first
|
69
129
|
#
|
70
130
|
#== Parameters:
|
@@ -73,7 +133,7 @@ module Absee
|
|
73
133
|
#
|
74
134
|
#== Returns:
|
75
135
|
#an int ordered by most significant byte first
|
76
|
-
def
|
136
|
+
def readUnsignedByte_2(offset, filestream)
|
77
137
|
#// most significant byte first
|
78
138
|
#// |byte0|byte1| <= |unsigned int|
|
79
139
|
byteArray = ""
|
@@ -90,7 +150,7 @@ module Absee
|
|
90
150
|
#
|
91
151
|
#== Returns:
|
92
152
|
#an int ordered by most significant byte first
|
93
|
-
def
|
153
|
+
def readUnsignedByte_4(offset, filestream)
|
94
154
|
byteArray = ""
|
95
155
|
filestream.seek(offset, IO::SEEK_SET)
|
96
156
|
byteArray = filestream.read(4, byteArray)
|
@@ -109,7 +169,7 @@ module Absee
|
|
109
169
|
#== Returns:
|
110
170
|
#an array of arrays, each with information from the directory
|
111
171
|
#[name, tag number, element type, element size, number of elements, data size, data offset]
|
112
|
-
def
|
172
|
+
def readDirectoryEntry(filestream, dataOffset, numElements)
|
113
173
|
filestream.seek(dataOffset, IO::SEEK_SET)
|
114
174
|
byteArray = ""
|
115
175
|
filestream.read(28*numElements, byteArray)
|
@@ -163,7 +223,7 @@ module Absee
|
|
163
223
|
#
|
164
224
|
#== Returns:
|
165
225
|
#the element from the array
|
166
|
-
def
|
226
|
+
def get(array, element)
|
167
227
|
if element == "name"
|
168
228
|
return array[0]
|
169
229
|
elsif element == "tag_number"
|
@@ -191,7 +251,7 @@ module Absee
|
|
191
251
|
#
|
192
252
|
#== Returns:
|
193
253
|
#number of samples and number of bases contained in this ABIF file
|
194
|
-
def
|
254
|
+
def gatherInformation(directory, numElements)
|
195
255
|
numSamples = 0
|
196
256
|
numBases = 0
|
197
257
|
|
@@ -218,7 +278,7 @@ module Absee
|
|
218
278
|
#
|
219
279
|
#== Returns:
|
220
280
|
#four arrays with trace data in the order ACGT
|
221
|
-
def
|
281
|
+
def getSamples(filestream, directory, numElements, numSamples)
|
222
282
|
samples_a = []
|
223
283
|
samples_c = []
|
224
284
|
samples_g = []
|
@@ -268,7 +328,7 @@ module Absee
|
|
268
328
|
#
|
269
329
|
#== Returns:
|
270
330
|
#an array with the called sequence
|
271
|
-
def
|
331
|
+
def getCalledSequence(filestream, directory, numElements, numBases)
|
272
332
|
calledSequence = []
|
273
333
|
(0..numElements-1).each do |i|
|
274
334
|
if (get(directory[i], "name") == "PBAS") && (get(directory[i], "tag_number") == 2)
|
@@ -283,6 +343,31 @@ module Absee
|
|
283
343
|
return calledSequence
|
284
344
|
end
|
285
345
|
|
346
|
+
#extracts the quality score associated with the called sequence
|
347
|
+
#
|
348
|
+
#== Parameters:
|
349
|
+
#filestream:: an open File
|
350
|
+
#directory:: an array of array generated by readDirectoryEntry
|
351
|
+
#numElements:: an int indicating the number of elements in this ABIF file
|
352
|
+
#numBases:: an int calculated by gatherInformation
|
353
|
+
#
|
354
|
+
#== Returns:
|
355
|
+
#an array with the quality scores
|
356
|
+
def getQualityScores(filestream, directory, numElements, numBases)
  scores = []
  0.upto(numElements - 1) do |index|
    entry = directory[index]
    # only the directory entry named PCON with tag number 2 is read
    next unless get(entry, "name") == "PCON"
    next unless get(entry, "tag_number") == 2

    raw = ""
    filestream.seek(get(entry, "data_offset"))
    filestream.read(numBases, raw)
    # getbyte gives nil past the end of a short read, so the result stays numBases long
    scores = 0.upto(numBases - 1).map { |position| raw.getbyte(position) }
  end
  scores
end
|
370
|
+
|
286
371
|
#extracts the peak indexes
|
287
372
|
#
|
288
373
|
#== Parameters:
|
@@ -293,7 +378,7 @@ module Absee
|
|
293
378
|
#
|
294
379
|
#== Returns:
|
295
380
|
#an array with the indexes of the peaks
|
296
|
-
def
|
381
|
+
def getPeakIndexes(filestream, directory, numElements, numBases)
|
297
382
|
peakIndexes = []
|
298
383
|
(0..numElements-1).each do |i|
|
299
384
|
if (get(directory[i], "name") == "PLOC") && (get(directory[i], "tag_number") == 2)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: absee
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: '1.0'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -12,7 +12,7 @@ cert_chain: []
|
|
12
12
|
date: 2012-11-14 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: .ab1 reader / ABIF reader; extracts the peak indexes, called sequence,
|
15
|
-
and ACGT values from sequencing files
|
15
|
+
quality scores, and ACGT values from sequencing files
|
16
16
|
email: jencheng@ginkgobioworks.com
|
17
17
|
executables: []
|
18
18
|
extensions: []
|