absee 0.1.0.0 → 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/absee.rb +102 -17
  2. metadata +2 -2
@@ -9,8 +9,17 @@
9
9
  #
10
10
  # MIT license 2012
11
11
 
12
- module Absee
12
+ class ABSee
13
13
 
14
+ #variables
15
+ @traceA = []
16
+ @traceG = []
17
+ @traceC = []
18
+ @traceT = []
19
+ @calledSequences = []
20
+ @peakIndexes = []
21
+ @qualityScores = []
22
+
14
23
  #opens the ABIF sequencing / chromatogram file
15
24
  #checks for ABIF file type
16
25
  #major ABIF versions greater than 1 are not supported
@@ -21,7 +30,7 @@ module Absee
21
30
  #
22
31
  #== Returns:
23
32
  # nil; the trace data, called sequence, quality scores, and peak indexes are stored on the instance (see the get_* accessors)
24
- def self.readAB(filename)
33
+ def read(filename)
25
34
  #opens ab1 as a File object
26
35
  abFile = open(filename)
27
36
  byteArray = ""
@@ -32,12 +41,54 @@ module Absee
32
41
  abFile.read(4, byteArray)
33
42
  #ABIF file indicator
34
43
  if byteArray == "ABIF"
35
- return processAB(abFile)
44
+ processAB(abFile)
36
45
  else
37
- return [],[],[],[],[],[]
46
+ raise "file not recognized as ABIF"
38
47
  end
39
48
  end
40
49
 
50
+
51
+
52
+ ##accessors
53
+ #== Returns:
54
+ # an array with the trace data for adenine
55
+ def get_traceA()
56
+ return @traceA
57
+ end
58
+ #== Returns:
59
+ # an array with the trace data for guanine
60
+ def get_traceG()
61
+ return @traceG
62
+ end
63
+ #== Returns:
64
+ # an array with the trace data for thymine
65
+ def get_traceT()
66
+ return @traceT
67
+ end
68
+ #== Returns:
69
+ # an array with the trace data for cytosine
70
+ def get_traceC()
71
+ return @traceC
72
+ end
73
+ #== Returns:
74
+ # an array with the Basecalled sequence
75
+ def get_calledSequence()
76
+ return @calledSequence
77
+ end
78
+ #== Returns:
79
+ # an array with the Basecalled quality scores
80
+ def get_qualityScores()
81
+ return @qualityScores
82
+ end
83
+ #== Returns:
84
+ # an array with the peak indexes
85
+ def get_peakIndexes()
86
+ return @peakIndexes
87
+ end
88
+
89
+
90
+ private
91
+
41
92
  #process the opened ABIF filestream, and calls subsequent methods to extract the data
42
93
  #
43
94
  #== Parameters:
@@ -46,13 +97,13 @@ module Absee
46
97
  #== Returns:
47
98
  #nil; the trace data for A, C, G, T, the called sequence, quality scores, and peak indexes are stored in instance variables
48
99
  #read returns the result of this method
49
- def self.processAB(filestream)
100
+ def processAB(filestream)
50
101
  #// here, we can read the ABIF header information
51
102
  version = readUnsignedByte_2(4, filestream)
52
103
  #// major versions greater than 1 are not supported
53
104
  #// Applied Biosystems rules
54
105
  if (version / 100 > 1)
55
- return [], [], [], [], [], []
106
+ raise "ABIF version #{version} not supported (only supported for version less than 1)"
56
107
  end
57
108
  #// we just read ABIF, so we don't need more information than that
58
109
  numElements = readUnsignedByte_4(18, filestream)
@@ -61,10 +112,19 @@ module Absee
61
112
  numSamples, numBases = gatherInformation(directory, numElements)
62
113
  samples_a, samples_c, samples_g, samples_t = getSamples(filestream, directory, numElements, numSamples)
63
114
  called_sequence = getCalledSequence(filestream, directory, numElements, numBases)
64
- peakIndexes = getPeakIndexes(filestream, directory, numElements, numBases)
65
- return samples_a, samples_c, samples_g, samples_t, called_sequence, peakIndexes
115
+ quality_scores = getQualityScores(filestream, directory, numElements, numBases)
116
+ peak_indexes = getPeakIndexes(filestream, directory, numElements, numBases)
117
+ ##return samples_a, samples_c, samples_g, samples_t, called_sequence, peak_indexes, quality_scores
118
+ @traceA = samples_a
119
+ @traceC = samples_c
120
+ @traceG = samples_g
121
+ @traceT = samples_t
122
+ @calledSequence = called_sequence
123
+ @qualityScores = quality_scores
124
+ @peakIndexes = peak_indexes
125
+ nil
66
126
  end
67
-
127
+
68
128
  #reads 2 unsigned bytes and orders by most significant byte first
69
129
  #
70
130
  #== Parameters:
@@ -73,7 +133,7 @@ module Absee
73
133
  #
74
134
  #== Returns:
75
135
  #an int ordered by most significant byte first
76
- def self.readUnsignedByte_2(offset, filestream)
136
+ def readUnsignedByte_2(offset, filestream)
77
137
  #// most significant byte first
78
138
  #// |byte0|byte1| <= |unsigned int|
79
139
  byteArray = ""
@@ -90,7 +150,7 @@ module Absee
90
150
  #
91
151
  #== Returns:
92
152
  #an int ordered by most significant byte first
93
- def self.readUnsignedByte_4(offset, filestream)
153
+ def readUnsignedByte_4(offset, filestream)
94
154
  byteArray = ""
95
155
  filestream.seek(offset, IO::SEEK_SET)
96
156
  byteArray = filestream.read(4, byteArray)
@@ -109,7 +169,7 @@ module Absee
109
169
  #== Returns:
110
170
  #an array of arrays, each with information from the directory
111
171
  #[name, tag number, element type, element size, number of elements, data size, data offset]
112
- def self.readDirectoryEntry(filestream, dataOffset, numElements)
172
+ def readDirectoryEntry(filestream, dataOffset, numElements)
113
173
  filestream.seek(dataOffset, IO::SEEK_SET)
114
174
  byteArray = ""
115
175
  filestream.read(28*numElements, byteArray)
@@ -163,7 +223,7 @@ module Absee
163
223
  #
164
224
  #== Returns:
165
225
  #the element from the array
166
- def self.get(array, element)
226
+ def get(array, element)
167
227
  if element == "name"
168
228
  return array[0]
169
229
  elsif element == "tag_number"
@@ -191,7 +251,7 @@ module Absee
191
251
  #
192
252
  #== Returns:
193
253
  #number of samples and number of bases contained in this ABIF file
194
- def self.gatherInformation(directory, numElements)
254
+ def gatherInformation(directory, numElements)
195
255
  numSamples = 0
196
256
  numBases = 0
197
257
 
@@ -218,7 +278,7 @@ module Absee
218
278
  #
219
279
  #== Returns:
220
280
  #four arrays with trace data in the order ACGT
221
- def self.getSamples(filestream, directory, numElements, numSamples)
281
+ def getSamples(filestream, directory, numElements, numSamples)
222
282
  samples_a = []
223
283
  samples_c = []
224
284
  samples_g = []
@@ -268,7 +328,7 @@ module Absee
268
328
  #
269
329
  #== Returns:
270
330
  #an array with the called sequence
271
- def self.getCalledSequence(filestream, directory, numElements, numBases)
331
+ def getCalledSequence(filestream, directory, numElements, numBases)
272
332
  calledSequence = []
273
333
  (0..numElements-1).each do |i|
274
334
  if (get(directory[i], "name") == "PBAS") && (get(directory[i], "tag_number") == 2)
@@ -283,6 +343,31 @@ module Absee
283
343
  return calledSequence
284
344
  end
285
345
 
346
+ #extracts the quality scores associated with the called sequence
347
+ #
348
+ #== Parameters:
349
+ #filestream:: an open File
350
+ #directory:: an array of arrays generated by readDirectoryEntry
351
+ #numElements:: an int indicating the number of elements in this ABIF file
352
+ #numBases:: an int calculated by gatherInformation
353
+ #
354
+ #== Returns:
355
+ #an array with the quality scores
356
+ def getQualityScores(filestream, directory, numElements, numBases)
357
+ qualityScore = []
358
+ (0..numElements-1).each do |i|
359
+ if (get(directory[i], "name") == "PCON") && (get(directory[i], "tag_number") == 2)
360
+ byteArray_seq = ""
361
+ filestream.seek(get(directory[i], "data_offset"))
362
+ filestream.read(numBases,byteArray_seq)
363
+ (0..numBases-1).each do |j|
364
+ qualityScore[j] = byteArray_seq.getbyte(j)
365
+ end
366
+ end
367
+ end
368
+ return qualityScore
369
+ end
370
+
286
371
  #extracts the peak indexes for the called bases
287
372
  #
288
373
  #== Parameters:
@@ -293,7 +378,7 @@ module Absee
293
378
  #
294
379
  #== Returns:
295
380
  #an array with the indexes of the peaks
296
- def self.getPeakIndexes(filestream, directory, numElements, numBases)
381
+ def getPeakIndexes(filestream, directory, numElements, numBases)
297
382
  peakIndexes = []
298
383
  (0..numElements-1).each do |i|
299
384
  if (get(directory[i], "name") == "PLOC") && (get(directory[i], "tag_number") == 2)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: absee
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.0
4
+ version: '1.0'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -12,7 +12,7 @@ cert_chain: []
12
12
  date: 2012-11-14 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: .ab1 reader / ABIF reader; extracts the peak indexes, called sequence,
15
- and ACGT values from sequencing files
15
+ quality scores, and ACGT values from sequencing files
16
16
  email: jencheng@ginkgobioworks.com
17
17
  executables: []
18
18
  extensions: []