absee 0.1.0.0 → 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/absee.rb +102 -17
  2. metadata +2 -2
@@ -9,8 +9,17 @@
9
9
  #
10
10
  # MIT license 2012
11
11
 
12
- module Absee
12
+ class ABSee
13
13
 
14
+ #variables
15
+ @traceA = []
16
+ @traceG = []
17
+ @traceC = []
18
+ @traceT = []
19
+ @calledSequences = []
20
+ @peakIndexes = []
21
+ @qualityScores = []
22
+
14
23
  #opens the ABIF sequencing / chromatogram file
15
24
  #checks for ABIF file type
16
25
  #major ABIF versions greater than 1 are not supported
@@ -21,7 +30,7 @@ module Absee
21
30
  #
22
31
  #== Returns:
23
32
  # Six arrays: trace data for A, C, G, T, called sequence, and peak indexes
24
- def self.readAB(filename)
33
+ def read(filename)
25
34
  #opens ab1 as a File object
26
35
  abFile = open(filename)
27
36
  byteArray = ""
@@ -32,12 +41,54 @@ module Absee
32
41
  abFile.read(4, byteArray)
33
42
  #ABIF file indicator
34
43
  if byteArray == "ABIF"
35
- return processAB(abFile)
44
+ processAB(abFile)
36
45
  else
37
- return [],[],[],[],[],[]
46
+ raise "file not recognized as ABIF"
38
47
  end
39
48
  end
40
49
 
50
+
51
+
52
+ ##accessors
53
+ #== Returns:
54
+ # the trace data for adenine
55
+ def get_traceA()
56
+ return @traceA
57
+ end
58
+ #== Returns:
59
+ # an array with the trace data for guanine
60
+ def get_traceG()
61
+ return @traceG
62
+ end
63
+ #== Returns:
64
+ # an array with the trace data for thymine
65
+ def get_traceT()
66
+ return @traceT
67
+ end
68
+ #== Returns:
69
+ # an array with the trace data for cytosine
70
+ def get_traceC()
71
+ return @traceC
72
+ end
73
+ #== Returns:
74
+ # an array with the Basecalled sequence
75
+ def get_calledSequence()
76
+ return @calledSequence
77
+ end
78
+ #== Returns:
79
+ # an array with the Basecalled quality scores
80
+ def get_qualityScores()
81
+ return @qualityScores
82
+ end
83
+ #== Returns:
84
+ # an array with the peak indexes
85
+ def get_peakIndexes()
86
+ return @peakIndexes
87
+ end
88
+
89
+
90
+ private
91
+
41
92
  #process the opened ABIF filestream, and calls subsequent methods to extract the data
42
93
  #
43
94
  #== Parameters:
@@ -46,13 +97,13 @@ module Absee
46
97
  #== Returns:
47
98
  #Six arrays: trace data for A, C, G, T, called sequence, and peak indexes
48
99
  #readAB returns the results of this method
49
- def self.processAB(filestream)
100
+ def processAB(filestream)
50
101
  #// here, we can read the ABIF header information
51
102
  version = readUnsignedByte_2(4, filestream)
52
103
  #// major versions greater than 1 are not supported
53
104
  #// Applied Biosystems rules
54
105
  if (version / 100 > 1)
55
- return [], [], [], [], [], []
106
+ raise "ABIF version #{version} not supported (only supported for version less than 1)"
56
107
  end
57
108
  #// we just read ABIF, so we don't need more information than that
58
109
  numElements = readUnsignedByte_4(18, filestream)
@@ -61,10 +112,19 @@ module Absee
61
112
  numSamples, numBases = gatherInformation(directory, numElements)
62
113
  samples_a, samples_c, samples_g, samples_t = getSamples(filestream, directory, numElements, numSamples)
63
114
  called_sequence = getCalledSequence(filestream, directory, numElements, numBases)
64
- peakIndexes = getPeakIndexes(filestream, directory, numElements, numBases)
65
- return samples_a, samples_c, samples_g, samples_t, called_sequence, peakIndexes
115
+ quality_scores = getQualityScores(filestream, directory, numElements, numBases)
116
+ peak_indexes = getPeakIndexes(filestream, directory, numElements, numBases)
117
+ ##return samples_a, samples_c, samples_g, samples_t, called_sequence, peak_indexes, quality_scores
118
+ @traceA = samples_a
119
+ @traceC = samples_c
120
+ @traceG = samples_g
121
+ @traceT = samples_t
122
+ @calledSequence = called_sequence
123
+ @qualityScores = quality_scores
124
+ @peakIndexes = peak_indexes
125
+ nil
66
126
  end
67
-
127
+
68
128
  #reads 2 unsigned bytes and orders by most significant byte first
69
129
  #
70
130
  #== Parameters:
@@ -73,7 +133,7 @@ module Absee
73
133
  #
74
134
  #== Returns:
75
135
  #an int ordered by most significant byte first
76
- def self.readUnsignedByte_2(offset, filestream)
136
+ def readUnsignedByte_2(offset, filestream)
77
137
  #// most significant byte first
78
138
  #// |byte0|byte1| <= |unsigned int|
79
139
  byteArray = ""
@@ -90,7 +150,7 @@ module Absee
90
150
  #
91
151
  #== Returns:
92
152
  #an int ordered by most significant byte first
93
- def self.readUnsignedByte_4(offset, filestream)
153
+ def readUnsignedByte_4(offset, filestream)
94
154
  byteArray = ""
95
155
  filestream.seek(offset, IO::SEEK_SET)
96
156
  byteArray = filestream.read(4, byteArray)
@@ -109,7 +169,7 @@ module Absee
109
169
  #== Returns:
110
170
  #an array of arrays, each with information from the directory
111
171
  #[name, tag number, element type, element size, number of elements, data size, data offset]
112
- def self.readDirectoryEntry(filestream, dataOffset, numElements)
172
+ def readDirectoryEntry(filestream, dataOffset, numElements)
113
173
  filestream.seek(dataOffset, IO::SEEK_SET)
114
174
  byteArray = ""
115
175
  filestream.read(28*numElements, byteArray)
@@ -163,7 +223,7 @@ module Absee
163
223
  #
164
224
  #== Returns:
165
225
  #the element from the array
166
- def self.get(array, element)
226
+ def get(array, element)
167
227
  if element == "name"
168
228
  return array[0]
169
229
  elsif element == "tag_number"
@@ -191,7 +251,7 @@ module Absee
191
251
  #
192
252
  #== Returns:
193
253
  #number of samples and number of bases contained in this ABIF file
194
- def self.gatherInformation(directory, numElements)
254
+ def gatherInformation(directory, numElements)
195
255
  numSamples = 0
196
256
  numBases = 0
197
257
 
@@ -218,7 +278,7 @@ module Absee
218
278
  #
219
279
  #== Returns:
220
280
  #four arrays with trace data in the order ACGT
221
- def self.getSamples(filestream, directory, numElements, numSamples)
281
+ def getSamples(filestream, directory, numElements, numSamples)
222
282
  samples_a = []
223
283
  samples_c = []
224
284
  samples_g = []
@@ -268,7 +328,7 @@ module Absee
268
328
  #
269
329
  #== Returns:
270
330
  #an array with the called sequence
271
- def self.getCalledSequence(filestream, directory, numElements, numBases)
331
+ def getCalledSequence(filestream, directory, numElements, numBases)
272
332
  calledSequence = []
273
333
  (0..numElements-1).each do |i|
274
334
  if (get(directory[i], "name") == "PBAS") && (get(directory[i], "tag_number") == 2)
@@ -283,6 +343,31 @@ module Absee
283
343
  return calledSequence
284
344
  end
285
345
 
346
+ #extracts the quality score associated with the called sequence
347
+ #
348
+ #== Parameters:
349
+ #filestream:: an open File
350
+ #directory:: an array of array generated by readDirectoryEntry
351
+ #numElements:: an int indicating the number of elements in this ABIF file
352
+ #numBases:: an int calculated by gatherInformation
353
+ #
354
+ #== Returns:
355
+ #an array with the quality scores
356
+ def getQualityScores(filestream, directory, numElements, numBases)
357
+ qualityScore = []
358
+ (0..numElements-1).each do |i|
359
+ if (get(directory[i], "name") == "PCON") && (get(directory[i], "tag_number") == 2)
360
+ byteArray_seq = ""
361
+ filestream.seek(get(directory[i], "data_offset"))
362
+ filestream.read(numBases,byteArray_seq)
363
+ (0..numBases-1).each do |j|
364
+ qualityScore[j] = byteArray_seq.getbyte(j)
365
+ end
366
+ end
367
+ end
368
+ return qualityScore
369
+ end
370
+
286
371
  #extracts the trace information for the bases
287
372
  #
288
373
  #== Parameters:
@@ -293,7 +378,7 @@ module Absee
293
378
  #
294
379
  #== Returns:
295
380
  #an array with the indexes of the peaks
296
- def self.getPeakIndexes(filestream, directory, numElements, numBases)
381
+ def getPeakIndexes(filestream, directory, numElements, numBases)
297
382
  peakIndexes = []
298
383
  (0..numElements-1).each do |i|
299
384
  if (get(directory[i], "name") == "PLOC") && (get(directory[i], "tag_number") == 2)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: absee
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.0
4
+ version: '1.0'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -12,7 +12,7 @@ cert_chain: []
12
12
  date: 2012-11-14 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: .ab1 reader / ABIF reader; extracts the peak indexes, called sequence,
15
- and ACGT values from sequencing files
15
+ quality scores, and ACGT values from sequencing files
16
16
  email: jencheng@ginkgobioworks.com
17
17
  executables: []
18
18
  extensions: []