absee 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/absee.rb +221 -0
  2. metadata +47 -0
data/lib/absee.rb ADDED
@@ -0,0 +1,221 @@
1
+ # absee
2
+ #
3
+ # Jenny Cheng
4
+ # jencheng@ginkgobioworks.com
5
+ #
6
+ # based off of Abi.cs by Ronaldo Rodrigues Ferreira
7
+ #
8
+ # extracts the data from ABIF files
9
+ #
10
+ # MIT license 2012
11
+
12
# Reads an ABIF (.ab1) file and extracts its chromatogram data.
#
# filename - path of the file to read.
#
# Returns whatever processAB returns for the opened file (samples_a,
# samples_c, samples_g, samples_t, called_sequence, peakIndexes), or six
# empty arrays when the file does not begin with the "ABIF" signature.
def readAB(filename)
  # File.open (unlike Kernel#open) never treats a leading "|" in the name as
  # a command to run. Binary mode keeps the bytes untouched on every
  # platform, and the block form closes the file even if parsing raises.
  File.open(filename, "rb") do |abFile|
    # ABIF file indicator: the first four bytes of the file.
    # read returns nil for an empty file, which simply fails the comparison.
    signature = abFile.read(4)
    if signature == "ABIF"
      processAB(abFile)
    else
      [[], [], [], [], [], []]
    end
  end
end
28
+
29
# Parses an ABIF stream whose signature has already been verified.
#
# filestream - the opened ABIF file.
#
# Returns samples_a, samples_c, samples_g, samples_t, called_sequence and
# peakIndexes as one six-element array, or six empty arrays when the major
# version is greater than 1.
def processAB(filestream)
  # ABIF header: the version number is read from byte offset 4
  abif_version = readUnsignedByte_2(4, filestream)

  # Applied Biosystems rules: major versions greater than 1 are not supported
  return [[], [], [], [], [], []] if abif_version / 100 > 1

  # only the directory location is needed from the rest of the header
  entry_count = readUnsignedByte_4(18, filestream)
  directory_offset = readUnsignedByte_4(26, filestream)
  entries = readDirectoryEntry(filestream, directory_offset, entry_count)

  sample_count, base_count = gatherInformation(entries, entry_count)
  trace_a, trace_c, trace_g, trace_t = getSamples(filestream, entries, entry_count, sample_count)
  bases = getCalledSequence(filestream, entries, entry_count, base_count)
  peaks = getPeakIndexes(filestream, entries, entry_count, base_count)

  [trace_a, trace_c, trace_g, trace_t, bases, peaks]
end
47
+
48
# Reads a 2-byte unsigned integer, most significant byte first, at an
# absolute position in the stream.
#
# offset     - byte position counted from the start of the stream.
# filestream - seekable stream (must respond to seek and read).
#
# Returns the decoded value as an Integer (0..65535).
# Raises EOFError when fewer than 2 bytes are available at offset.
def readUnsignedByte_2(offset, filestream)
  filestream.seek(offset, IO::SEEK_SET)
  raw = filestream.read(2)
  # read gives nil at end of file and a shorter string on a partial read;
  # fail with a clear error instead of a NoMethodError on nil further down
  if raw.nil? || raw.bytesize < 2
    raise EOFError, "expected 2 bytes at offset #{offset}"
  end
  # |byte0|byte1| => unsigned int ("n" = 16-bit big-endian unsigned)
  raw.unpack("n").first
end
56
+
57
# Reads a 4-byte unsigned integer, most significant byte first, at an
# absolute position in the stream.
#
# offset     - byte position counted from the start of the stream.
# filestream - seekable stream (must respond to seek and read).
#
# Returns the decoded value as an Integer (0..4294967295).
# Raises EOFError when fewer than 4 bytes are available at offset.
def readUnsignedByte_4(offset, filestream)
  filestream.seek(offset, IO::SEEK_SET)
  raw = filestream.read(4)
  # read gives nil at end of file and a shorter string on a partial read;
  # fail with a clear error instead of a NoMethodError on nil further down
  if raw.nil? || raw.bytesize < 4
    raise EOFError, "expected 4 bytes at offset #{offset}"
  end
  # |byte0|byte1|byte2|byte3| => unsigned int ("N" = 32-bit big-endian unsigned)
  raw.unpack("N").first
end
65
+
66
# Reads the ABIF directory: numElements consecutive 28-byte entries that
# start at byte position dataOffset.
#
# filestream  - seekable stream (must respond to seek and read).
# dataOffset  - absolute position of the first directory entry.
# numElements - number of entries to read.
#
# Returns an Array holding one Array per entry, laid out as
#   [name, tag number, element type, element size,
#    number of elements, data size, data offset]
# where name is the raw 4-byte tag name and every other field is an
# unsigned big-endian Integer. The trailing 4-byte dataHandle field of each
# entry is not saved.
# Raises EOFError when the stream ends before all entries could be read.
def readDirectoryEntry(filestream, dataOffset, numElements)
  entry_size = 28
  expected = entry_size * numElements

  filestream.seek(dataOffset, IO::SEEK_SET)
  # read returns nil when nothing is left at that position
  raw = filestream.read(expected) || ""
  if raw.bytesize < expected
    raise EOFError, "directory truncated: expected #{expected} bytes at offset #{dataOffset}, got #{raw.bytesize}"
  end

  (0...numElements).map do |i|
    # a4 = 4-byte name, N = 32-bit and n = 16-bit unsigned (big-endian):
    # name, tag number, element type, element size,
    # number of elements, data size, data offset.
    # The remaining 4 bytes (dataHandle) are left undecoded.
    raw[i * entry_size, entry_size].unpack("a4Nn2N3")
  end
end
108
+
109
+
110
+ #directory structure
111
+ #[name, tag number, element type, element size, number of elements, data size, data offset]
112
+ #this is for easier access to the directory element
113
# Field accessor for one directory entry.
#
# array   - a directory entry laid out as
#           [name, tag number, element type, element size,
#            number of elements, data size, data offset]
# element - field name: "name", "tag_number", "element_type",
#           "element_size", "number_of_elements", "data_size" or
#           "data_offset"
#
# Returns the matching field; any other field name yields the entry's
# first slot (the name).
def get(array, element)
  # position in this list == position of the field inside an entry
  field_order = [
    "name",
    "tag_number",
    "element_type",
    "element_size",
    "number_of_elements",
    "data_size",
    "data_offset"
  ]
  slot = field_order.index(element)
  slot = 0 if slot.nil?
  return array[slot]
end
132
+
133
+
134
# Scans the directory for the two counts needed to size the output.
#
# directory   - array of directory entries.
# numElements - how many entries of directory to inspect.
#
# Returns [numSamples, numBases]: the number of elements of the DATA 9
# entry and of the PBAS 2 entry. A count stays 0 when its entry is absent;
# when an entry appears more than once the last one wins.
def gatherInformation(directory, numElements)
  sample_count = 0
  base_count = 0

  numElements.times do |idx|
    entry = directory[idx]
    entry_name = get(entry, "name")

    if entry_name == "DATA" && get(entry, "tag_number") == 9
      sample_count = get(entry, "number_of_elements")
    elsif entry_name == "PBAS" && get(entry, "tag_number") == 2
      base_count = get(entry, "number_of_elements")
    end
  end

  [sample_count, base_count]
end
150
+
151
# Extracts the four trace channels from the DATA 9..12 directory entries.
#
# filestream  - seekable stream (must respond to seek and read).
# directory   - array of directory entries.
# numElements - how many entries of directory to inspect.
# numSamples  - maximum number of samples to keep per channel.
#
# Every sample is decoded as a 2-byte unsigned integer, most significant
# byte first. A channel holds at most numSamples values; when its entry (or
# the stream) provides fewer, the channel simply holds the values that were
# available. A channel whose entry is absent is an empty array.
#
# Returns samples_a, samples_c, samples_g, samples_t.
def getSamples(filestream, directory, numElements, numSamples)
  # we guess the order being GATC, as Ferreira and Staden do:
  #   DATA 9 => G, DATA 10 => A, DATA 11 => T, DATA 12 => C
  channels = { 9 => [], 10 => [], 11 => [], 12 => [] }

  (0...numElements).each do |i|
    entry = directory[i]
    tag_number = get(entry, "tag_number")
    next unless get(entry, "name") == "DATA" && channels.key?(tag_number)

    filestream.seek(get(entry, "data_offset"), IO::SEEK_SET)
    # two bytes per sample; read returns nil when nothing is left to read
    raw = filestream.read(get(entry, "number_of_elements") * 2) || ""
    # "n*" decodes every complete 16-bit big-endian value in the buffer
    channels[tag_number] = raw.unpack("n*").first(numSamples)
  end

  return channels[10], channels[12], channels[9], channels[11]
end
190
+
191
# Extracts the called bases from the PBAS 2 directory entry.
#
# filestream  - seekable stream (must respond to seek and read).
# directory   - array of directory entries.
# numElements - how many entries of directory to inspect.
# numBases    - number of bases (one byte each) to read.
#
# Returns an array of one-character strings, one per base; the array is
# empty when no PBAS 2 entry is found.
def getCalledSequence(filestream, directory, numElements, numBases)
  bases = []

  0.upto(numElements - 1) do |idx|
    entry = directory[idx]
    is_called_bases = get(entry, "name") == "PBAS" && get(entry, "tag_number") == 2
    next unless is_called_bases

    buffer = ""
    filestream.seek(get(entry, "data_offset"))
    filestream.read(numBases, buffer)
    # one byte per base, turned into a single-character string
    bases = (0...numBases).map { |k| buffer.getbyte(k).chr }
  end

  bases
end
205
+
206
# Extracts the peak locations from the PLOC 2 directory entry.
#
# filestream  - seekable stream (must respond to seek and read).
# directory   - array of directory entries.
# numElements - how many entries of directory to inspect.
# numBases    - maximum number of peak locations to keep.
#
# Every peak location is decoded as a 2-byte unsigned integer, most
# significant byte first. At most numBases values are returned; when the
# entry (or the stream) provides fewer, only the available values are
# returned.
#
# Returns an array of Integers; empty when no PLOC 2 entry is found.
def getPeakIndexes(filestream, directory, numElements, numBases)
  peakIndexes = []

  (0...numElements).each do |i|
    entry = directory[i]
    next unless get(entry, "name") == "PLOC" && get(entry, "tag_number") == 2

    filestream.seek(get(entry, "data_offset"), IO::SEEK_SET)
    # Peaks are decoded two bytes at a time, so read two bytes per element.
    # Reading more would pull in bytes that do not belong to this entry.
    # read returns nil when nothing is left to read.
    raw = filestream.read(get(entry, "number_of_elements") * 2) || ""
    # "n*" decodes every complete 16-bit big-endian value in the buffer
    peakIndexes = raw.unpack("n*").first(numBases)
  end

  peakIndexes
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: absee
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jenny Cheng
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-04-24 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: reads ABIF sequencing / chromatogram files and extracts the peak indexes,
15
+ called sequence, and ACGT values
16
+ email: jencheng@ginkgobioworks.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/absee.rb
22
+ homepage: http://rubygems.org/gems/absee
23
+ licenses:
24
+ - MIT
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: 1.9.3
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 1.8.23
44
+ signing_key:
45
+ specification_version: 3
46
+ summary: reads .ab1 sequencing/chromatogram files
47
+ test_files: []