absee 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/absee.rb +221 -0
  2. metadata +47 -0
data/lib/absee.rb ADDED
@@ -0,0 +1,221 @@
1
+ # absee
2
+ #
3
+ # Jenny Cheng
4
+ # jencheng@ginkgobioworks.com
5
+ #
6
+ # based off of Abi.cs by Ronaldo Rodrigues Ferreira
7
+ #
8
+ # extracts the data from ABIF files
9
+ #
10
+ # MIT license 2012
11
+
12
# Reads an ABIF (.ab1) chromatogram file.
#
# The first four bytes of the file must be the ASCII signature "ABIF";
# any other content (including an empty file) is treated as "not ABIF".
#
# filename - path of the file to read.
#
# Returns the result of processAB for a file that carries the signature,
# otherwise an array of six empty arrays.
# Raises the Errno::* error from File.open when the file cannot be opened.
def readAB(filename)
  # File.open, unlike Kernel#open, never treats a leading "|" as a command
  # to spawn.  "rb" keeps the bytes untranslated on every platform, and the
  # block form closes the handle even when parsing raises.
  File.open(filename, 'rb') do |abFile|
    signature = abFile.read(4) # nil when the file is empty
    if signature == 'ABIF'
      processAB(abFile)
    else
      [[], [], [], [], [], []]
    end
  end
end
28
+
29
# Extracts traces, called bases and peak positions from an open ABIF stream.
#
# filestream - seekable IO positioned anywhere; every read seeks absolutely.
#
# Returns [samples_a, samples_c, samples_g, samples_t, called_sequence,
# peak_indexes], or six empty arrays when the major format version is
# greater than 1.
def processAB(filestream)
  # The 16-bit version number follows the 4-byte signature; dividing by 100
  # yields the major version, and only majors up to 1 are handled.
  major_version = readUnsignedByte_2(4, filestream) / 100
  return [], [], [], [], [], [] if major_version > 1

  # Header fields: entry count at byte 18, directory offset at byte 26.
  entry_count = readUnsignedByte_4(18, filestream)
  directory_offset = readUnsignedByte_4(26, filestream)
  directory = readDirectoryEntry(filestream, directory_offset, entry_count)

  sample_count, base_count = gatherInformation(directory, entry_count)
  trace_a, trace_c, trace_g, trace_t =
    getSamples(filestream, directory, entry_count, sample_count)
  bases = getCalledSequence(filestream, directory, entry_count, base_count)
  peaks = getPeakIndexes(filestream, directory, entry_count, base_count)

  [trace_a, trace_c, trace_g, trace_t, bases, peaks]
end
47
+
48
# Reads a big-endian (most significant byte first) unsigned 16-bit integer.
#
# offset     - absolute byte position to read from.
# filestream - seekable IO (File, StringIO, ...).
#
# Returns the value as an Integer in 0..65535.
# Raises EOFError when fewer than two bytes are available at offset.
def readUnsignedByte_2(offset, filestream)
  filestream.seek(offset, IO::SEEK_SET)
  bytes = filestream.read(2)
  # IO#read gives nil at end of file and a shorter string on a partial
  # read; report both as truncation instead of failing later on nil.
  if bytes.nil? || bytes.bytesize < 2
    raise EOFError, "truncated ABIF data: expected 2 bytes at offset #{offset}"
  end
  bytes.unpack('n').first # 'n' = 16-bit unsigned, network byte order
end
56
+
57
# Reads a big-endian (most significant byte first) unsigned 32-bit integer.
#
# offset     - absolute byte position to read from.
# filestream - seekable IO (File, StringIO, ...).
#
# Returns the value as an Integer in 0..4294967295.
# Raises EOFError when fewer than four bytes are available at offset.
def readUnsignedByte_4(offset, filestream)
  filestream.seek(offset, IO::SEEK_SET)
  bytes = filestream.read(4)
  # IO#read gives nil at end of file and a shorter string on a partial
  # read; report both as truncation instead of failing later on nil.
  if bytes.nil? || bytes.bytesize < 4
    raise EOFError, "truncated ABIF data: expected 4 bytes at offset #{offset}"
  end
  bytes.unpack('N').first # 'N' = 32-bit unsigned, network byte order
end
65
+
66
# Reads the tag directory: numElements consecutive 28-byte entries that
# start at dataOffset.
#
# filestream  - seekable IO holding the file contents.
# dataOffset  - absolute byte position of the first directory entry.
# numElements - number of entries to read.
#
# Returns an Array with one Array per entry, laid out as
#   [name, tag number, element type, element size,
#    number of elements, data size, data offset]
# where name is the raw 4-byte tag name and the rest are unsigned
# big-endian integers.  The trailing 4-byte data handle of each entry is
# skipped.
# Raises EOFError when the stream ends before all entries were read.
def readDirectoryEntry(filestream, dataOffset, numElements)
  entry_size = 28
  wanted = entry_size * numElements

  filestream.seek(dataOffset, IO::SEEK_SET)
  raw = filestream.read(wanted).to_s # read returns nil at end of file
  if raw.bytesize < wanted
    raise EOFError,
          "truncated ABIF directory: expected #{wanted} bytes at offset " \
          "#{dataOffset}, got #{raw.bytesize}"
  end

  (0...numElements).map do |index|
    # a4 = 4 raw name bytes, N = 32-bit and n = 16-bit big-endian unsigned.
    # The template covers 24 bytes, so the data handle is left undecoded.
    raw.byteslice(index * entry_size, entry_size).unpack('a4Nnn N3')
  end
end
108
+
109
+
110
+ #directory structure
111
+ #[name, tag number, element type, element size, number of elements, data size, data offset]
112
+ #this is for easier access to the directory element
113
# Looks up one field of a directory entry by its field name.
#
# array   - directory entry laid out as [name, tag number, element type,
#           element size, number of elements, data size, data offset].
# element - field name as a String ("name", "tag_number", "element_type",
#           "element_size", "number_of_elements", "data_size" or
#           "data_offset").
#
# Returns the matching field; any unrecognised field name yields the
# entry's name (index 0).
def get(array, element)
  position =
    case element
    when "tag_number"         then 1
    when "element_type"       then 2
    when "element_size"       then 3
    when "number_of_elements" then 4
    when "data_size"          then 5
    when "data_offset"        then 6
    else 0 # "name" and every unknown key
    end
  array[position]
end
132
+
133
+
134
# Finds how many trace samples and how many called bases the file holds.
#
# directory   - Array of directory entries, each laid out as [name,
#               tag number, element type, element size, number of elements,
#               data size, data offset].
# numElements - how many leading entries of directory to inspect.
#
# Returns [numSamples, numBases]: the element count of the DATA entry with
# tag number 9 and of the PBAS entry with tag number 2.  Either value is 0
# when no such entry exists; when several match, the last one wins.
def gatherInformation(directory, numElements)
  sample_count = 0
  base_count = 0

  numElements.times do |index|
    entry = directory[index]
    tag_name = entry[0]
    tag_number = entry[1]

    if tag_name == "DATA" && tag_number == 9
      sample_count = entry[4]
    elsif tag_name == "PBAS" && tag_number == 2
      base_count = entry[4]
    end
  end

  [sample_count, base_count]
end
150
+
151
# Reads the four trace channels stored under the DATA tags 9 to 12.
#
# filestream  - seekable IO holding the file contents.
# directory   - Array of directory entries, each laid out as [name,
#               tag number, element type, element size, number of elements,
#               data size, data offset].
# numElements - how many leading entries of directory to inspect.
# numSamples  - number of samples to return per channel.
#
# Returns [samples_a, samples_c, samples_g, samples_t]; each is an Array of
# numSamples unsigned 16-bit values, or [] when its DATA tag is absent.
# Raises EOFError when a channel holds fewer than numSamples samples.
def getSamples(filestream, directory, numElements, numSamples)
  # The channel order is a guess (G, A, T, C for tags 9..12), the same one
  # Ferreira's and Staden's readers make.
  channels = { 9 => [], 10 => [], 11 => [], 12 => [] }

  numElements.times do |index|
    tag_name, tag_number, _type, _size, count, _data_size, offset = directory[index]
    next unless tag_name == 'DATA' && channels.key?(tag_number)

    filestream.seek(offset, IO::SEEK_SET)
    raw = filestream.read(count * 2).to_s # read returns nil at end of file
    # 'n*' decodes every complete big-endian 16-bit value in the buffer.
    values = raw.unpack('n*').first(numSamples)
    if values.size < numSamples
      raise EOFError,
            "DATA #{tag_number}: expected #{numSamples} samples, " \
            "found #{values.size}"
    end
    channels[tag_number] = values
  end

  [channels[10], channels[12], channels[9], channels[11]]
end
190
+
191
# Reads the called base sequence stored under the PBAS tag number 2.
#
# filestream  - seekable IO holding the file contents.
# directory   - Array of directory entries, each laid out as [name,
#               tag number, element type, element size, number of elements,
#               data size, data offset].
# numElements - how many leading entries of directory to inspect.
# numBases    - number of bases (one byte each) to read.
#
# Returns an Array of numBases one-character Strings, or [] when the
# directory has no PBAS 2 entry.
# Raises EOFError when the stream holds fewer than numBases bytes at the
# entry's data offset.
def getCalledSequence(filestream, directory, numElements, numBases)
  calledSequence = []

  numElements.times do |index|
    tag_name, tag_number, _type, _size, _count, _data_size, offset = directory[index]
    next unless tag_name == 'PBAS' && tag_number == 2

    filestream.seek(offset, IO::SEEK_SET)
    raw = filestream.read(numBases).to_s # read returns nil at end of file
    if raw.bytesize < numBases
      raise EOFError,
            "PBAS 2: expected #{numBases} bases, found #{raw.bytesize}"
    end
    calledSequence = raw.each_byte.map(&:chr)
  end

  calledSequence
end
205
+
206
# Reads the peak positions stored under the PLOC tag number 2.
#
# filestream  - seekable IO holding the file contents.
# directory   - Array of directory entries, each laid out as [name,
#               tag number, element type, element size, number of elements,
#               data size, data offset].
# numElements - how many leading entries of directory to inspect.
# numBases    - number of peak positions to return (one per called base).
#
# Returns an Array of numBases unsigned 16-bit values, or [] when the
# directory has no PLOC 2 entry.
# Raises EOFError when the entry holds fewer than numBases positions.
def getPeakIndexes(filestream, directory, numElements, numBases)
  peakIndexes = []

  numElements.times do |index|
    tag_name, tag_number, _type, _size, count, _data_size, offset = directory[index]
    next unless tag_name == 'PLOC' && tag_number == 2

    filestream.seek(offset, IO::SEEK_SET)
    # Each position is decoded from two bytes, so exactly count * 2 bytes
    # belong to this entry; reading more would pull in unrelated data.
    raw = filestream.read(count * 2).to_s # read returns nil at end of file
    peaks = raw.unpack('n*').first(numBases)
    if peaks.size < numBases
      raise EOFError,
            "PLOC 2: expected #{numBases} peak positions, found #{peaks.size}"
    end
    peakIndexes = peaks
  end

  peakIndexes
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: absee
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jenny Cheng
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-04-24 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: reads ABIF sequencing / chromatogram files and extracts the peak indexes,
15
+ called sequence, and ACGT values
16
+ email: jencheng@ginkgobioworks.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/absee.rb
22
+ homepage: http://rubygems.org/gems/absee
23
+ licenses:
24
+ - MIT
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: 1.9.3
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 1.8.23
44
+ signing_key:
45
+ specification_version: 3
46
+ summary: reads .ab1 sequencing/chromatogram files
47
+ test_files: []