absee 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/absee.rb +221 -0
- metadata +47 -0
data/lib/absee.rb
ADDED
@@ -0,0 +1,221 @@
|
|
1
|
+
# absee
|
2
|
+
#
|
3
|
+
# Jenny Cheng
|
4
|
+
# jencheng@ginkgobioworks.com
|
5
|
+
#
|
6
|
+
# based off of Abi.cs by Ronaldo Rodrigues Ferreira
|
7
|
+
#
|
8
|
+
# extracts the data from ABIF files
|
9
|
+
#
|
10
|
+
# MIT license 2012
|
11
|
+
|
12
|
+
# Reads an ABIF (.ab1) chromatogram file and extracts its trace data.
#
# filename - path of the file to read.
#
# Returns six arrays: the A, C, G and T sample values, the called sequence
# and the peak indexes. All six are empty when the file does not start with
# the "ABIF" signature (this includes an empty file).
def readAB(filename)
  # File.open rather than Kernel#open: the latter runs a filename starting
  # with "|" as a subprocess. Binary mode keeps the bytes free of newline and
  # encoding translation, and the block form closes the handle on every exit
  # path, including exceptions raised while parsing.
  File.open(filename, "rb") do |abFile|
    # the first four bytes of the file are the ABIF file indicator
    signature = abFile.read(4)
    if signature == "ABIF"
      processAB(abFile)
    else
      [[], [], [], [], [], []]
    end
  end
end
|
28
|
+
|
29
|
+
# Parses an ABIF stream whose "ABIF" signature has already been checked.
#
# filestream - seekable IO over the ABIF file.
#
# Returns the A, C, G and T sample arrays, the called sequence and the peak
# indexes. All six are empty when the header's major version is above 1.
def processAB(filestream)
  # the header version is the 16-bit value at byte 4; major versions greater
  # than 1 are not supported (Applied Biosystems rules)
  header_version = readUnsignedByte_2(4, filestream)
  return [[], [], [], [], [], []] if header_version / 100 > 1

  # the signature was already verified, so only the directory location is needed
  entry_count = readUnsignedByte_4(18, filestream)
  table_offset = readUnsignedByte_4(26, filestream)
  entries = readDirectoryEntry(filestream, table_offset, entry_count)

  sample_count, base_count = gatherInformation(entries, entry_count)
  trace_a, trace_c, trace_g, trace_t = getSamples(filestream, entries, entry_count, sample_count)
  bases = getCalledSequence(filestream, entries, entry_count, base_count)
  peaks = getPeakIndexes(filestream, entries, entry_count, base_count)

  [trace_a, trace_c, trace_g, trace_t, bases, peaks]
end
|
47
|
+
|
48
|
+
# Reads a 16-bit unsigned integer stored most significant byte first.
#
# offset     - absolute byte position to read from.
# filestream - seekable IO to read.
#
# Returns the decoded value as an Integer.
def readUnsignedByte_2(offset, filestream)
  filestream.seek(offset, IO::SEEK_SET)
  pair = filestream.read(2)
  # |byte0|byte1| => unsigned int
  high = pair.getbyte(0)
  low = pair.getbyte(1)
  (high << 8) | low
end
|
56
|
+
|
57
|
+
# Reads a 32-bit unsigned integer stored most significant byte first.
#
# offset     - absolute byte position to read from.
# filestream - seekable IO to read.
#
# Returns the decoded value as an Integer.
def readUnsignedByte_4(offset, filestream)
  filestream.seek(offset, IO::SEEK_SET)
  word = filestream.read(4)
  # |byte0|byte1|byte2|byte3| => unsigned int
  b0, b1, b2, b3 = (0..3).map { |index| word.getbyte(index) }
  (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
end
|
65
|
+
|
66
|
+
# Reads the ABIF directory: numElements consecutive 28-byte entries that
# start at dataOffset.
#
# filestream  - seekable IO to read.
# dataOffset  - absolute byte position of the first directory entry.
# numElements - number of entries to read.
#
# Returns an Array with one row per entry, each laid out as
#   [name, tag number, element type, element size, number of elements,
#    data size, data offset]
# The name is the raw 4-byte tag as a binary String (it still compares equal
# to ASCII literals such as "DATA"); every other field is an unsigned
# big-endian Integer.
#
# Raises EOFError when the stream ends before the saved fields of an entry.
def readDirectoryEntry(filestream, dataOffset, numElements)
  filestream.seek(dataOffset, IO::SEEK_SET)
  # read returns nil when nothing is left at dataOffset
  table = filestream.read(28 * numElements) || ""

  (0...numElements).map do |i|
    # Only the first 24 bytes of each entry are decoded; the trailing 4-byte
    # dataHandle field is not saved.
    fields = table.byteslice(28 * i, 24)
    if fields.nil? || fields.bytesize < 24
      raise EOFError, "ABIF directory entry #{i} is truncated"
    end
    # a4 = 4-byte name, N = 32-bit and n = 16-bit big-endian unsigned
    fields.unpack("a4NnnNNN")
  end
end
|
108
|
+
|
109
|
+
|
110
|
+
#directory structure
|
111
|
+
#[name, tag number, element type, element size, number of elements, data size, data offset]
|
112
|
+
#this is for easier access to the directory element
|
113
|
+
# Looks up one field of a directory row by its name.
#
# array   - a directory row:
#           [name, tag number, element type, element size,
#            number of elements, data size, data offset]
# element - "name", "tag_number", "element_type", "element_size",
#           "number_of_elements", "data_size" or "data_offset".
#
# Returns the requested field; any other element value yields the row's name.
def get(array, element)
  slot = case element
         when "tag_number" then 1
         when "element_type" then 2
         when "element_size" then 3
         when "number_of_elements" then 4
         when "data_size" then 5
         when "data_offset" then 6
         else 0 # "name" and anything unrecognised
         end
  array[slot]
end
|
132
|
+
|
133
|
+
|
134
|
+
# Scans the directory for the element counts needed to size the output.
#
# directory   - directory rows as produced by readDirectoryEntry.
# numElements - number of rows to scan.
#
# Returns [numSamples, numBases]: the number of elements of the DATA 9 entry
# and of the PBAS 2 entry. A count stays 0 when its entry is absent.
def gatherInformation(directory, numElements)
  sample_count = 0
  base_count = 0

  numElements.times do |i|
    entry = directory[i]
    case [get(entry, "name"), get(entry, "tag_number")]
    when ["DATA", 9]
      sample_count = get(entry, "number_of_elements")
    when ["PBAS", 2]
      base_count = get(entry, "number_of_elements")
    end
  end

  [sample_count, base_count]
end
|
150
|
+
|
151
|
+
# Extracts the four trace channels from the DATA 9..12 directory entries.
#
# filestream  - seekable IO over the ABIF file.
# directory   - directory rows as produced by readDirectoryEntry.
# numElements - number of rows to scan.
# numSamples  - number of 16-bit samples to decode per channel.
#
# Returns [samples_a, samples_c, samples_g, samples_t]; a channel whose entry
# is missing stays empty.
def getSamples(filestream, directory, numElements, numSamples)
  samples_a = []
  samples_c = []
  samples_g = []
  samples_t = []

  numElements.times do |i|
    entry = directory[i]
    tag_number = get(entry, "tag_number")

    # we guess the channel order being GATC, as Ferreira and Staden do
    channel = case tag_number
              when 9 then samples_g
              when 10 then samples_a
              when 11 then samples_t
              when 12 then samples_c
              end
    next if channel.nil? || get(entry, "name") != "DATA"

    filestream.seek(get(entry, "data_offset"), IO::SEEK_SET)
    raw = filestream.read(get(entry, "number_of_elements") * 2).to_s

    # each sample is two bytes, most significant byte first
    numSamples.times do |j|
      channel[j] = raw.getbyte(2 * j) << 8 | raw.getbyte(2 * j + 1)
    end
  end

  [samples_a, samples_c, samples_g, samples_t]
end
|
190
|
+
|
191
|
+
# Extracts the called bases from the PBAS 2 directory entry.
#
# filestream  - seekable IO over the ABIF file.
# directory   - directory rows as produced by readDirectoryEntry.
# numElements - number of rows to scan.
# numBases    - number of one-byte bases to read.
#
# Returns an Array of single-character Strings, one per base; empty when no
# PBAS 2 entry is present.
def getCalledSequence(filestream, directory, numElements, numBases)
  bases = []

  numElements.times do |i|
    entry = directory[i]
    next unless get(entry, "name") == "PBAS" && get(entry, "tag_number") == 2

    filestream.seek(get(entry, "data_offset"))
    letters = filestream.read(numBases).to_s
    bases = (0...numBases).map { |j| letters.getbyte(j).chr }
  end

  bases
end
|
205
|
+
|
206
|
+
# Extracts the peak locations from the PLOC 2 directory entry.
#
# filestream  - seekable IO over the ABIF file.
# directory   - directory rows as produced by readDirectoryEntry.
# numElements - number of rows to scan.
# numBases    - maximum number of peak indexes to return.
#
# Returns an Array of Integers: the first numBases peak locations stored in
# the entry (fewer when the entry itself holds fewer). Empty when no PLOC 2
# entry is present.
def getPeakIndexes(filestream, directory, numElements, numBases)
  peakIndexes = []

  (0...numElements).each do |i|
    entry = directory[i]
    next unless get(entry, "name") == "PLOC" && get(entry, "tag_number") == 2

    filestream.seek(get(entry, "data_offset"), IO::SEEK_SET)
    # Each peak location is decoded as a 2-byte big-endian value, so the
    # array occupies number_of_elements * 2 bytes. Reading exactly that many
    # keeps bytes that follow the array from being decoded as peaks.
    raw = filestream.read(get(entry, "number_of_elements") * 2).to_s
    peakIndexes = raw.unpack("n*").first(numBases)
  end

  peakIndexes
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: absee
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jenny Cheng
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-04-24 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: reads ABIF sequencing / chromatogram files and extracts the peak indexes,
|
15
|
+
called sequence, and ACGT values
|
16
|
+
email: jencheng@ginkgobioworks.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- lib/absee.rb
|
22
|
+
homepage: http://rubygems.org/gems/absee
|
23
|
+
licenses:
|
24
|
+
- MIT
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ! '>='
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: 1.9.3
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 1.8.23
|
44
|
+
signing_key:
|
45
|
+
specification_version: 3
|
46
|
+
summary: reads .ab1 sequencing/chromatogram files
|
47
|
+
test_files: []
|