bio-maf 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/bio/maf/index.rb CHANGED
@@ -65,10 +65,11 @@ module Bio
65
65
  include KVHelpers
66
66
 
67
67
  attr_reader :db, :species, :species_max_id
68
- attr_accessor :index_sequences
68
+ attr_accessor :index_sequences, :ref_seq
69
69
 
70
70
  FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
71
71
  FORMAT_VERSION = 2
72
+ REF_SEQ_KEY = 'bio-maf:reference-sequence'
72
73
  MAX_SPECIES = 64
73
74
 
74
75
  ## Key-value store index format
@@ -221,6 +222,7 @@ module Bio
221
222
  raise "Could not open DB file!"
222
223
  end
223
224
  if mode == KyotoCabinet::DB::OREADER
225
+ self.ref_seq = db[REF_SEQ_KEY]
224
226
  load_index_sequences
225
227
  load_species
226
228
  end
@@ -309,11 +311,12 @@ module Bio
309
311
  end
310
312
  ready = Time.now
311
313
  $stderr.puts "bin intervals computed after #{ready - start} seconds."
312
- if RUBY_PLATFORM == 'java'
313
- scan_bins_parallel(chrom_id, bin_intervals, filters)
314
- else
315
- scan_bins(chrom_id, bin_intervals, filters)
316
- end
314
+ matches = if RUBY_PLATFORM == 'java'
315
+ scan_bins_parallel(chrom_id, bin_intervals, filters)
316
+ else
317
+ scan_bins(chrom_id, bin_intervals, filters)
318
+ end
319
+ matches.sort_by! { |e| e[0] } # sort by offset in file
317
320
  end # #fetch_list
318
321
 
319
322
  # Scan the index for blocks matching the given bins and intervals.
@@ -344,7 +347,7 @@ module Bio
344
347
 
345
348
  def scan_bins_parallel(chrom_id, bin_intervals, filters)
346
349
  start = Time.now
347
- n_threads = ENV['profile'] ? 1 : 4
350
+ n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
348
351
  jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
349
352
  completed = java.util.concurrent.LinkedBlockingQueue.new(128)
350
353
  threads = []
@@ -445,7 +448,8 @@ module Bio
445
448
 
446
449
  def build_default(parser)
447
450
  first_block = parser.parse_block
448
- ref_seq = first_block.sequences.first.source
451
+ self.ref_seq = first_block.sequences.first.source
452
+ db[REF_SEQ_KEY] = ref_seq
449
453
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
450
454
  @index_sequences = { ref_seq => 0 }
451
455
  store_index_sequences!
@@ -521,6 +525,9 @@ module Bio
521
525
  end
522
526
 
523
527
  def entries_for(block)
528
+ unless block.ref_seq.source == @ref_seq
529
+ raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
530
+ end
524
531
  h = {}
525
532
  val = build_block_value(block)
526
533
  block.sequences.each do |seq|
@@ -0,0 +1,267 @@
1
+ module Bio
2
+ module MAF
3
+
4
# The header of a MAF file: the variable-value pairs given on the
# leading ##maf line together with the alignment parameters.
# @api public
class Header
  # vars: variable-value pairs from the ##maf line.
  # alignment_params: alignment parameters from the MAF header.
  # @return [Hash]
  attr_accessor :vars, :alignment_params

  # @param vars [Hash] variable-value pairs from the ##maf line
  # @param params [Hash] alignment parameters
  def initialize(vars, params)
    @vars, @alignment_params = vars, params
  end

  # Value of the required version variable.
  # @return [String]
  def version
    vars[:version]
  end

  # Value of the optional scoring variable, if one was given.
  # @return [String]
  def scoring
    vars[:scoring]
  end
end
33
+
34
# A MAF alignment block.
# @api public
class Block
  # Matches one run of consecutive gap characters.
  GAP = /-+/

  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  # @param vars parameters from the 'a' line
  # @param sequences [Array<Sequence>] sequences of the block
  # @param offset [Integer] byte offset of the block in the MAF file
  # @param size [Integer] byte size of the block in the MAF file
  # @param filtered [Boolean] value reported by {#filtered?}
  def initialize(vars, sequences, offset, size, filtered)
    @vars = vars
    @sequences = sequences
    @offset = offset
    @size = size
    @filtered = filtered
  end

  # The first sequence of the block.
  def ref_seq
    sequences[0]
  end

  # The sequence at index i.
  # @raise [IndexError] if i is out of bounds
  def raw_seq(i)
    sequences.fetch(i)
  end

  # Yields each sequence in order. Without a block, returns an
  # Enumerator over the sequences.
  def each_raw_seq(&block)
    sequences.each(&block)
  end

  # Text size of the alignment block. This is the number of text
  # characters in each line of sequence data, including dashes and
  # other gaps in the sequence.
  def text_size
    sequences.first.text.size
  end

  # Whether this block has been modified by a parser filter.
  # @return [Boolean]
  def filtered?
    @filtered
  end

  # Find gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  #
  # Every column range in which the first sequence and all
  # non-empty other sequences have a gap is reported, including
  # ranges that begin part-way through the first sequence's gap run.
  # A block whose only non-empty sequence is the first one reports
  # that sequence's own gap runs.
  #
  # @return [Array<Array(Integer, Integer)>] [offset, length] pairs
  #   in ascending offset order
  # @see #remove_gaps!
  # @see Parser#sequence_filter
  def find_gaps
    return [] if sequences.empty?
    common = gap_runs(sequences.first.text)
    sequences.drop(1).each do |seq|
      # Sequences reporting empty? carry no text and do not constrain gaps.
      next if seq.empty?
      break if common.empty?
      common = intersect_runs(common, gap_runs(seq.text))
    end
    common.map { |from, to| [from, to - from] }
  end

  # Remove gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  # @return [Integer] number of gaps removed
  # @see #find_gaps
  # @see Parser#sequence_filter
  def remove_gaps!
    gaps = find_gaps
    # Delete from the right so earlier offsets remain valid.
    gaps.reverse_each do |offset, len|
      sequences.each { |seq| seq.delete_text(offset, len) }
    end
    gaps.size
  end

  private

  # Lists the gap runs of text as [start, end) offset pairs, in order.
  def gap_runs(text)
    runs = []
    text.scan(GAP) do
      m = Regexp.last_match
      runs << [m.begin(0), m.end(0)]
    end
    runs
  end

  # Intersects two ascending lists of [start, end) runs.
  def intersect_runs(a, b)
    shared = []
    i = j = 0
    while i < a.size && j < b.size
      from = [a[i][0], b[j][0]].max
      to = [a[i][1], b[j][1]].min
      shared << [from, to] if from < to
      # Advance whichever run finishes first; the other may still overlap.
      if a[i][1] < b[j][1]
        i += 1
      else
        j += 1
      end
    end
    shared
  end
end
120
+
121
# A sequence within an alignment block.
# @api public
class Sequence
  # Matches one run of residues (anything but a dash) or one run of
  # gap characters. Every character of the text belongs to exactly
  # one run, so text offsets stay in step with the scan.
  TEXT_RUN = /[^-]+|-+/

  # @return [String] Source sequence name.
  attr_reader :source
  # @return [Integer] Zero-based start position.
  attr_reader :start
  # @return [Integer] Size of aligning region in source sequence.
  attr_reader :size
  # :+ or :-, indicating which strand the alignment is to.
  # @return [Symbol]
  attr_reader :strand
  # Size of the entire source sequence, not just the aligning
  # region.
  # @return [Integer]
  attr_reader :src_size
  # Sequence data for the alignment, including insertions.
  # @return [String]
  attr_reader :text
  # Array of raw synteny information from 'i' line.
  # @return [Array<String>]
  attr_accessor :i_data
  # Quality string from 'q' line.
  # @return [String]
  attr_accessor :quality
  alias_method :source_size, :src_size

  # Takes, in order: source, start, size, strand, src_size, text.
  def initialize(*args)
    @source, @start, @size, @strand, @src_size, @text = args
  end

  # Position just past the aligning region (start + size).
  # @return [Integer]
  def end
    start + size
  end

  # Whether this sequence is empty. Only true for {EmptySequence}
  # instances from 'e' lines.
  def empty?
    false
  end

  # Whether the text length differs from the aligning region size.
  def gapped?
    size != text.size
  end

  # The part of the source name before the first '.', or nil when
  # the source name contains no '.'.
  # @return [String, nil]
  def species
    parts = source.split('.', 2)
    parts.size == 2 ? parts[0] : nil
  end

  # Deletes len characters at offset from the text and, when
  # present, from the quality string. Does nothing when empty?.
  def delete_text(offset, len)
    return if empty?
    text.slice!(offset, len)
    quality.slice!(offset, len) if quality
  end

  # Writes this sequence to writer as "source:start-end" plus text.
  def write_fasta(writer)
    writer.write("#{source}:#{start}-#{start + size}",
                 text)
  end

  # Maps the given zero-based genomic range onto a range of string
  # offsets, suitable for extracting the text for the given range
  # from #text.
  #
  # @param range [Range] genomic range, inclusive or exclusive
  # @return [Range] end-exclusive range of offsets into #text
  # @raise [RuntimeError] if the range lies outside the sequence or
  #   cannot be located in the text
  # @see String#slice
  def text_range(range)
    r_end = range.exclude_end? ? range.end : range.end + 1
    r_size = r_end - range.begin
    # Special case: the range covers the entire aligning region.
    return 0...text.size if range.begin == start && r_size == size
    if range.begin < start || r_end > self.end
      raise "Range #{range} outside sequence bounds; start #{start}, size #{size}"
    end
    # No gaps: genomic positions map directly onto text indexes.
    return (range.begin - start)...(r_end - start) unless gapped?
    gapped_text_range(range.begin, r_end)
  end

  private

  # Walks the text run by run, tracking the genomic position and the
  # text offset, to map genomic [r_begin, r_end) to text offsets.
  def gapped_text_range(r_begin, r_end)
    g_start = start   # genomic position at the start of the current run
    t_start = 0       # text offset at the start of the current run
    m_begin = nil     # text offset of the beginning of the match
    text.scan(TEXT_RUN) do |part|
      unless part.start_with?('-')
        # Residue run: only these advance the genomic position.
        g_end = g_start + part.size
        if g_start <= r_begin && r_begin < g_end
          m_begin = t_start + (r_begin - g_start)
        end
        if g_start <= r_end && r_end <= g_end
          raise "reached end before start!" unless m_begin
          return m_begin...(t_start + (r_end - g_start))
        end
        g_start = g_end
      end
      t_start += part.size
    end
    raise "no match found!"
  end
end
237
+
238
# A sequence record with no alignment text, parsed from an 'e' line.
#
# Per the MAF spec, this indicates that "there isn't aligning DNA
# for a species but that the current block is bridged by a chain
# that connects blocks before and after this block".
# @api public
class EmptySequence < Sequence
  attr_reader :status

  # The first five arguments are passed on to Sequence#initialize;
  # the sixth is kept as the status.
  def initialize(*args)
    leading = args[0..4]
    super(*leading)
    @status = args[5]
  end

  # Always the empty string.
  def text
    ''
  end

  # Always true for this class.
  def empty?
    true
  end

  # Not supported for empty sequences; always raises.
  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
264
+
265
+ end
266
+
267
+ end