bio-maf 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/bio/maf/index.rb CHANGED
@@ -65,10 +65,11 @@ module Bio
65
65
  include KVHelpers
66
66
 
67
67
  attr_reader :db, :species, :species_max_id
68
- attr_accessor :index_sequences
68
+ attr_accessor :index_sequences, :ref_seq
69
69
 
70
70
  FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
71
71
  FORMAT_VERSION = 2
72
+ REF_SEQ_KEY = 'bio-maf:reference-sequence'
72
73
  MAX_SPECIES = 64
73
74
 
74
75
  ## Key-value store index format
@@ -221,6 +222,7 @@ module Bio
221
222
  raise "Could not open DB file!"
222
223
  end
223
224
  if mode == KyotoCabinet::DB::OREADER
225
+ self.ref_seq = db[REF_SEQ_KEY]
224
226
  load_index_sequences
225
227
  load_species
226
228
  end
@@ -309,11 +311,12 @@ module Bio
309
311
  end
310
312
  ready = Time.now
311
313
  $stderr.puts "bin intervals computed after #{ready - start} seconds."
312
- if RUBY_PLATFORM == 'java'
313
- scan_bins_parallel(chrom_id, bin_intervals, filters)
314
- else
315
- scan_bins(chrom_id, bin_intervals, filters)
316
- end
314
+ matches = if RUBY_PLATFORM == 'java'
315
+ scan_bins_parallel(chrom_id, bin_intervals, filters)
316
+ else
317
+ scan_bins(chrom_id, bin_intervals, filters)
318
+ end
319
+ matches.sort_by! { |e| e[0] } # sort by offset in file
317
320
  end # #fetch_list
318
321
 
319
322
  # Scan the index for blocks matching the given bins and intervals.
@@ -344,7 +347,7 @@ module Bio
344
347
 
345
348
  def scan_bins_parallel(chrom_id, bin_intervals, filters)
346
349
  start = Time.now
347
- n_threads = ENV['profile'] ? 1 : 4
350
+ n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
348
351
  jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
349
352
  completed = java.util.concurrent.LinkedBlockingQueue.new(128)
350
353
  threads = []
@@ -445,7 +448,8 @@ module Bio
445
448
 
446
449
  def build_default(parser)
447
450
  first_block = parser.parse_block
448
- ref_seq = first_block.sequences.first.source
451
+ self.ref_seq = first_block.sequences.first.source
452
+ db[REF_SEQ_KEY] = ref_seq
449
453
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
450
454
  @index_sequences = { ref_seq => 0 }
451
455
  store_index_sequences!
@@ -521,6 +525,9 @@ module Bio
521
525
  end
522
526
 
523
527
  def entries_for(block)
528
+ unless block.ref_seq.source == @ref_seq
529
+ raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
530
+ end
524
531
  h = {}
525
532
  val = build_block_value(block)
526
533
  block.sequences.each do |seq|
@@ -0,0 +1,267 @@
1
+ module Bio
2
+ module MAF
3
+
# Header of a MAF file: holds the variable-value pairs taken from the
# first line of the file along with the alignment parameters.
# @api public
class Header
  # Variable-value pairs from the ##maf line
  # @return [Hash]
  attr_accessor :vars

  # Alignment parameters from the MAF header.
  # @return [Hash]
  attr_accessor :alignment_params

  # @param vars [Hash] variable-value pairs, stored as {#vars}
  # @param params [Hash] alignment parameters, stored as
  #   {#alignment_params}
  def initialize(vars, params)
    @vars, @alignment_params = vars, params
  end

  # The required version parameter, looked up under the :version key
  # of {#vars}.
  # @return [String]
  def version
    vars[:version]
  end

  # The optional scoring parameter, looked up under the :scoring key
  # of {#vars}; nil when that key is absent.
  # @return [String]
  def scoring
    vars[:scoring]
  end
end
33
+
# A MAF alignment block.
# @api public
class Block
  # Matches a run of one or more gap ('-') characters.
  GAP = /-+/

  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  # @param vars parameters from the 'a' line
  # @param sequences [Array<Sequence>] sequences of the block
  # @param offset [Integer] offset of the block in the file, in bytes
  # @param size [Integer] size of the block in the file, in bytes
  # @param filtered [Boolean] value reported by {#filtered?}
  def initialize(vars, sequences, offset, size, filtered)
    @vars = vars
    @sequences = sequences
    @offset = offset
    @size = size
    @filtered = filtered
  end

  # The first sequence of the block.
  def ref_seq
    sequences[0]
  end

  # The sequence at index i.
  # @raise [IndexError] if i is out of bounds (Array#fetch semantics)
  def raw_seq(i)
    sequences.fetch(i)
  end

  # Yields each sequence of the block in order.
  def each_raw_seq
    sequences.each { |s| yield s }
  end

  # Text size of the alignment block. This is the number of text
  # characters in each line of sequence data, including dashes and
  # other gaps in the sequence.
  def text_size
    sequences.first.text.size
  end

  # Whether this block has been modified by a parser filter.
  # @return [Boolean]
  def filtered?
    @filtered
  end

  # Find gaps present in all non-empty sequences. These would
  # generally occur when some sequences have been filtered out.
  #
  # Each gap run in the first sequence is compared against the text
  # of every other non-empty sequence at the same offset. When all of
  # them have a gap starting there, the shortest of the gap runs is
  # recorded. A block with no other non-empty sequences reports every
  # gap run of its first sequence.
  #
  # @return [Array<Array(Integer, Integer)>] [offset, length] pairs in
  #   ascending offset order
  # @see #remove_gaps!
  # @see Parser#sequence_filter
  def find_gaps
    ref_s = StringScanner.new(sequences.first.text)
    others = sequences.drop(1).reject(&:empty?).map { |s| StringScanner.new(s.text) }
    gaps = []
    while ref_s.scan_until(GAP)
      offset = ref_s.pos - ref_s.matched_size
      # line every other scanner up with the start of the reference gap
      others.each { |s| s.pos = offset }
      next unless others.all? { |s| s.scan(GAP) }
      # Include the reference's own run in the same list so that an
      # empty `others` cannot inject nil into the comparison.
      gap_size = others.map(&:matched_size).push(ref_s.matched_size).min
      gaps << [offset, gap_size]
    end
    gaps
  end

  # Remove gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  #
  # Gaps are deleted from last to first so that earlier offsets stay
  # valid while the text shrinks.
  #
  # @return [Integer] number of gaps removed
  # @see #find_gaps
  # @see Parser#sequence_filter
  def remove_gaps!
    gaps = find_gaps
    gaps.reverse_each do |offset, len|
      sequences.each { |seq| seq.delete_text(offset, len) }
    end
    gaps.size
  end
end
120
+
# A sequence within an alignment block.
# @api public
class Sequence
  # @return [String] Source sequence name.
  attr_reader :source
  # @return [Integer] Zero-based start position.
  attr_reader :start
  # @return [Integer] Size of aligning region in source sequence.
  attr_reader :size
  # :+ or :-, indicating which strand the alignment is to.
  # @return [Symbol]
  attr_reader :strand
  # Size of the entire source sequence, not just the aligning
  # region.
  # @return [Integer]
  attr_reader :src_size
  # Sequence data for the alignment, including insertions.
  # @return [String]
  attr_reader :text
  # Array of raw synteny information from 'i' line.
  # @return [Array<String>]
  attr_accessor :i_data
  # Quality string from 'q' line.
  # @return [String]
  attr_accessor :quality
  alias_method :source_size, :src_size

  # Positional arguments, in order: source, start, size, strand,
  # src_size, text. Missing trailing arguments are left nil.
  def initialize(*args)
    @source, @start, @size, @strand, @src_size, @text = args
  end

  # Position one past the aligning region: start + size.
  # @return [Integer]
  def end
    start + size
  end

  # Whether this sequence is empty. Only true for {EmptySequence}
  # instances from 'e' lines.
  def empty?
    false
  end

  # Whether the text length differs from the size of the aligning
  # region, i.e. the text contains characters beyond the aligned
  # bases.
  def gapped?
    size != text.size
  end

  # The part of the source name before the first '.', or nil when
  # the source name contains no '.'.
  # @return [String, nil]
  def species
    parts = source.split('.', 2)
    parts.size == 2 ? parts[0] : nil
  end

  # Deletes len characters at offset from the text and, when present,
  # from the quality string. Does nothing for empty sequences.
  def delete_text(offset, len)
    return if empty?
    text.slice!(offset, len)
    quality.slice!(offset, len) if quality
  end

  # Passes a "source:start-end" description and the text to
  # writer.write.
  def write_fasta(writer)
    writer.write("#{source}:#{start}-#{self.end}",
                 text)
  end

  # Maps the given zero-based genomic range onto a range of string
  # offsets, suitable for extracting the text for the given range
  # from #text.
  #
  # @param range [Range] genomic range; inclusive and exclusive ends
  #   are both accepted
  # @return [Range] end-exclusive range of offsets into #text
  # @raise [RuntimeError] if the range lies outside the sequence
  #   bounds or cannot be located in the text
  # @see String#slice
  def text_range(range)
    # normalise to an exclusive end
    r_end = range.exclude_end? ? range.end : range.end + 1
    r_size = r_end - range.begin
    # special case, entire text
    return 0...text.size if range.begin == start && r_size == size
    if range.begin < start || r_end > self.end
      raise "Range #{range} outside sequence bounds; start #{start}, size #{size}"
    end
    # no gaps, can map indexes directly
    return (range.begin - start)...(r_end - start) unless gapped?
    gapped_text_range(range.begin, r_end)
  end

  private

  # Walks the text as alternating runs of word characters and gap
  # characters, tracking the genomic position and the text offset in
  # parallel, to locate the text offsets for the genomic span
  # r_begin...r_end.
  def gapped_text_range(r_begin, r_end)
    g_start = start # genomic position of the current run
    t_start = 0     # text position of the current run
    m_begin = nil   # text offset where the match begins
    text.scan(/\w+|-+/) do |part|
      unless part.start_with?('-')
        # sequence text; gap runs only advance the text position
        g_end = g_start + part.size
        if g_start <= r_begin && r_begin < g_end
          m_begin = t_start + (r_begin - g_start)
        end
        if g_start <= r_end && r_end <= g_end
          raise "reached end before start!" unless m_begin
          return m_begin...(t_start + (r_end - g_start))
        end
        g_start = g_end
      end
      t_start += part.size
    end
    raise "no match found!"
  end
end
237
+
# An empty sequence record from an 'e' line.
#
# This indicates that "there isn't aligning DNA for a species but
# that the current block is bridged by a chain that connects
# blocks before and after this block" (MAF spec).
# @api public
class EmptySequence < Sequence
  # Sixth constructor argument, stored as given.
  attr_reader :status

  # The first five arguments are forwarded to Sequence#initialize;
  # the sixth is kept as the status.
  def initialize(*args)
    super(*args.first(5))
    @status = args[5]
  end

  # Empty sequences carry no text; always an empty string.
  def text
    ''
  end

  # Always true for records from 'e' lines.
  def empty?
    true
  end

  # Not supported for empty sequences.
  # @raise [RuntimeError] always
  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
264
+
265
+ end
266
+
267
+ end