bio-maf 0.1.0-java → 0.2.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,50 @@
1
+ Feature: Remove gaps from MAF files
2
+ In order to work with only the alignment data involving sequences
3
+ Which can be used by downstream software
4
+ We may want to filter out certain species
5
+ Which can leave gap regions where sequence data was only present
6
+ For removed species
7
+ So it is useful to be able to remove those gaps
8
+
9
+ Background:
10
+ Given MAF data:
11
+ """
12
+ ##maf version=1
13
+ a score=10542.0
14
+ s mm8.chr7 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
15
+ s rn4.chr1 136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
16
+ s oryCun1.scaffold_199771 14021 43 - 75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
17
+ s hg18.chr15 88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
18
+ s panTro2.chr15 87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
19
+ s rheMac2.chr7 69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
20
+ s canFam2.chr3 56030570 39 + 94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
21
+ s dasNov1.scaffold_106893 7435 34 + 9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
22
+ s loxAfr1.scaffold_8298 30264 38 + 78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
23
+ s echTel1.scaffold_304651 594 37 - 10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
24
+ """
25
+ When I open it with a MAF reader
26
+ And filter for only the species
27
+ | mm8 |
28
+ | rn4 |
29
+ | hg18 |
30
+ | canFam2 |
31
+ | loxAfr1 |
32
+
33
+ Scenario: Detect filtered blocks
34
+ When an alignment block can be obtained
35
+ Then the alignment block is marked as filtered
36
+ And the alignment block has 5 sequences
37
+
38
+ Scenario: Detect gaps
39
+ When an alignment block can be obtained
40
+ Then 1 gap is found with length [14]
41
+
42
+ Scenario: Remove gaps
43
+ When an alignment block can be obtained
44
+ And gaps are removed
45
+ Then the text size of the block is 40
46
+
47
+ Scenario: Remove gaps in the parser
48
+ When I enable the :remove_gaps parser option
49
+ And an alignment block can be obtained
50
+ Then the text size of the block is 40
@@ -0,0 +1,32 @@
1
# Cucumber step definitions for tiling alignment data over a genomic
# interval and checking the FASTA output.

# Wraps the given text in a StringIO and hands it to a
# FASTARangeReader, kept as the chromosome reference.
Given /^chromosome reference sequence:$/ do |fasta_text|
  @refseq = Bio::MAF::FASTARangeReader.new(StringIO.new(fasta_text))
end

# Creates a tiler wired to the current index and parser, optionally
# with the reference sequence, for the zero-based interval named in
# the step text.
When /^tile ([^:\s]+):(\d+)-(\d+)( with the chromosome reference)?$/ do |chrom, from, to, with_ref|
  @tiler = Bio::MAF::Tiler.new
  @tiler.index = @idx
  @tiler.parser = @parser
  @tiler.reference = @refseq if with_ref
  @tiler.interval = Bio::GenomicInterval.zero_based(chrom, from.to_i, to.to_i)
end

# The species list is comma-separated, with optional whitespace
# after each comma.
When /^tile with species \[(.+?)\]$/ do |species_list|
  @tiler.species = species_list.split(/,\s*/)
end

When /^map species (\S+) as (\S+)$/ do |from_name, to_name|
  @tiler.species_map[from_name] = to_name
end

# Output goes to a temporary file which is read back by the
# following step.
When /^write the tiled data as FASTA$/ do
  @dst = Tempfile.new(["cuke", ".fa"])
  @tiler.write_fasta(@dst)
end

# Compares output and expectation, ignoring trailing whitespace.
Then /^the FASTA data obtained should be:$/ do |expected|
  @dst.seek(0)
  actual = @dst.read.rstrip
  actual.should == expected.rstrip
end
@@ -0,0 +1,19 @@
1
# Cucumber step definitions for detecting and removing gaps in
# filtered alignment blocks.

Then /^the alignment block is marked as filtered$/ do
  @block.filtered?.should be_true
end

# Checks both the number of gaps and their lengths, in order.
# The bracketed list accepts one or more comma-separated integers,
# e.g. "[14]" or "[14, 3]"; the original pattern only accepted a
# single number even though the body splits on commas.
Then /^(\d+) gaps? (?:is|are) found with lengths? \[(\d+(?:,\s*\d+)*)\]$/ do |n_gaps, gap_sizes_s|
  gaps = @block.find_gaps
  gaps.size.should == n_gaps.to_i
  expected_sizes = gap_sizes_s.split(/,\s*/).map { |n| n.to_i }
  # each gap is an [offset, length] pair; compare lengths only
  gaps.map { |gap| gap[1] }.should == expected_sizes
end

When /^gaps are removed$/ do
  @block.remove_gaps!
end

Then /^the text size of the block is (\d+)$/ do |e_text_size|
  @block.text_size.should == e_text_size.to_i
end
@@ -1,5 +1,6 @@
1
1
  When /^I open it with a MAF reader$/ do
2
- @parser = Bio::MAF::Parser.new(@src_f, @opts || {})
2
+ @opts ||= {}
3
+ @parser = Bio::MAF::Parser.new(@src_f, @opts)
3
4
  end
4
5
 
5
6
  When /^I enable the :(\S+) parser option$/ do |opt_s|
@@ -1,4 +1,6 @@
1
+ require 'bio/maf/maf'
1
2
  require 'bio/maf/struct'
2
3
  require 'bio/maf/index'
3
4
  require 'bio/maf/parser'
4
5
  require 'bio/maf/writer'
6
+ require 'bio/maf/tiler'
@@ -65,10 +65,11 @@ module Bio
65
65
  include KVHelpers
66
66
 
67
67
  attr_reader :db, :species, :species_max_id
68
- attr_accessor :index_sequences
68
+ attr_accessor :index_sequences, :ref_seq
69
69
 
70
70
  FORMAT_VERSION_KEY = 'bio-maf:index-format-version'
71
71
  FORMAT_VERSION = 2
72
+ REF_SEQ_KEY = 'bio-maf:reference-sequence'
72
73
  MAX_SPECIES = 64
73
74
 
74
75
  ## Key-value store index format
@@ -221,6 +222,7 @@ module Bio
221
222
  raise "Could not open DB file!"
222
223
  end
223
224
  if mode == KyotoCabinet::DB::OREADER
225
+ self.ref_seq = db[REF_SEQ_KEY]
224
226
  load_index_sequences
225
227
  load_species
226
228
  end
@@ -309,11 +311,12 @@ module Bio
309
311
  end
310
312
  ready = Time.now
311
313
  $stderr.puts "bin intervals computed after #{ready - start} seconds."
312
- if RUBY_PLATFORM == 'java'
313
- scan_bins_parallel(chrom_id, bin_intervals, filters)
314
- else
315
- scan_bins(chrom_id, bin_intervals, filters)
316
- end
314
+ matches = if RUBY_PLATFORM == 'java'
315
+ scan_bins_parallel(chrom_id, bin_intervals, filters)
316
+ else
317
+ scan_bins(chrom_id, bin_intervals, filters)
318
+ end
319
+ matches.sort_by! { |e| e[0] } # sort by offset in file
317
320
  end # #fetch_list
318
321
 
319
322
  # Scan the index for blocks matching the given bins and intervals.
@@ -344,7 +347,7 @@ module Bio
344
347
 
345
348
  def scan_bins_parallel(chrom_id, bin_intervals, filters)
346
349
  start = Time.now
347
- n_threads = ENV['profile'] ? 1 : 4
350
+ n_threads = ENV['profile'] ? 1 : java.lang.Runtime.runtime.availableProcessors
348
351
  jobs = java.util.concurrent.ConcurrentLinkedQueue.new(bin_intervals.to_a)
349
352
  completed = java.util.concurrent.LinkedBlockingQueue.new(128)
350
353
  threads = []
@@ -445,7 +448,8 @@ module Bio
445
448
 
446
449
  def build_default(parser)
447
450
  first_block = parser.parse_block
448
- ref_seq = first_block.sequences.first.source
451
+ self.ref_seq = first_block.sequences.first.source
452
+ db[REF_SEQ_KEY] = ref_seq
449
453
  db[FORMAT_VERSION_KEY] = FORMAT_VERSION
450
454
  @index_sequences = { ref_seq => 0 }
451
455
  store_index_sequences!
@@ -521,6 +525,9 @@ module Bio
521
525
  end
522
526
 
523
527
  def entries_for(block)
528
+ unless block.ref_seq.source == @ref_seq
529
+ raise "Inconsistent reference sequence: expected #{@ref_seq}, got #{block.ref_seq.source}"
530
+ end
524
531
  h = {}
525
532
  val = build_block_value(block)
526
533
  block.sequences.each do |seq|
@@ -0,0 +1,267 @@
1
+ module Bio
2
+ module MAF
3
+
4
# A MAF header: the variable-value pairs from the first line of the
# file together with the alignment parameters.
# @api public
class Header
  # @return [Hash] variable-value pairs from the ##maf line
  attr_accessor :vars
  # @return [Hash] alignment parameters from the MAF header
  attr_accessor :alignment_params

  # @param vars [Hash] variable-value pairs from the ##maf line
  # @param params [Hash] alignment parameters
  def initialize(vars, params)
    @vars, @alignment_params = vars, params
  end

  # The required version parameter.
  # @return [String]
  def version
    vars[:version]
  end

  # The optional scoring parameter; nil when absent.
  # @return [String]
  def scoring
    vars[:scoring]
  end
end
33
+
34
# A MAF alignment block.
# @api public
class Block
  # Matches a run of one or more gap characters.
  GAP = /-+/

  # Parameters from the 'a' line starting the alignment block.
  attr_reader :vars
  # Sequences, one per 's' or 'e' line.
  # @return [Array<Sequence>]
  attr_reader :sequences
  # Offset of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :offset
  # Size of the alignment block within the MAF file, in bytes.
  # @return [Integer]
  attr_reader :size

  # @param vars parameters from the 'a' line
  # @param sequences [Array<Sequence>] sequences of the block
  # @param offset [Integer] byte offset of the block in the file
  # @param size [Integer] byte size of the block in the file
  # @param filtered [Boolean] whether a parser filter modified the block
  def initialize(vars, sequences, offset, size, filtered)
    @vars = vars
    @sequences = sequences
    @offset = offset
    @size = size
    @filtered = filtered
  end

  # The first sequence of the block.
  def ref_seq
    sequences[0]
  end

  # The sequence at index i.
  # @raise [IndexError] if i is out of bounds
  def raw_seq(i)
    sequences.fetch(i)
  end

  # Yields each sequence of the block in order.
  def each_raw_seq
    sequences.each { |s| yield s }
  end

  # Text size of the alignment block. This is the number of text
  # characters in each line of sequence data, including dashes and
  # other gaps in the sequence.
  def text_size
    sequences.first.text.size
  end

  # Whether this block has been modified by a parser filter.
  # @return [Boolean]
  def filtered?
    @filtered
  end

  # Find gaps present in all sequences, i.e. runs of columns in
  # which every non-empty sequence has a gap character. These would
  # generally occur when some sequences have been filtered out.
  #
  # Empty sequences ('e' lines) other than the first are ignored. A
  # block with a single sequence reports that sequence's own gap
  # runs, and a block without sequences reports no gaps.
  #
  # @return [Array<Array(Integer, Integer)>] [offset, length] pairs
  #   in ascending offset order, non-overlapping.
  # @see #remove_gaps!
  # @see Parser#sequence_filter
  def find_gaps
    return [] if sequences.empty?
    ref_text = sequences.first.text
    # Start from the gap runs of the first sequence (any all-gap
    # column must be a gap there too), then narrow them down against
    # each remaining sequence in turn.
    shared = gap_runs(ref_text, 0, ref_text.size)
    sequences.drop(1).each do |seq|
      break if shared.empty?
      next if seq.empty?
      shared = shared.flat_map { |offset, len| gap_runs(seq.text, offset, len) }
    end
    shared
  end

  # Remove gaps present in all sequences. These would generally
  # occur when some sequences have been filtered out.
  # @return [Integer] number of gaps removed
  # @see #find_gaps
  # @see Parser#sequence_filter
  def remove_gaps!
    gaps = find_gaps
    # delete from the end so earlier offsets remain valid
    gaps.reverse_each do |offset, len|
      sequences.each do |seq|
        seq.delete_text(offset, len)
      end
    end
    gaps.size
  end

  private

  # Gap runs of text lying within the window of len characters
  # starting at offset, as [offset, length] pairs relative to the
  # start of text. Returns [] if the window starts beyond the text.
  def gap_runs(text, offset, len)
    window = text[offset, len]
    return [] unless window
    runs = []
    window.scan(GAP) do
      m = Regexp.last_match
      runs << [offset + m.begin(0), m[0].size]
    end
    runs
  end
end
120
+
121
# A sequence within an alignment block.
# @api public
class Sequence
  # @return [String] Source sequence name.
  attr_reader :source
  # @return [Integer] Zero-based start position.
  attr_reader :start
  # @return [Integer] Size of aligning region in source sequence.
  attr_reader :size
  # :+ or :-, indicating which strand the alignment is to.
  # @return [Symbol]
  attr_reader :strand
  # Size of the entire source sequence, not just the aligning
  # region.
  # @return [Integer]
  attr_reader :src_size
  # Sequence data for the alignment, including insertions.
  # @return [String]
  attr_reader :text
  # Array of raw synteny information from 'i' line.
  # @return [Array<String>]
  attr_accessor :i_data
  # Quality string from 'q' line.
  # @return [String]
  attr_accessor :quality
  alias_method :source_size, :src_size

  # Positional arguments, in order: source, start, size, strand,
  # src_size, text.
  def initialize(*args)
    @source, @start, @size, @strand, @src_size, @text = args
  end

  # Position just past the aligning region (start plus size).
  def end
    start + size
  end

  # Whether this sequence is empty. Only true for {EmptySequence}
  # instances from 'e' lines.
  def empty?
    false
  end

  # Whether the text length differs from the aligned size, which is
  # the case when the text contains gap characters.
  def gapped?
    text.size != size
  end

  # The part of the source name before the first '.', or nil if the
  # source name contains no '.'.
  def species
    name, remainder = source.split('.', 2)
    remainder ? name : nil
  end

  # Deletes len characters at offset from the text, and from the
  # quality string if there is one. Does nothing for empty sequences.
  def delete_text(offset, len)
    return if empty?
    text.slice!(offset, len)
    quality.slice!(offset, len) if quality
  end

  # Writes this sequence through writer, using
  # "source:start-end" as the description.
  def write_fasta(writer)
    description = "#{source}:#{start}-#{start + size}"
    writer.write(description, text)
  end

  # Maps the given zero-based genomic range onto a range of string
  # offsets, suitable for extracting the text for the given range
  # from #text.
  #
  # @param range [Range] genomic range; may include or exclude its end
  # @return [Range] end-exclusive range of offsets into #text
  # @raise [RuntimeError] if the range lies outside this sequence
  # @see String#slice
  def text_range(range)
    first = range.begin
    # normalise to an exclusive end position
    stop = range.exclude_end? ? range.end : range.end + 1

    # special case, entire text
    return 0...text.size if first == start && stop - first == size

    if first < start || stop > self.end
      raise "Range #{range} outside sequence bounds; start #{start}, size #{size}"
    end

    # no gaps, can map indexes directly
    return (first - start)...(stop - start) unless gapped?

    # gaps present: walk alternating runs of sequence text and gaps,
    # tracking both the genomic and the text position
    genomic_pos = start
    text_pos = 0
    match_begin = nil
    text.scan(/(\w+|-+)/) do |captures|
      run = captures.first
      unless run[0] == '-'
        # sequence text; gap runs only advance the text position
        run_end = genomic_pos + run.size
        if genomic_pos <= first && first < run_end
          match_begin = first - genomic_pos + text_pos
        end
        if genomic_pos <= stop && stop <= run_end
          raise "reached end before start!" unless match_begin
          return match_begin...(stop - genomic_pos + text_pos)
        end
        genomic_pos = run_end
      end
      text_pos += run.size
    end
    raise "no match found!"
  end
end
237
+
238
# An empty sequence record from an 'e' line.
#
# This indicates that "there isn't aligning DNA for a species but
# that the current block is bridged by a chain that connects
# blocks before and after this block" (MAF spec).
# @api public
class EmptySequence < Sequence
  # Sixth constructor argument, stored as given.
  attr_reader :status

  # The first five arguments are passed on to {Sequence}; the sixth
  # is kept as the status rather than as text.
  def initialize(*args)
    super(*args.first(5))
    @status = args[5]
  end

  # Empty sequences carry no alignment text.
  def text
    ''
  end

  # Always true for 'e' line records.
  def empty?
    true
  end

  # FASTA output is not supported for empty sequences.
  def write_fasta(writer)
    raise "empty sequence output not implemented!"
  end
end
264
+
265
+ end
266
+
267
+ end