bio-maf 0.2.0-java → 0.3.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/.gitignore +1 -0
  2. data/Gemfile +3 -1
  3. data/README.md +98 -29
  4. data/Rakefile +6 -2
  5. data/bin/maf_tile +59 -35
  6. data/bio-maf.gemspec +4 -3
  7. data/features/block-joining.feature +32 -0
  8. data/features/dir-access.feature +46 -0
  9. data/features/maf-indexing.feature +23 -0
  10. data/features/maf-to-fasta.feature +9 -0
  11. data/features/slice.feature +54 -0
  12. data/features/step_definitions/dir-access_steps.rb +15 -0
  13. data/features/step_definitions/file_steps.rb +7 -0
  14. data/features/step_definitions/gap_removal_steps.rb +4 -0
  15. data/features/step_definitions/index_steps.rb +3 -3
  16. data/features/step_definitions/output_steps.rb +9 -1
  17. data/features/step_definitions/parse_steps.rb +13 -2
  18. data/features/step_definitions/query_steps.rb +7 -6
  19. data/features/step_definitions/slice_steps.rb +15 -0
  20. data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
  21. data/features/support/aruba.rb +1 -0
  22. data/features/support/env.rb +3 -1
  23. data/features/{gap-filling.feature → tiling.feature} +85 -0
  24. data/lib/bio/maf/index.rb +223 -11
  25. data/lib/bio/maf/maf.rb +209 -0
  26. data/lib/bio/maf/parser.rb +190 -111
  27. data/lib/bio/maf/tiler.rb +33 -6
  28. data/man/maf_index.1 +1 -1
  29. data/man/maf_tile.1 +7 -7
  30. data/man/maf_tile.1.ronn +21 -13
  31. data/man/maf_to_fasta.1 +1 -1
  32. data/spec/bio/maf/index_spec.rb +99 -0
  33. data/spec/bio/maf/maf_spec.rb +184 -0
  34. data/spec/bio/maf/parser_spec.rb +75 -115
  35. data/spec/bio/maf/tiler_spec.rb +44 -0
  36. data/test/data/chr22_ieq2.maf +11 -0
  37. data/test/data/gap-1.kct +0 -0
  38. data/test/data/gap-1.maf +9 -0
  39. data/test/data/gap-filled1.fa +6 -0
  40. data/test/data/gap-sp1.fa.gz +0 -0
  41. data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
  42. data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
  43. data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
  44. data/test/data/mm8_chrM_tiny.kct +0 -0
  45. data/test/data/mm8_chrM_tiny.maf +1000 -0
  46. metadata +65 -16
data/lib/bio/maf/tiler.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'pathname'
1
2
  require 'zlib'
2
3
 
3
4
  module Bio::MAF
@@ -9,7 +10,7 @@ module Bio::MAF
9
10
 
10
11
  attr_accessor :index
11
12
  attr_accessor :parser
12
- attr_accessor :reference
13
+ attr_reader :reference
13
14
  # GenomicInterval
14
15
  attr_accessor :interval
15
16
  attr_accessor :species
@@ -19,6 +20,25 @@ module Bio::MAF
19
20
  @species_map = {}
20
21
  end
21
22
 
23
+ # Set the reference sequence.
24
+ #
25
+ # @param source [FASTARangeReader, String, Pathname]
26
+ def reference=(source)
27
+ ref = case
28
+ when source.is_a?(FASTARangeReader)
29
+ source
30
+ when source.respond_to?(:seek)
31
+ # open file
32
+ FASTARangeReader.new(source)
33
+ when source.respond_to?(:start_with?) && source.start_with?('>')
34
+ # FASTA string
35
+ FASTARangeReader.new(StringIO.new(source))
36
+ else
37
+ FASTARangeReader.new(source.to_s)
38
+ end
39
+ @reference = ref
40
+ end
41
+
22
42
  def ref_data(range)
23
43
  if reference
24
44
  if reference.respond_to? :read_interval
@@ -33,8 +53,12 @@ module Bio::MAF
33
53
  end
34
54
  end
35
55
 
56
+ def species_to_use
57
+ species || species_map.keys
58
+ end
59
+
36
60
  def tile
37
- parser.sequence_filter[:only_species] = @species
61
+ parser.sequence_filter[:only_species] = species_to_use
38
62
  # TODO: remove gaps
39
63
  blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
40
64
  mask = Array.new(interval.length, :ref)
@@ -51,7 +75,7 @@ module Bio::MAF
51
75
  (slice_start - i_start)...(slice_end - i_start))
52
76
  end
53
77
  text = []
54
- species.each { |s| text << '' }
78
+ species_to_use.each { |s| text << '' }
55
79
  nonref_text = text[1...text.size]
56
80
  runs(mask) do |range, block|
57
81
  g_range = (range.begin + i_start)...(range.end + i_start)
@@ -69,7 +93,7 @@ module Bio::MAF
69
93
  else
70
94
  # covered by an alignment block
71
95
  t_range = block.ref_seq.text_range(g_range)
72
- species.each_with_index do |species, i|
96
+ species_to_use.each_with_index do |species, i|
73
97
  sp_text = text[i]
74
98
  seq = block.sequences.find { |s| s.source == species || s.species == species }
75
99
  if seq
@@ -86,7 +110,7 @@ module Bio::MAF
86
110
  end
87
111
 
88
112
  def write_fasta(f)
89
- species.zip(tile()) do |species, text|
113
+ species_to_use.zip(tile()) do |species, text|
90
114
  sp_out = species_map[species] || species
91
115
  f.puts ">#{sp_out}"
92
116
  f.puts text
@@ -147,10 +171,13 @@ module Bio::MAF
147
171
  line = line_raw.strip
148
172
  end_pos = pos + line.size
149
173
  if (! in_region) && pos <= z_start && z_start < end_pos
150
- data << line.slice((z_start - pos)...(line.size))
174
+ offset = z_start - pos
175
+ end_offset = [(offset + region_size), line.size].min
176
+ data << line.slice(offset...end_offset)
151
177
  in_region = true
152
178
  elsif in_region
153
179
  need = region_size - data.size
180
+ raise "should not happen: region #{region_size}, data #{data.size}, need #{need}" if need < 0
154
181
  if need > line.size
155
182
  data << line
156
183
  else
data/man/maf_index.1 CHANGED
@@ -1,7 +1,7 @@
1
1
  .\" generated with Ronn/v0.7.3
2
2
  .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
3
  .
4
- .TH "MAF_INDEX" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
4
+ .TH "MAF_INDEX" "1" "July 2012" "BioRuby" "BioRuby Manual"
5
5
  .
6
6
  .SH "NAME"
7
7
  \fBmaf_index\fR \- build and examine MAF indexes
data/man/maf_tile.1 CHANGED
@@ -1,22 +1,22 @@
1
1
  .\" generated with Ronn/v0.7.3
2
2
  .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
3
  .
4
- .TH "MAF_TILE" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
4
+ .TH "MAF_TILE" "1" "July 2012" "BioRuby" "BioRuby Manual"
5
5
  .
6
6
  .SH "NAME"
7
7
  \fBmaf_tile\fR \- synthesize an alignment for a given region
8
8
  .
9
9
  .SH "SYNOPSIS"
10
- \fBmaf_tile\fR [\fIoptions\fR] \-i BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
10
+ \fBmaf_tile\fR [\fIoptions\fR] \-i [SEQ:]BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
11
11
  .
12
12
  .P
13
- \fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
13
+ \fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
14
14
  .
15
15
  .SH "DESCRIPTION"
16
- \fBmaf_tile\fR takes a MAF file with index (generated by maf_index(1)), extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
16
+ \fBmaf_tile\fR takes a MAF file, with optional index, or directory of indexed MAF files, extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
17
17
  .
18
18
  .P
19
- If a single interval is specified, the output will be written to stdout in FASTA format\. If the \fB\-\-output\-base\fR option is specified, \fB_<start>:<end>\.fa\fR will be appended to the given parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
19
+ If a single interval is specified, the output will be written to stdout in FASTA format\. If a directory of MAF files is supplied as the \fImaf\fR parameter, the interval must include the sequence identifier in the form \fBsequence:begin:end\fR\. If the \fB\-\-output\-base\fR option is specified, \fB_<begin>:<end>\.fa\fR will be appended to the given parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
20
20
  .
21
21
  .P
22
22
  Species can be renamed for output by specifying them as SPECIES:NAME; the first component will be used to select the species from the MAF file, and the second will be used in the FASTA description line for output\.
@@ -84,10 +84,10 @@ $ maf_tile \-\-bed /tmp/mm8\.bed \-\-output\-base /tmp/mm8 \e
84
84
  The output is generated in FASTA format, with one sequence per species\.
85
85
  .
86
86
  .P
87
- The input \fImaf\fR file must be a Multiple Alignment Format file\.
87
+ The \fImaf\fR parameter must specify either a Multiple Alignment Format (MAF) file or a directory of such files, with indexes\.
88
88
  .
89
89
  .P
90
- The \fIindex\fR must be a MAF index built with maf_index(1)\.
90
+ The \fIindex\fR must be a MAF index built with maf_index(1)\. This parameter is ignored if the \fImaf\fR parameter is a directory\. It can be omitted if a single MAF file is given, but in this case the entire file will be parsed to build a temporary index\. For large files which will be reused, this is not advisable\.
91
91
  .
92
92
  .P
93
93
  If \fB\-\-bed\fR \fIbed\fR is specified, its argument must be a BED file\. Only the second and third columns will be used, to specify the zero\-based start and end positions of intervals\.
data/man/maf_tile.1.ronn CHANGED
@@ -3,23 +3,26 @@ maf_tile(1) -- synthesize an alignment for a given region
3
3
 
4
4
  ## SYNOPSIS
5
5
 
6
- `maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf> <index>
6
+ `maf_tile` [<options>] -i [SEQ:]BEGIN:END [-s SPECIES[:NAME] ...] <maf> [index]
7
7
 
8
- `maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> <index>
8
+ `maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> [index]
9
9
 
10
10
  ## DESCRIPTION
11
11
 
12
- **maf_tile** takes a MAF file with index (generated by maf_index(1)),
13
- extracts alignment blocks overlapping the given genomic interval, and
14
- constructs a single alignment block covering the entire interval for
15
- the specified species. Optionally, any gaps in coverage of the MAF
16
- file's reference sequence can be filled in from a FASTA sequence file.
12
+ **maf_tile** takes a MAF file, with optional index, or directory of
13
+ indexed MAF files, extracts alignment blocks overlapping the given
14
+ genomic interval, and constructs a single alignment block covering the
15
+ entire interval for the specified species. Optionally, any gaps in
16
+ coverage of the MAF file's reference sequence can be filled in from a
17
+ FASTA sequence file.
17
18
 
18
19
  If a single interval is specified, the output will be written to
19
- stdout in FASTA format. If the `--output-base` option is specified,
20
- `_<start>:<end>.fa` will be appended to the given <base> parameter and
21
- used to construct the output path. If a BED file is specified with
22
- `--bed`, `--output-base` is also required.
20
+ stdout in FASTA format. If a directory of MAF files is supplied as the
21
+ <maf> parameter, the interval must include the sequence identifier in
22
+ the form `sequence:begin:end`. If the `--output-base` option is
23
+ specified, `_<begin>:<end>.fa` will be appended to the given <base>
24
+ parameter and used to construct the output path. If a BED file is
25
+ specified with `--bed`, `--output-base` is also required.
23
26
 
24
27
  Species can be renamed for output by specifying them as SPECIES:NAME;
25
28
  the first component will be used to select the species from the MAF
@@ -80,9 +83,14 @@ sequence:
80
83
  The output is generated in FASTA format, with one sequence per
81
84
  species.
82
85
 
83
- The input <maf> file must be a Multiple Alignment Format file.
86
+ The <maf> parameter must specify either a Multiple Alignment Format
87
+ (MAF) file or a directory of such files, with indexes.
84
88
 
85
- The <index> must be a MAF index built with maf_index(1).
89
+ The <index> must be a MAF index built with maf_index(1). This
90
+ parameter is ignored if the <maf> parameter is a directory. It can be
91
+ omitted if a single MAF file is given, but in this case the entire
92
+ file will be parsed to build a temporary index. For large files which
93
+ will be reused, this is not advisable.
86
94
 
87
95
  If `--bed` <bed> is specified, its argument must be a BED file. Only
88
96
  the second and third columns will be used, to specify the zero-based
data/man/maf_to_fasta.1 CHANGED
@@ -1,7 +1,7 @@
1
1
  .\" generated with Ronn/v0.7.3
2
2
  .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
3
  .
4
- .TH "MAF_TO_FASTA" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
4
+ .TH "MAF_TO_FASTA" "1" "July 2012" "BioRuby" "BioRuby Manual"
5
5
  .
6
6
  .SH "NAME"
7
7
  \fBmaf_to_fasta\fR \- convert MAF file to FASTA
data/spec/bio/maf/index_spec.rb CHANGED
@@ -3,6 +3,73 @@ require 'spec_helper'
3
3
  module Bio
4
4
  module MAF
5
5
 
6
+ describe Access do
7
+ describe "#tile" do
8
+ it "gives correct output with a Pathname" do
9
+ access = Access.maf_dir(TestData)
10
+ interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
11
+ buf = StringIO.new
12
+ access.tile(interval) do |tiler|
13
+ tiler.reference = TestData + 'gap-sp1.fa'
14
+ tiler.species = %w(sp1 sp2 sp3)
15
+ tiler.write_fasta(buf)
16
+ end
17
+ buf.string.should == File.read(TestData + 'gap-filled1.fa')
18
+ end
19
+ it "gives correct output with only a species map" do
20
+ access = Access.maf_dir(TestData)
21
+ interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
22
+ buf = StringIO.new
23
+ access.tile(interval) do |tiler|
24
+ tiler.reference = TestData + 'gap-sp1.fa'
25
+ tiler.species_map = {
26
+ 'sp1' => 'sp1',
27
+ 'sp2' => 'sp2',
28
+ 'sp3' => 'sp3'
29
+ }
30
+ tiler.write_fasta(buf)
31
+ end
32
+ buf.string.should == File.read(TestData + 'gap-filled1.fa')
33
+ end
34
+ it "gives correct output with no species specified" do
35
+ pending("issue 88") do
36
+ access = Access.maf_dir(TestData)
37
+ interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
38
+ buf = StringIO.new
39
+ access.tile(interval) do |tiler|
40
+ tiler.reference = TestData + 'gap-sp1.fa'
41
+ tiler.write_fasta(buf)
42
+ end
43
+ buf.string.should == File.read(TestData + 'gap-filled1.fa')
44
+ end
45
+ end
46
+ end
47
+ describe ".file" do
48
+ it "accepts a MAF file and index" do
49
+ access = Access.file(TestData + 'gap-1.maf',
50
+ TestData + 'gap-1.kct')
51
+ blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
52
+ 10,
53
+ 23)]).to_a
54
+ blocks.size.should == 1
55
+ end
56
+ it "accepts a MAF file and finds the index" do
57
+ access = Access.file(TestData + 'gap-1.maf')
58
+ blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
59
+ 10,
60
+ 23)]).to_a
61
+ blocks.size.should == 1
62
+ end
63
+ it "accepts a MAF file and builds a temp index" do
64
+ access = Access.file(TestData + 'chrY-1block.maf')
65
+ blocks = access.find([GenomicInterval.zero_based('hg19.chrY',
66
+ 10501,
67
+ 10544)]).to_a
68
+ blocks.size.should == 1
69
+ end
70
+ end
71
+ end
72
+
6
73
  describe KyotoIndex do
7
74
  def has_at_least_n_with_prefix(n, start)
8
75
  @idx.db.cursor_process do |cur|
@@ -87,6 +154,38 @@ module Bio
87
154
  l[0].offset.should == 16
88
155
  end
89
156
 
157
+ it "takes a block arg" do
158
+ called = false
159
+ @idx.find([GenomicInterval.zero_based('mm8.chr7',
160
+ 80082334,
161
+ 80082338)],
162
+ @p) do |block|
163
+ block.offset.should == 16
164
+ called = true
165
+ end
166
+ called.should be_true
167
+ end
168
+
169
+ it "with a block and no match, returns" do
170
+ called = false
171
+ @idx.find([GenomicInterval.zero_based('mm8.chr7',
172
+ 20082334,
173
+ 20082338)],
174
+ @p) do |block|
175
+ called = true
176
+ end
177
+ called.should be_false
178
+ end
179
+
180
+ it "with no block and no match, returns an empty list" do
181
+ v = @idx.find([GenomicInterval.zero_based('mm8.chr7',
182
+ 20082334,
183
+ 20082338)],
184
+ @p)
185
+ v.should_not be_nil
186
+ v.should respond_to(:count)
187
+ end
188
+
90
189
  after(:each) do
91
190
  @idx.db.close
92
191
  @p.f.close
data/spec/bio/maf/maf_spec.rb ADDED
@@ -0,0 +1,184 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
+ describe Header do
7
+ before(:each) do
8
+ @p = Parser.new(TestData + 't1.maf')
9
+ end
10
+
11
+ it "provides version information" do
12
+ @p.header.version.should == '1'
13
+ end
14
+ it "provides the scoring scheme" do
15
+ @p.header.scoring.should == 'humor.v4'
16
+ end
17
+ it "provides alignment parameters" do
18
+ @p.header.alignment_params.should =~ /humor.v4 R=30/
19
+ end
20
+
21
+ it "presents multiline parameters correctly" do
22
+ @p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
23
+ end
24
+
25
+ it "provides arbitrary parameters"
26
+ end
27
+
28
+ describe Block do
29
+ describe "#find_gaps" do
30
+ it "finds a single 14-base gap" do
31
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
32
+ p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
33
+ block = p.parse_block
34
+ gaps = block.find_gaps
35
+ gaps.size.should == 1
36
+ gaps[0][0].should == 34
37
+ gaps[0][1].should == 14
38
+ end
39
+ end
40
+ describe "#remove_gaps!" do
41
+ it "removes a single 14-base gap" do
42
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
43
+ p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
44
+ block = p.parse_block
45
+ block.sequences.size.should == 5
46
+ block.text_size.should == 54
47
+ block.remove_gaps!
48
+ block.text_size.should == 40
49
+ end
50
+ end
51
+ describe "#joinable_with?" do
52
+ it "is false for blocks with different sequences" do
53
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
54
+ sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 dasNov1 loxAfr1 echTel1)
55
+ p.sequence_filter = { :only_species => sp }
56
+ b1 = p.parse_block
57
+ b2 = p.parse_block
58
+ b1.joinable_with?(b2).should be_false
59
+ end
60
+ it "is true for blocks with same sequences" do
61
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
62
+ sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 loxAfr1 echTel1)
63
+ p.sequence_filter = { :only_species => sp }
64
+ b1 = p.parse_block
65
+ b2 = p.parse_block
66
+ b1.joinable_with?(b2).should be_true
67
+ end
68
+ end
69
+ describe "#to_bio_alignment" do
70
+ it "returns a usable Bio::BioAlignment::Alignment" do
71
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
72
+ b = p.parse_block
73
+ ba = b.to_bio_alignment
74
+ ba.size.should == 10
75
+ ba.sequences[0].id.should == "mm8.chr7"
76
+ ba.sequences[0].seq.should =~ /^GGGCTGAGGGC--/
77
+ end
78
+ end
79
+ end
80
+
81
+ describe Sequence do
82
+ before(:each) do
83
+ @parser = DummyParser.new
84
+ end
85
+
86
+ describe "#gapped?" do
87
+ it "is false for sequences with no gaps" do
88
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
89
+ s = @parser.parse_seq_line(line, nil)
90
+ s.gapped?.should be_false
91
+ end
92
+ it "is true for sequences with gaps" do
93
+ line = "s human_unc 9077 8 + 10998 AC-AGTATT"
94
+ s = @parser.parse_seq_line(line, nil)
95
+ s.gapped?.should be_true
96
+ end
97
+ end
98
+
99
+ describe "#text_range" do
100
+ it "returns 0...text.size for a spanning interval" do
101
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
102
+ s = @parser.parse_seq_line(line, nil)
103
+ range = s.text_range(9077...(9077 + 8))
104
+ range.should == (0...(s.text.size))
105
+ end
106
+ it "returns 0...text.size for a gapped spanning interval" do
107
+ line = "s human_unc 9077 8 + 10998 AC--AGTATT"
108
+ s = @parser.parse_seq_line(line, nil)
109
+ range = s.text_range(9077...(9077 + 8))
110
+ range.should == (0...(s.text.size))
111
+ end
112
+ it "handles a leading subset" do
113
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
114
+ s = @parser.parse_seq_line(line, nil)
115
+ range = s.text_range(9077...(9077 + 2))
116
+ range.should == (0...2)
117
+ end
118
+ it "handles a trailing subset" do
119
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
120
+ s = @parser.parse_seq_line(line, nil)
121
+ range = s.text_range(9079...9085)
122
+ range.should == (2...8)
123
+ end
124
+ it "handles a gap in the middle" do
125
+ line = "s human_unc 9077 8 + 10998 AC--AGTATT"
126
+ s = @parser.parse_seq_line(line, nil)
127
+ range = s.text_range(9078...(9077 + 8))
128
+ range.should == (1...(s.text.size))
129
+ end
130
+ it "errors on a range starting before" do
131
+ expect {
132
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
133
+ s = @parser.parse_seq_line(line, nil)
134
+ range = s.text_range(9076...(9077 + 8))
135
+ }.to raise_error
136
+ end
137
+ it "errors on a range ending after" do
138
+ expect {
139
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
140
+ s = @parser.parse_seq_line(line, nil)
141
+ range = s.text_range(9076...(9077 + 9))
142
+ }.to raise_error
143
+ end
144
+ end
145
+
146
+ describe "synteny data" do
147
+ it "extracts basic data from i lines" do
148
+ p = Parser.new(TestData + 'chr22_ieq2.maf',
149
+ :parse_extended => true)
150
+ b = p.parse_block
151
+ b.sequences[0].left_status_char.should be_nil
152
+ b.sequences[0].left_status.should be_nil
153
+ b.sequences[0].left_count.should be_nil
154
+ b.sequences[0].right_status_char.should be_nil
155
+ b.sequences[0].right_status.should be_nil
156
+ b.sequences[0].right_count.should be_nil
157
+ # works but let's not over-specify internal state
158
+ #b.sequences[1].i_data.should == %w(N 0 C 0)
159
+ b.sequences[1].left_status_char.should == 'N'
160
+ b.sequences[1].left_status.should == :first
161
+ b.sequences[1].right_status_char.should == 'C'
162
+ b.sequences[1].right_status.should == :contiguous
163
+ b.sequences[2].left_status.should == :contiguous
164
+ b.sequences[2].right_status_char.should == 'I'
165
+ b.sequences[2].right_status.should == :intervening
166
+ b.sequences[2].right_count.should == 146
167
+ end
168
+ end
169
+
170
+ describe "#to_bioalignment" do
171
+ it "returns a usable Bio::BioAlignment::Sequence" do
172
+ @parser = DummyParser.new
173
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
174
+ s = @parser.parse_seq_line(line, nil)
175
+ as = s.to_bio_alignment
176
+ as.id.should == "human_unc"
177
+ as.seq.should == "ACAGTATT"
178
+ end
179
+ end
180
+
181
+ end
182
+
183
+ end
184
+ end