bio-maf 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/.gitignore +1 -0
  2. data/Gemfile +2 -1
  3. data/README.md +98 -29
  4. data/Rakefile +6 -2
  5. data/bin/maf_tile +59 -35
  6. data/bio-maf.gemspec +4 -3
  7. data/features/block-joining.feature +32 -0
  8. data/features/dir-access.feature +46 -0
  9. data/features/maf-indexing.feature +23 -0
  10. data/features/maf-to-fasta.feature +9 -0
  11. data/features/slice.feature +54 -0
  12. data/features/step_definitions/dir-access_steps.rb +15 -0
  13. data/features/step_definitions/file_steps.rb +7 -0
  14. data/features/step_definitions/gap_removal_steps.rb +4 -0
  15. data/features/step_definitions/index_steps.rb +3 -3
  16. data/features/step_definitions/output_steps.rb +9 -1
  17. data/features/step_definitions/parse_steps.rb +13 -2
  18. data/features/step_definitions/query_steps.rb +7 -6
  19. data/features/step_definitions/slice_steps.rb +15 -0
  20. data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
  21. data/features/support/aruba.rb +1 -0
  22. data/features/support/env.rb +3 -1
  23. data/features/{gap-filling.feature → tiling.feature} +85 -0
  24. data/lib/bio/maf/index.rb +223 -11
  25. data/lib/bio/maf/maf.rb +209 -0
  26. data/lib/bio/maf/parser.rb +190 -111
  27. data/lib/bio/maf/tiler.rb +33 -6
  28. data/man/maf_index.1 +1 -1
  29. data/man/maf_tile.1 +7 -7
  30. data/man/maf_tile.1.ronn +21 -13
  31. data/man/maf_to_fasta.1 +1 -1
  32. data/spec/bio/maf/index_spec.rb +99 -0
  33. data/spec/bio/maf/maf_spec.rb +184 -0
  34. data/spec/bio/maf/parser_spec.rb +75 -115
  35. data/spec/bio/maf/tiler_spec.rb +44 -0
  36. data/test/data/chr22_ieq2.maf +11 -0
  37. data/test/data/gap-1.kct +0 -0
  38. data/test/data/gap-1.maf +9 -0
  39. data/test/data/gap-filled1.fa +6 -0
  40. data/test/data/gap-sp1.fa.gz +0 -0
  41. data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
  42. data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
  43. data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
  44. data/test/data/mm8_chrM_tiny.kct +0 -0
  45. data/test/data/mm8_chrM_tiny.maf +1000 -0
  46. metadata +59 -7
data/lib/bio/maf/tiler.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'pathname'
1
2
  require 'zlib'
2
3
 
3
4
  module Bio::MAF
@@ -9,7 +10,7 @@ module Bio::MAF
9
10
 
10
11
  attr_accessor :index
11
12
  attr_accessor :parser
12
- attr_accessor :reference
13
+ attr_reader :reference
13
14
  # GenomicInterval
14
15
  attr_accessor :interval
15
16
  attr_accessor :species
@@ -19,6 +20,25 @@ module Bio::MAF
19
20
  @species_map = {}
20
21
  end
21
22
 
23
+ # Set the reference sequence.
24
+ #
25
+ # @param source [FASTARangeReader, #seek, String, Pathname] an existing reader, an open file, a FASTA string starting with '>', or a path
26
+ def reference=(source)
27
+ ref = case
28
+ when source.is_a?(FASTARangeReader)
29
+ source
30
+ when source.respond_to?(:seek)
31
+ # open file
32
+ FASTARangeReader.new(source)
33
+ when source.respond_to?(:start_with?) && source.start_with?('>')
34
+ # FASTA string
35
+ FASTARangeReader.new(StringIO.new(source))
36
+ else
37
+ FASTARangeReader.new(source.to_s)
38
+ end
39
+ @reference = ref
40
+ end
41
+
22
42
  def ref_data(range)
23
43
  if reference
24
44
  if reference.respond_to? :read_interval
@@ -33,8 +53,12 @@ module Bio::MAF
33
53
  end
34
54
  end
35
55
 
56
+ def species_to_use
57
+ species || species_map.keys
58
+ end
59
+
36
60
  def tile
37
- parser.sequence_filter[:only_species] = @species
61
+ parser.sequence_filter[:only_species] = species_to_use
38
62
  # TODO: remove gaps
39
63
  blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
40
64
  mask = Array.new(interval.length, :ref)
@@ -51,7 +75,7 @@ module Bio::MAF
51
75
  (slice_start - i_start)...(slice_end - i_start))
52
76
  end
53
77
  text = []
54
- species.each { |s| text << '' }
78
+ species_to_use.each { |s| text << '' }
55
79
  nonref_text = text[1...text.size]
56
80
  runs(mask) do |range, block|
57
81
  g_range = (range.begin + i_start)...(range.end + i_start)
@@ -69,7 +93,7 @@ module Bio::MAF
69
93
  else
70
94
  # covered by an alignment block
71
95
  t_range = block.ref_seq.text_range(g_range)
72
- species.each_with_index do |species, i|
96
+ species_to_use.each_with_index do |species, i|
73
97
  sp_text = text[i]
74
98
  seq = block.sequences.find { |s| s.source == species || s.species == species }
75
99
  if seq
@@ -86,7 +110,7 @@ module Bio::MAF
86
110
  end
87
111
 
88
112
  def write_fasta(f)
89
- species.zip(tile()) do |species, text|
113
+ species_to_use.zip(tile()) do |species, text|
90
114
  sp_out = species_map[species] || species
91
115
  f.puts ">#{sp_out}"
92
116
  f.puts text
@@ -147,10 +171,13 @@ module Bio::MAF
147
171
  line = line_raw.strip
148
172
  end_pos = pos + line.size
149
173
  if (! in_region) && pos <= z_start && z_start < end_pos
150
- data << line.slice((z_start - pos)...(line.size))
174
+ offset = z_start - pos
175
+ end_offset = [(offset + region_size), line.size].min
176
+ data << line.slice(offset...end_offset)
151
177
  in_region = true
152
178
  elsif in_region
153
179
  need = region_size - data.size
180
+ raise "should not happen: region #{region_size}, data #{data.size}, need #{need}" if need < 0
154
181
  if need > line.size
155
182
  data << line
156
183
  else
data/man/maf_index.1 CHANGED
@@ -1,7 +1,7 @@
1
1
  .\" generated with Ronn/v0.7.3
2
2
  .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
3
  .
4
- .TH "MAF_INDEX" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
4
+ .TH "MAF_INDEX" "1" "July 2012" "BioRuby" "BioRuby Manual"
5
5
  .
6
6
  .SH "NAME"
7
7
  \fBmaf_index\fR \- build and examine MAF indexes
data/man/maf_tile.1 CHANGED
@@ -1,22 +1,22 @@
1
1
  .\" generated with Ronn/v0.7.3
2
2
  .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
3
  .
4
- .TH "MAF_TILE" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
4
+ .TH "MAF_TILE" "1" "July 2012" "BioRuby" "BioRuby Manual"
5
5
  .
6
6
  .SH "NAME"
7
7
  \fBmaf_tile\fR \- synthesize an alignment for a given region
8
8
  .
9
9
  .SH "SYNOPSIS"
10
- \fBmaf_tile\fR [\fIoptions\fR] \-i BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
10
+ \fBmaf_tile\fR [\fIoptions\fR] \-i [SEQ:]BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
11
11
  .
12
12
  .P
13
- \fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR \fIindex\fR
13
+ \fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
14
14
  .
15
15
  .SH "DESCRIPTION"
16
- \fBmaf_tile\fR takes a MAF file with index (generated by maf_index(1)), extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
16
+ \fBmaf_tile\fR takes a MAF file (with an optional index) or a directory of indexed MAF files, extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
17
17
  .
18
18
  .P
19
- If a single interval is specified, the output will be written to stdout in FASTA format\. If the \fB\-\-output\-base\fR option is specified, \fB_<start>:<end>\.fa\fR will be appended to the given parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
19
+ If a single interval is specified, the output will be written to stdout in FASTA format\. If a directory of MAF files is supplied as the \fImaf\fR parameter, the interval must include the sequence identifier in the form \fBsequence:begin:end\fR\. If the \fB\-\-output\-base\fR option is specified, \fB_<begin>:<end>\.fa\fR will be appended to the given \fIbase\fR parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
20
20
  .
21
21
  .P
22
22
  Species can be renamed for output by specifying them as SPECIES:NAME; the first component will be used to select the species from the MAF file, and the second will be used in the FASTA description line for output\.
@@ -84,10 +84,10 @@ $ maf_tile \-\-bed /tmp/mm8\.bed \-\-output\-base /tmp/mm8 \e
84
84
  The output is generated in FASTA format, with one sequence per species\.
85
85
  .
86
86
  .P
87
- The input \fImaf\fR file must be a Multiple Alignment Format file\.
87
+ The \fImaf\fR parameter must specify either a Multiple Alignment Format (MAF) file or a directory of indexed MAF files\.
88
88
  .
89
89
  .P
90
- The \fIindex\fR must be a MAF index built with maf_index(1)\.
90
+ The \fIindex\fR must be a MAF index built with maf_index(1)\. This parameter is ignored if the \fImaf\fR parameter is a directory\. It can be omitted if a single MAF file is given, but in this case the entire file will be parsed to build a temporary index\. For large files which will be reused, this is not advisable\.
91
91
  .
92
92
  .P
93
93
  If \fB\-\-bed\fR \fIbed\fR is specified, its argument must be a BED file\. Only the second and third columns will be used, to specify the zero\-based start and end positions of intervals\.
data/man/maf_tile.1.ronn CHANGED
@@ -3,23 +3,26 @@ maf_tile(1) -- synthesize an alignment for a given region
3
3
 
4
4
  ## SYNOPSIS
5
5
 
6
- `maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf> <index>
6
+ `maf_tile` [<options>] -i [SEQ:]BEGIN:END [-s SPECIES[:NAME] ...] <maf> [index]
7
7
 
8
- `maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> <index>
8
+ `maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> [index]
9
9
 
10
10
  ## DESCRIPTION
11
11
 
12
- **maf_tile** takes a MAF file with index (generated by maf_index(1)),
13
- extracts alignment blocks overlapping the given genomic interval, and
14
- constructs a single alignment block covering the entire interval for
15
- the specified species. Optionally, any gaps in coverage of the MAF
16
- file's reference sequence can be filled in from a FASTA sequence file.
12
+ **maf_tile** takes a MAF file (with an optional index) or a directory
13
+ of indexed MAF files, extracts alignment blocks overlapping the given
14
+ genomic interval, and constructs a single alignment block covering the
15
+ entire interval for the specified species. Optionally, any gaps in
16
+ coverage of the MAF file's reference sequence can be filled in from a
17
+ FASTA sequence file.
17
18
 
18
19
  If a single interval is specified, the output will be written to
19
- stdout in FASTA format. If the `--output-base` option is specified,
20
- `_<start>:<end>.fa` will be appended to the given <base> parameter and
21
- used to construct the output path. If a BED file is specified with
22
- `--bed`, `--output-base` is also required.
20
+ stdout in FASTA format. If a directory of MAF files is supplied as the
21
+ <maf> parameter, the interval must include the sequence identifier in
22
+ the form `sequence:begin:end`. If the `--output-base` option is
23
+ specified, `_<begin>:<end>.fa` will be appended to the given <base>
24
+ parameter and used to construct the output path. If a BED file is
25
+ specified with `--bed`, `--output-base` is also required.
23
26
 
24
27
  Species can be renamed for output by specifying them as SPECIES:NAME;
25
28
  the first component will be used to select the species from the MAF
@@ -80,9 +83,14 @@ sequence:
80
83
  The output is generated in FASTA format, with one sequence per
81
84
  species.
82
85
 
83
- The input <maf> file must be a Multiple Alignment Format file.
86
+ The <maf> parameter must specify either a Multiple Alignment Format
87
+ (MAF) file or a directory of indexed MAF files.
84
88
 
85
- The <index> must be a MAF index built with maf_index(1).
89
+ The <index> must be a MAF index built with maf_index(1). This
90
+ parameter is ignored if the <maf> parameter is a directory. It can be
91
+ omitted if a single MAF file is given, but in this case the entire
92
+ file will be parsed to build a temporary index. For large files which
93
+ will be reused, this is not advisable.
86
94
 
87
95
  If `--bed` <bed> is specified, its argument must be a BED file. Only
88
96
  the second and third columns will be used, to specify the zero-based
data/man/maf_to_fasta.1 CHANGED
@@ -1,7 +1,7 @@
1
1
  .\" generated with Ronn/v0.7.3
2
2
  .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
3
  .
4
- .TH "MAF_TO_FASTA" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
4
+ .TH "MAF_TO_FASTA" "1" "July 2012" "BioRuby" "BioRuby Manual"
5
5
  .
6
6
  .SH "NAME"
7
7
  \fBmaf_to_fasta\fR \- convert MAF file to FASTA
@@ -3,6 +3,73 @@ require 'spec_helper'
3
3
  module Bio
4
4
  module MAF
5
5
 
6
+ describe Access do
7
+ describe "#tile" do
8
+ it "gives correct output with a Pathname" do
9
+ access = Access.maf_dir(TestData)
10
+ interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
11
+ buf = StringIO.new
12
+ access.tile(interval) do |tiler|
13
+ tiler.reference = TestData + 'gap-sp1.fa'
14
+ tiler.species = %w(sp1 sp2 sp3)
15
+ tiler.write_fasta(buf)
16
+ end
17
+ buf.string.should == File.read(TestData + 'gap-filled1.fa')
18
+ end
19
+ it "gives correct output with only a species map" do
20
+ access = Access.maf_dir(TestData)
21
+ interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
22
+ buf = StringIO.new
23
+ access.tile(interval) do |tiler|
24
+ tiler.reference = TestData + 'gap-sp1.fa'
25
+ tiler.species_map = {
26
+ 'sp1' => 'sp1',
27
+ 'sp2' => 'sp2',
28
+ 'sp3' => 'sp3'
29
+ }
30
+ tiler.write_fasta(buf)
31
+ end
32
+ buf.string.should == File.read(TestData + 'gap-filled1.fa')
33
+ end
34
+ it "gives correct output with no species specified" do
35
+ pending("issue 88") do
36
+ access = Access.maf_dir(TestData)
37
+ interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
38
+ buf = StringIO.new
39
+ access.tile(interval) do |tiler|
40
+ tiler.reference = TestData + 'gap-sp1.fa'
41
+ tiler.write_fasta(buf)
42
+ end
43
+ buf.string.should == File.read(TestData + 'gap-filled1.fa')
44
+ end
45
+ end
46
+ end
47
+ describe ".file" do
48
+ it "accepts a MAF file and index" do
49
+ access = Access.file(TestData + 'gap-1.maf',
50
+ TestData + 'gap-1.kct')
51
+ blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
52
+ 10,
53
+ 23)]).to_a
54
+ blocks.size.should == 1
55
+ end
56
+ it "accepts a MAF file and finds the index" do
57
+ access = Access.file(TestData + 'gap-1.maf')
58
+ blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
59
+ 10,
60
+ 23)]).to_a
61
+ blocks.size.should == 1
62
+ end
63
+ it "accepts a MAF file and builds a temp index" do
64
+ access = Access.file(TestData + 'chrY-1block.maf')
65
+ blocks = access.find([GenomicInterval.zero_based('hg19.chrY',
66
+ 10501,
67
+ 10544)]).to_a
68
+ blocks.size.should == 1
69
+ end
70
+ end
71
+ end
72
+
6
73
  describe KyotoIndex do
7
74
  def has_at_least_n_with_prefix(n, start)
8
75
  @idx.db.cursor_process do |cur|
@@ -87,6 +154,38 @@ module Bio
87
154
  l[0].offset.should == 16
88
155
  end
89
156
 
157
+ it "takes a block arg" do
158
+ called = false
159
+ @idx.find([GenomicInterval.zero_based('mm8.chr7',
160
+ 80082334,
161
+ 80082338)],
162
+ @p) do |block|
163
+ block.offset.should == 16
164
+ called = true
165
+ end
166
+ called.should be_true
167
+ end
168
+
169
+ it "with a block and no match, returns" do
170
+ called = false
171
+ @idx.find([GenomicInterval.zero_based('mm8.chr7',
172
+ 20082334,
173
+ 20082338)],
174
+ @p) do |block|
175
+ called = true
176
+ end
177
+ called.should be_false
178
+ end
179
+
180
+ it "with no block and no match, returns an empty list" do
181
+ v = @idx.find([GenomicInterval.zero_based('mm8.chr7',
182
+ 20082334,
183
+ 20082338)],
184
+ @p)
185
+ v.should_not be_nil
186
+ v.should respond_to(:count)
187
+ end
188
+
90
189
  after(:each) do
91
190
  @idx.db.close
92
191
  @p.f.close
@@ -0,0 +1,184 @@
1
+ require 'spec_helper'
2
+
3
+ module Bio
4
+ module MAF
5
+
6
+ describe Header do
7
+ before(:each) do
8
+ @p = Parser.new(TestData + 't1.maf')
9
+ end
10
+
11
+ it "provides version information" do
12
+ @p.header.version.should == '1'
13
+ end
14
+ it "provides the scoring scheme" do
15
+ @p.header.scoring.should == 'humor.v4'
16
+ end
17
+ it "provides alignment parameters" do
18
+ @p.header.alignment_params.should =~ /humor.v4 R=30/
19
+ end
20
+
21
+ it "presents multiline parameters correctly" do
22
+ @p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
23
+ end
24
+
25
+ it "provides arbitrary parameters"
26
+ end
27
+
28
+ describe Block do
29
+ describe "#find_gaps" do
30
+ it "finds a single 14-base gap" do
31
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
32
+ p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
33
+ block = p.parse_block
34
+ gaps = block.find_gaps
35
+ gaps.size.should == 1
36
+ gaps[0][0].should == 34
37
+ gaps[0][1].should == 14
38
+ end
39
+ end
40
+ describe "#remove_gaps!" do
41
+ it "removes a single 14-base gap" do
42
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
43
+ p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
44
+ block = p.parse_block
45
+ block.sequences.size.should == 5
46
+ block.text_size.should == 54
47
+ block.remove_gaps!
48
+ block.text_size.should == 40
49
+ end
50
+ end
51
+ describe "#joinable_with?" do
52
+ it "is false for blocks with different sequences" do
53
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
54
+ sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 dasNov1 loxAfr1 echTel1)
55
+ p.sequence_filter = { :only_species => sp }
56
+ b1 = p.parse_block
57
+ b2 = p.parse_block
58
+ b1.joinable_with?(b2).should be_false
59
+ end
60
+ it "is true for blocks with same sequences" do
61
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
62
+ sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 loxAfr1 echTel1)
63
+ p.sequence_filter = { :only_species => sp }
64
+ b1 = p.parse_block
65
+ b2 = p.parse_block
66
+ b1.joinable_with?(b2).should be_true
67
+ end
68
+ end
69
+ describe "#to_bio_alignment" do
70
+ it "returns a usable Bio::BioAlignment::Alignment" do
71
+ p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
72
+ b = p.parse_block
73
+ ba = b.to_bio_alignment
74
+ ba.size.should == 10
75
+ ba.sequences[0].id.should == "mm8.chr7"
76
+ ba.sequences[0].seq.should =~ /^GGGCTGAGGGC--/
77
+ end
78
+ end
79
+ end
80
+
81
+ describe Sequence do
82
+ before(:each) do
83
+ @parser = DummyParser.new
84
+ end
85
+
86
+ describe "#gapped?" do
87
+ it "is false for sequences with no gaps" do
88
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
89
+ s = @parser.parse_seq_line(line, nil)
90
+ s.gapped?.should be_false
91
+ end
92
+ it "is true for sequences with gaps" do
93
+ line = "s human_unc 9077 8 + 10998 AC-AGTATT"
94
+ s = @parser.parse_seq_line(line, nil)
95
+ s.gapped?.should be_true
96
+ end
97
+ end
98
+
99
+ describe "#text_range" do
100
+ it "returns 0...text.size for a spanning interval" do
101
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
102
+ s = @parser.parse_seq_line(line, nil)
103
+ range = s.text_range(9077...(9077 + 8))
104
+ range.should == (0...(s.text.size))
105
+ end
106
+ it "returns 0...text.size for a gapped spanning interval" do
107
+ line = "s human_unc 9077 8 + 10998 AC--AGTATT"
108
+ s = @parser.parse_seq_line(line, nil)
109
+ range = s.text_range(9077...(9077 + 8))
110
+ range.should == (0...(s.text.size))
111
+ end
112
+ it "handles a leading subset" do
113
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
114
+ s = @parser.parse_seq_line(line, nil)
115
+ range = s.text_range(9077...(9077 + 2))
116
+ range.should == (0...2)
117
+ end
118
+ it "handles a trailing subset" do
119
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
120
+ s = @parser.parse_seq_line(line, nil)
121
+ range = s.text_range(9079...9085)
122
+ range.should == (2...8)
123
+ end
124
+ it "handles a gap in the middle" do
125
+ line = "s human_unc 9077 8 + 10998 AC--AGTATT"
126
+ s = @parser.parse_seq_line(line, nil)
127
+ range = s.text_range(9078...(9077 + 8))
128
+ range.should == (1...(s.text.size))
129
+ end
130
+ it "errors on a range starting before" do
131
+ expect {
132
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
133
+ s = @parser.parse_seq_line(line, nil)
134
+ range = s.text_range(9076...(9077 + 8))
135
+ }.to raise_error
136
+ end
137
+ it "errors on a range ending after" do
138
+ expect {
139
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
140
+ s = @parser.parse_seq_line(line, nil)
141
+ range = s.text_range(9076...(9077 + 9))
142
+ }.to raise_error
143
+ end
144
+ end
145
+
146
+ describe "synteny data" do
147
+ it "extracts basic data from i lines" do
148
+ p = Parser.new(TestData + 'chr22_ieq2.maf',
149
+ :parse_extended => true)
150
+ b = p.parse_block
151
+ b.sequences[0].left_status_char.should be_nil
152
+ b.sequences[0].left_status.should be_nil
153
+ b.sequences[0].left_count.should be_nil
154
+ b.sequences[0].right_status_char.should be_nil
155
+ b.sequences[0].right_status.should be_nil
156
+ b.sequences[0].right_count.should be_nil
157
+ # works but let's not over-specify internal state
158
+ #b.sequences[1].i_data.should == %w(N 0 C 0)
159
+ b.sequences[1].left_status_char.should == 'N'
160
+ b.sequences[1].left_status.should == :first
161
+ b.sequences[1].right_status_char.should == 'C'
162
+ b.sequences[1].right_status.should == :contiguous
163
+ b.sequences[2].left_status.should == :contiguous
164
+ b.sequences[2].right_status_char.should == 'I'
165
+ b.sequences[2].right_status.should == :intervening
166
+ b.sequences[2].right_count.should == 146
167
+ end
168
+ end
169
+
170
+ describe "#to_bioalignment" do
171
+ it "returns a usable Bio::BioAlignment::Sequence" do
172
+ @parser = DummyParser.new
173
+ line = "s human_unc 9077 8 + 10998 ACAGTATT"
174
+ s = @parser.parse_seq_line(line, nil)
175
+ as = s.to_bio_alignment
176
+ as.id.should == "human_unc"
177
+ as.seq.should == "ACAGTATT"
178
+ end
179
+ end
180
+
181
+ end
182
+
183
+ end
184
+ end