bio-maf 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +2 -1
- data/README.md +98 -29
- data/Rakefile +6 -2
- data/bin/maf_tile +59 -35
- data/bio-maf.gemspec +4 -3
- data/features/block-joining.feature +32 -0
- data/features/dir-access.feature +46 -0
- data/features/maf-indexing.feature +23 -0
- data/features/maf-to-fasta.feature +9 -0
- data/features/slice.feature +54 -0
- data/features/step_definitions/dir-access_steps.rb +15 -0
- data/features/step_definitions/file_steps.rb +7 -0
- data/features/step_definitions/gap_removal_steps.rb +4 -0
- data/features/step_definitions/index_steps.rb +3 -3
- data/features/step_definitions/output_steps.rb +9 -1
- data/features/step_definitions/parse_steps.rb +13 -2
- data/features/step_definitions/query_steps.rb +7 -6
- data/features/step_definitions/slice_steps.rb +15 -0
- data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
- data/features/support/aruba.rb +1 -0
- data/features/support/env.rb +3 -1
- data/features/{gap-filling.feature → tiling.feature} +85 -0
- data/lib/bio/maf/index.rb +223 -11
- data/lib/bio/maf/maf.rb +209 -0
- data/lib/bio/maf/parser.rb +190 -111
- data/lib/bio/maf/tiler.rb +33 -6
- data/man/maf_index.1 +1 -1
- data/man/maf_tile.1 +7 -7
- data/man/maf_tile.1.ronn +21 -13
- data/man/maf_to_fasta.1 +1 -1
- data/spec/bio/maf/index_spec.rb +99 -0
- data/spec/bio/maf/maf_spec.rb +184 -0
- data/spec/bio/maf/parser_spec.rb +75 -115
- data/spec/bio/maf/tiler_spec.rb +44 -0
- data/test/data/chr22_ieq2.maf +11 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/gap-1.maf +9 -0
- data/test/data/gap-filled1.fa +6 -0
- data/test/data/gap-sp1.fa.gz +0 -0
- data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
- data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
- data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.maf +1000 -0
- metadata +59 -7
data/lib/bio/maf/tiler.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'pathname'
|
1
2
|
require 'zlib'
|
2
3
|
|
3
4
|
module Bio::MAF
|
@@ -9,7 +10,7 @@ module Bio::MAF
|
|
9
10
|
|
10
11
|
attr_accessor :index
|
11
12
|
attr_accessor :parser
|
12
|
-
|
13
|
+
attr_reader :reference
|
13
14
|
# GenomicInterval
|
14
15
|
attr_accessor :interval
|
15
16
|
attr_accessor :species
|
@@ -19,6 +20,25 @@ module Bio::MAF
|
|
19
20
|
@species_map = {}
|
20
21
|
end
|
21
22
|
|
23
|
+
# Set the reference sequence.
#
# The value is normalized to a FASTARangeReader (or nil) before being
# stored in @reference:
#
# * nil clears the reference; #ref_data only consults the reference
#   when it is set.
# * a FASTARangeReader is stored as-is.
# * an object responding to #seek (an already-open stream) is handed
#   to FASTARangeReader.new unchanged.
# * a String beginning with '>' is treated as literal FASTA text and
#   wrapped in a StringIO.
# * anything else is converted with #to_s and passed to
#   FASTARangeReader.new -- presumably a filesystem path (String or
#   Pathname); confirm against FASTARangeReader.
#
# @param source [FASTARangeReader, #seek, String, Pathname, nil]
def reference=(source)
  @reference =
    if source.nil?
      # Without this guard nil would become FASTARangeReader.new("").
      nil
    elsif source.is_a?(FASTARangeReader)
      source
    elsif source.respond_to?(:seek)
      # open file
      FASTARangeReader.new(source)
    elsif source.respond_to?(:start_with?) && source.start_with?('>')
      # FASTA string
      FASTARangeReader.new(StringIO.new(source))
    else
      FASTARangeReader.new(source.to_s)
    end
end
|
41
|
+
|
22
42
|
def ref_data(range)
|
23
43
|
if reference
|
24
44
|
if reference.respond_to? :read_interval
|
@@ -33,8 +53,12 @@ module Bio::MAF
|
|
33
53
|
end
|
34
54
|
end
|
35
55
|
|
56
|
+
# Species to tile, in output order.
#
# Returns the explicitly configured species list when it has entries;
# otherwise falls back to the keys of species_map. An empty list is
# treated like an unset one, since [] is truthy in Ruby and would
# otherwise suppress the fallback.
#
# @return [Array] species identifiers
def species_to_use
  chosen = species
  if chosen.nil? || chosen.empty?
    species_map.keys
  else
    chosen
  end
end
|
59
|
+
|
36
60
|
def tile
|
37
|
-
parser.sequence_filter[:only_species] =
|
61
|
+
parser.sequence_filter[:only_species] = species_to_use
|
38
62
|
# TODO: remove gaps
|
39
63
|
blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
|
40
64
|
mask = Array.new(interval.length, :ref)
|
@@ -51,7 +75,7 @@ module Bio::MAF
|
|
51
75
|
(slice_start - i_start)...(slice_end - i_start))
|
52
76
|
end
|
53
77
|
text = []
|
54
|
-
|
78
|
+
species_to_use.each { |s| text << '' }
|
55
79
|
nonref_text = text[1...text.size]
|
56
80
|
runs(mask) do |range, block|
|
57
81
|
g_range = (range.begin + i_start)...(range.end + i_start)
|
@@ -69,7 +93,7 @@ module Bio::MAF
|
|
69
93
|
else
|
70
94
|
# covered by an alignment block
|
71
95
|
t_range = block.ref_seq.text_range(g_range)
|
72
|
-
|
96
|
+
species_to_use.each_with_index do |species, i|
|
73
97
|
sp_text = text[i]
|
74
98
|
seq = block.sequences.find { |s| s.source == species || s.species == species }
|
75
99
|
if seq
|
@@ -86,7 +110,7 @@ module Bio::MAF
|
|
86
110
|
end
|
87
111
|
|
88
112
|
def write_fasta(f)
|
89
|
-
|
113
|
+
species_to_use.zip(tile()) do |species, text|
|
90
114
|
sp_out = species_map[species] || species
|
91
115
|
f.puts ">#{sp_out}"
|
92
116
|
f.puts text
|
@@ -147,10 +171,13 @@ module Bio::MAF
|
|
147
171
|
line = line_raw.strip
|
148
172
|
end_pos = pos + line.size
|
149
173
|
if (! in_region) && pos <= z_start && z_start < end_pos
|
150
|
-
|
174
|
+
offset = z_start - pos
|
175
|
+
end_offset = [(offset + region_size), line.size].min
|
176
|
+
data << line.slice(offset...end_offset)
|
151
177
|
in_region = true
|
152
178
|
elsif in_region
|
153
179
|
need = region_size - data.size
|
180
|
+
raise "should not happen: region #{region_size}, data #{data.size}, need #{need}" if need < 0
|
154
181
|
if need > line.size
|
155
182
|
data << line
|
156
183
|
else
|
data/man/maf_index.1
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
.\" generated with Ronn/v0.7.3
|
2
2
|
.\" http://github.com/rtomayko/ronn/tree/0.7.3
|
3
3
|
.
|
4
|
-
.TH "MAF_INDEX" "1" "
|
4
|
+
.TH "MAF_INDEX" "1" "July 2012" "BioRuby" "BioRuby Manual"
|
5
5
|
.
|
6
6
|
.SH "NAME"
|
7
7
|
\fBmaf_index\fR \- build and examine MAF indexes
|
data/man/maf_tile.1
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
.\" generated with Ronn/v0.7.3
|
2
2
|
.\" http://github.com/rtomayko/ronn/tree/0.7.3
|
3
3
|
.
|
4
|
-
.TH "MAF_TILE" "1" "
|
4
|
+
.TH "MAF_TILE" "1" "July 2012" "BioRuby" "BioRuby Manual"
|
5
5
|
.
|
6
6
|
.SH "NAME"
|
7
7
|
\fBmaf_tile\fR \- synthesize an alignment for a given region
|
8
8
|
.
|
9
9
|
.SH "SYNOPSIS"
|
10
|
-
\fBmaf_tile\fR [\fIoptions\fR] \-i BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR
|
10
|
+
\fBmaf_tile\fR [\fIoptions\fR] \-i [SEQ:]BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
|
11
11
|
.
|
12
12
|
.P
|
13
|
-
\fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR
|
13
|
+
\fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
|
14
14
|
.
|
15
15
|
.SH "DESCRIPTION"
|
16
|
-
\fBmaf_tile\fR takes a MAF file with index
|
16
|
+
\fBmaf_tile\fR takes a MAF file, with optional index, or directory of indexed MAF files, extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
|
17
17
|
.
|
18
18
|
.P
|
19
|
-
If a single interval is specified, the output will be written to stdout in FASTA format\. If the \fB\-\-output\-base\fR option is specified, \fB_<
|
19
|
+
If a single interval is specified, the output will be written to stdout in FASTA format\. If a directory of MAF files is supplied as the \fImaf\fR parameter, the interval must include the sequence identifier in the form \fBsequence:begin:end\fR\. If the \fB\-\-output\-base\fR option is specified, \fB_<begin>:<end>\.fa\fR will be appended to the given parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
|
20
20
|
.
|
21
21
|
.P
|
22
22
|
Species can be renamed for output by specifying them as SPECIES:NAME; the first component will be used to select the species from the MAF file, and the second will be used in the FASTA description line for output\.
|
@@ -84,10 +84,10 @@ $ maf_tile \-\-bed /tmp/mm8\.bed \-\-output\-base /tmp/mm8 \e
|
|
84
84
|
The output is generated in FASTA format, with one sequence per species\.
|
85
85
|
.
|
86
86
|
.P
|
87
|
-
The
|
87
|
+
The \fImaf\fR parameter must specify either a Multiple Alignment Format (MAF) file or a directory of such files, with indexes\.
|
88
88
|
.
|
89
89
|
.P
|
90
|
-
The \fIindex\fR must be a MAF index built with maf_index(1)\.
|
90
|
+
The \fIindex\fR must be a MAF index built with maf_index(1)\. This parameter is ignored if the \fImaf\fR parameter is a directory\. It can be omitted if a single MAF file is given, but in this case the entire file will be parsed to build a temporary index\. For large files which will be reused, this is not advisable\.
|
91
91
|
.
|
92
92
|
.P
|
93
93
|
If \fB\-\-bed\fR \fIbed\fR is specified, its argument must be a BED file\. Only the second and third columns will be used, to specify the zero\-based start and end positions of intervals\.
|
data/man/maf_tile.1.ronn
CHANGED
@@ -3,23 +3,26 @@ maf_tile(1) -- synthesize an alignment for a given region
|
|
3
3
|
|
4
4
|
## SYNOPSIS
|
5
5
|
|
6
|
-
`maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf>
|
6
|
+
`maf_tile` [<options>] -i [SEQ:]BEGIN:END [-s SPECIES[:NAME] ...] <maf> [index]
|
7
7
|
|
8
|
-
`maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf>
|
8
|
+
`maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> [index]
|
9
9
|
|
10
10
|
## DESCRIPTION
|
11
11
|
|
12
|
-
**maf_tile** takes a MAF file with index
|
13
|
-
extracts alignment blocks overlapping the given
|
14
|
-
constructs a single alignment block covering the
|
15
|
-
the specified species. Optionally, any gaps in
|
16
|
-
file's reference sequence can be filled in from a
|
12
|
+
**maf_tile** takes a MAF file, with optional index, or directory of
|
13
|
+
indexed MAF files, extracts alignment blocks overlapping the given
|
14
|
+
genomic interval, and constructs a single alignment block covering the
|
15
|
+
entire interval for the specified species. Optionally, any gaps in
|
16
|
+
coverage of the MAF file's reference sequence can be filled in from a
|
17
|
+
FASTA sequence file.
|
17
18
|
|
18
19
|
If a single interval is specified, the output will be written to
|
19
|
-
stdout in FASTA format. If
|
20
|
-
|
21
|
-
|
22
|
-
|
20
|
+
stdout in FASTA format. If a directory of MAF files is supplied as the
|
21
|
+
<maf> parameter, the interval must include the sequence identifier in
|
22
|
+
the form `sequence:begin:end`. If the `--output-base` option is
|
23
|
+
specified, `_<begin>:<end>.fa` will be appended to the given <base>
|
24
|
+
parameter and used to construct the output path. If a BED file is
|
25
|
+
specified with `--bed`, `--output-base` is also required.
|
23
26
|
|
24
27
|
Species can be renamed for output by specifying them as SPECIES:NAME;
|
25
28
|
the first component will be used to select the species from the MAF
|
@@ -80,9 +83,14 @@ sequence:
|
|
80
83
|
The output is generated in FASTA format, with one sequence per
|
81
84
|
species.
|
82
85
|
|
83
|
-
The
|
86
|
+
The <maf> parameter must specify either a Multiple Alignment Format
|
87
|
+
(MAF) file or a directory of such files, with indexes.
|
84
88
|
|
85
|
-
The <index> must be a MAF index built with maf_index(1).
|
89
|
+
The <index> must be a MAF index built with maf_index(1). This
|
90
|
+
parameter is ignored if the <maf> parameter is a directory. It can be
|
91
|
+
omitted if a single MAF file is given, but in this case the entire
|
92
|
+
file will be parsed to build a temporary index. For large files which
|
93
|
+
will be reused, this is not advisable.
|
86
94
|
|
87
95
|
If `--bed` <bed> is specified, its argument must be a BED file. Only
|
88
96
|
the second and third columns will be used, to specify the zero-based
|
data/man/maf_to_fasta.1
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
.\" generated with Ronn/v0.7.3
|
2
2
|
.\" http://github.com/rtomayko/ronn/tree/0.7.3
|
3
3
|
.
|
4
|
-
.TH "MAF_TO_FASTA" "1" "
|
4
|
+
.TH "MAF_TO_FASTA" "1" "July 2012" "BioRuby" "BioRuby Manual"
|
5
5
|
.
|
6
6
|
.SH "NAME"
|
7
7
|
\fBmaf_to_fasta\fR \- convert MAF file to FASTA
|
data/spec/bio/maf/index_spec.rb
CHANGED
@@ -3,6 +3,73 @@ require 'spec_helper'
|
|
3
3
|
module Bio
|
4
4
|
module MAF
|
5
5
|
|
6
|
+
describe Access do
|
7
|
+
describe "#tile" do
|
8
|
+
it "gives correct output with a Pathname" do
|
9
|
+
access = Access.maf_dir(TestData)
|
10
|
+
interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
|
11
|
+
buf = StringIO.new
|
12
|
+
access.tile(interval) do |tiler|
|
13
|
+
tiler.reference = TestData + 'gap-sp1.fa'
|
14
|
+
tiler.species = %w(sp1 sp2 sp3)
|
15
|
+
tiler.write_fasta(buf)
|
16
|
+
end
|
17
|
+
buf.string.should == File.read(TestData + 'gap-filled1.fa')
|
18
|
+
end
|
19
|
+
it "gives correct output with only a species map" do
|
20
|
+
access = Access.maf_dir(TestData)
|
21
|
+
interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
|
22
|
+
buf = StringIO.new
|
23
|
+
access.tile(interval) do |tiler|
|
24
|
+
tiler.reference = TestData + 'gap-sp1.fa'
|
25
|
+
tiler.species_map = {
|
26
|
+
'sp1' => 'sp1',
|
27
|
+
'sp2' => 'sp2',
|
28
|
+
'sp3' => 'sp3'
|
29
|
+
}
|
30
|
+
tiler.write_fasta(buf)
|
31
|
+
end
|
32
|
+
buf.string.should == File.read(TestData + 'gap-filled1.fa')
|
33
|
+
end
|
34
|
+
it "gives correct output with no species specified" do
|
35
|
+
pending("issue 88") do
|
36
|
+
access = Access.maf_dir(TestData)
|
37
|
+
interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
|
38
|
+
buf = StringIO.new
|
39
|
+
access.tile(interval) do |tiler|
|
40
|
+
tiler.reference = TestData + 'gap-sp1.fa'
|
41
|
+
tiler.write_fasta(buf)
|
42
|
+
end
|
43
|
+
buf.string.should == File.read(TestData + 'gap-filled1.fa')
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
describe ".file" do
|
48
|
+
it "accepts a MAF file and index" do
|
49
|
+
access = Access.file(TestData + 'gap-1.maf',
|
50
|
+
TestData + 'gap-1.kct')
|
51
|
+
blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
|
52
|
+
10,
|
53
|
+
23)]).to_a
|
54
|
+
blocks.size.should == 1
|
55
|
+
end
|
56
|
+
it "accepts a MAF file and finds the index" do
|
57
|
+
access = Access.file(TestData + 'gap-1.maf')
|
58
|
+
blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
|
59
|
+
10,
|
60
|
+
23)]).to_a
|
61
|
+
blocks.size.should == 1
|
62
|
+
end
|
63
|
+
it "accepts a MAF file and builds a temp index" do
|
64
|
+
access = Access.file(TestData + 'chrY-1block.maf')
|
65
|
+
blocks = access.find([GenomicInterval.zero_based('hg19.chrY',
|
66
|
+
10501,
|
67
|
+
10544)]).to_a
|
68
|
+
blocks.size.should == 1
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
6
73
|
describe KyotoIndex do
|
7
74
|
def has_at_least_n_with_prefix(n, start)
|
8
75
|
@idx.db.cursor_process do |cur|
|
@@ -87,6 +154,38 @@ module Bio
|
|
87
154
|
l[0].offset.should == 16
|
88
155
|
end
|
89
156
|
|
157
|
+
it "takes a block arg" do
|
158
|
+
called = false
|
159
|
+
@idx.find([GenomicInterval.zero_based('mm8.chr7',
|
160
|
+
80082334,
|
161
|
+
80082338)],
|
162
|
+
@p) do |block|
|
163
|
+
block.offset.should == 16
|
164
|
+
called = true
|
165
|
+
end
|
166
|
+
called.should be_true
|
167
|
+
end
|
168
|
+
|
169
|
+
it "with a block and no match, returns" do
|
170
|
+
called = false
|
171
|
+
@idx.find([GenomicInterval.zero_based('mm8.chr7',
|
172
|
+
20082334,
|
173
|
+
20082338)],
|
174
|
+
@p) do |block|
|
175
|
+
called = true
|
176
|
+
end
|
177
|
+
called.should be_false
|
178
|
+
end
|
179
|
+
|
180
|
+
it "with no block and no match, returns an empty list" do
|
181
|
+
v = @idx.find([GenomicInterval.zero_based('mm8.chr7',
|
182
|
+
20082334,
|
183
|
+
20082338)],
|
184
|
+
@p)
|
185
|
+
v.should_not be_nil
|
186
|
+
v.should respond_to(:count)
|
187
|
+
end
|
188
|
+
|
90
189
|
after(:each) do
|
91
190
|
@idx.db.close
|
92
191
|
@p.f.close
|
@@ -0,0 +1,184 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module MAF
|
5
|
+
|
6
|
+
describe Header do
|
7
|
+
before(:each) do
|
8
|
+
@p = Parser.new(TestData + 't1.maf')
|
9
|
+
end
|
10
|
+
|
11
|
+
it "provides version information" do
|
12
|
+
@p.header.version.should == '1'
|
13
|
+
end
|
14
|
+
it "provides the scoring scheme" do
|
15
|
+
@p.header.scoring.should == 'humor.v4'
|
16
|
+
end
|
17
|
+
it "provides alignment parameters" do
|
18
|
+
@p.header.alignment_params.should =~ /humor.v4 R=30/
|
19
|
+
end
|
20
|
+
|
21
|
+
it "presents multiline parameters correctly" do
|
22
|
+
@p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
|
23
|
+
end
|
24
|
+
|
25
|
+
it "provides arbitrary parameters"
|
26
|
+
end
|
27
|
+
|
28
|
+
describe Block do
|
29
|
+
describe "#find_gaps" do
|
30
|
+
it "finds a single 14-base gap" do
|
31
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
32
|
+
p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
|
33
|
+
block = p.parse_block
|
34
|
+
gaps = block.find_gaps
|
35
|
+
gaps.size.should == 1
|
36
|
+
gaps[0][0].should == 34
|
37
|
+
gaps[0][1].should == 14
|
38
|
+
end
|
39
|
+
end
|
40
|
+
describe "#remove_gaps!" do
|
41
|
+
it "removes a single 14-base gap" do
|
42
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
43
|
+
p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
|
44
|
+
block = p.parse_block
|
45
|
+
block.sequences.size.should == 5
|
46
|
+
block.text_size.should == 54
|
47
|
+
block.remove_gaps!
|
48
|
+
block.text_size.should == 40
|
49
|
+
end
|
50
|
+
end
|
51
|
+
describe "#joinable_with?" do
|
52
|
+
it "is false for blocks with different sequences" do
|
53
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
54
|
+
sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 dasNov1 loxAfr1 echTel1)
|
55
|
+
p.sequence_filter = { :only_species => sp }
|
56
|
+
b1 = p.parse_block
|
57
|
+
b2 = p.parse_block
|
58
|
+
b1.joinable_with?(b2).should be_false
|
59
|
+
end
|
60
|
+
it "is true for blocks with same sequences" do
|
61
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
62
|
+
sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 loxAfr1 echTel1)
|
63
|
+
p.sequence_filter = { :only_species => sp }
|
64
|
+
b1 = p.parse_block
|
65
|
+
b2 = p.parse_block
|
66
|
+
b1.joinable_with?(b2).should be_true
|
67
|
+
end
|
68
|
+
end
|
69
|
+
describe "#to_bio_alignment" do
|
70
|
+
it "returns a usable Bio::BioAlignment::Alignment" do
|
71
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
72
|
+
b = p.parse_block
|
73
|
+
ba = b.to_bio_alignment
|
74
|
+
ba.size.should == 10
|
75
|
+
ba.sequences[0].id.should == "mm8.chr7"
|
76
|
+
ba.sequences[0].seq.should =~ /^GGGCTGAGGGC--/
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
describe Sequence do
|
82
|
+
before(:each) do
|
83
|
+
@parser = DummyParser.new
|
84
|
+
end
|
85
|
+
|
86
|
+
describe "#gapped?" do
|
87
|
+
it "is false for sequences with no gaps" do
|
88
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
89
|
+
s = @parser.parse_seq_line(line, nil)
|
90
|
+
s.gapped?.should be_false
|
91
|
+
end
|
92
|
+
it "is true for sequences with gaps" do
|
93
|
+
line = "s human_unc 9077 8 + 10998 AC-AGTATT"
|
94
|
+
s = @parser.parse_seq_line(line, nil)
|
95
|
+
s.gapped?.should be_true
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
describe "#text_range" do
|
100
|
+
it "returns 0...text.size for a spanning interval" do
|
101
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
102
|
+
s = @parser.parse_seq_line(line, nil)
|
103
|
+
range = s.text_range(9077...(9077 + 8))
|
104
|
+
range.should == (0...(s.text.size))
|
105
|
+
end
|
106
|
+
it "returns 0...text.size for a gapped spanning interval" do
|
107
|
+
line = "s human_unc 9077 8 + 10998 AC--AGTATT"
|
108
|
+
s = @parser.parse_seq_line(line, nil)
|
109
|
+
range = s.text_range(9077...(9077 + 8))
|
110
|
+
range.should == (0...(s.text.size))
|
111
|
+
end
|
112
|
+
it "handles a leading subset" do
|
113
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
114
|
+
s = @parser.parse_seq_line(line, nil)
|
115
|
+
range = s.text_range(9077...(9077 + 2))
|
116
|
+
range.should == (0...2)
|
117
|
+
end
|
118
|
+
it "handles a trailing subset" do
|
119
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
120
|
+
s = @parser.parse_seq_line(line, nil)
|
121
|
+
range = s.text_range(9079...9085)
|
122
|
+
range.should == (2...8)
|
123
|
+
end
|
124
|
+
it "handles a gap in the middle" do
|
125
|
+
line = "s human_unc 9077 8 + 10998 AC--AGTATT"
|
126
|
+
s = @parser.parse_seq_line(line, nil)
|
127
|
+
range = s.text_range(9078...(9077 + 8))
|
128
|
+
range.should == (1...(s.text.size))
|
129
|
+
end
|
130
|
+
it "errors on a range starting before" do
|
131
|
+
expect {
|
132
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
133
|
+
s = @parser.parse_seq_line(line, nil)
|
134
|
+
range = s.text_range(9076...(9077 + 8))
|
135
|
+
}.to raise_error
|
136
|
+
end
|
137
|
+
# The parsed sequence starts at 9077 with size 8, i.e. it covers
# 9077...9085. The requested range starts exactly at the sequence
# start and runs one position past its end, so only the end overrun
# can trigger the error (a start of 9076 would also be "before").
it "errors on a range ending after" do
  expect {
    line = "s human_unc 9077 8 + 10998 ACAGTATT"
    s = @parser.parse_seq_line(line, nil)
    s.text_range(9077...(9077 + 9))
  }.to raise_error
end
|
144
|
+
end
|
145
|
+
|
146
|
+
describe "synteny data" do
|
147
|
+
it "extracts basic data from i lines" do
|
148
|
+
p = Parser.new(TestData + 'chr22_ieq2.maf',
|
149
|
+
:parse_extended => true)
|
150
|
+
b = p.parse_block
|
151
|
+
b.sequences[0].left_status_char.should be_nil
|
152
|
+
b.sequences[0].left_status.should be_nil
|
153
|
+
b.sequences[0].left_count.should be_nil
|
154
|
+
b.sequences[0].right_status_char.should be_nil
|
155
|
+
b.sequences[0].right_status.should be_nil
|
156
|
+
b.sequences[0].right_count.should be_nil
|
157
|
+
# works but let's not over-specify internal state
|
158
|
+
#b.sequences[1].i_data.should == %w(N 0 C 0)
|
159
|
+
b.sequences[1].left_status_char.should == 'N'
|
160
|
+
b.sequences[1].left_status.should == :first
|
161
|
+
b.sequences[1].right_status_char.should == 'C'
|
162
|
+
b.sequences[1].right_status.should == :contiguous
|
163
|
+
b.sequences[2].left_status.should == :contiguous
|
164
|
+
b.sequences[2].right_status_char.should == 'I'
|
165
|
+
b.sequences[2].right_status.should == :intervening
|
166
|
+
b.sequences[2].right_count.should == 146
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
# Exercises Sequence#to_bio_alignment on a single parsed "s" line.
# Uses the @parser created by the enclosing before(:each) hook.
describe "#to_bio_alignment" do
  it "returns a usable Bio::BioAlignment::Sequence" do
    line = "s human_unc 9077 8 + 10998 ACAGTATT"
    s = @parser.parse_seq_line(line, nil)
    as = s.to_bio_alignment
    as.id.should == "human_unc"
    as.seq.should == "ACAGTATT"
  end
end
|
180
|
+
|
181
|
+
end
|
182
|
+
|
183
|
+
end
|
184
|
+
end
|