bio-maf 0.2.0-java → 0.3.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +3 -1
- data/README.md +98 -29
- data/Rakefile +6 -2
- data/bin/maf_tile +59 -35
- data/bio-maf.gemspec +4 -3
- data/features/block-joining.feature +32 -0
- data/features/dir-access.feature +46 -0
- data/features/maf-indexing.feature +23 -0
- data/features/maf-to-fasta.feature +9 -0
- data/features/slice.feature +54 -0
- data/features/step_definitions/dir-access_steps.rb +15 -0
- data/features/step_definitions/file_steps.rb +7 -0
- data/features/step_definitions/gap_removal_steps.rb +4 -0
- data/features/step_definitions/index_steps.rb +3 -3
- data/features/step_definitions/output_steps.rb +9 -1
- data/features/step_definitions/parse_steps.rb +13 -2
- data/features/step_definitions/query_steps.rb +7 -6
- data/features/step_definitions/slice_steps.rb +15 -0
- data/features/step_definitions/{gap-filling_steps.rb → tiling_steps.rb} +0 -0
- data/features/support/aruba.rb +1 -0
- data/features/support/env.rb +3 -1
- data/features/{gap-filling.feature → tiling.feature} +85 -0
- data/lib/bio/maf/index.rb +223 -11
- data/lib/bio/maf/maf.rb +209 -0
- data/lib/bio/maf/parser.rb +190 -111
- data/lib/bio/maf/tiler.rb +33 -6
- data/man/maf_index.1 +1 -1
- data/man/maf_tile.1 +7 -7
- data/man/maf_tile.1.ronn +21 -13
- data/man/maf_to_fasta.1 +1 -1
- data/spec/bio/maf/index_spec.rb +99 -0
- data/spec/bio/maf/maf_spec.rb +184 -0
- data/spec/bio/maf/parser_spec.rb +75 -115
- data/spec/bio/maf/tiler_spec.rb +44 -0
- data/test/data/chr22_ieq2.maf +11 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/gap-1.maf +9 -0
- data/test/data/gap-filled1.fa +6 -0
- data/test/data/gap-sp1.fa.gz +0 -0
- data/test/data/mm8_chr7_tiny_slice1.maf +9 -0
- data/test/data/mm8_chr7_tiny_slice2.maf +10 -0
- data/test/data/mm8_chr7_tiny_slice3.maf +10 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.maf +1000 -0
- metadata +65 -16
data/lib/bio/maf/tiler.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'pathname'
|
1
2
|
require 'zlib'
|
2
3
|
|
3
4
|
module Bio::MAF
|
@@ -9,7 +10,7 @@ module Bio::MAF
|
|
9
10
|
|
10
11
|
attr_accessor :index
|
11
12
|
attr_accessor :parser
|
12
|
-
|
13
|
+
attr_reader :reference
|
13
14
|
# GenomicInterval
|
14
15
|
attr_accessor :interval
|
15
16
|
attr_accessor :species
|
@@ -19,6 +20,25 @@ module Bio::MAF
|
|
19
20
|
@species_map = {}
|
20
21
|
end
|
21
22
|
|
23
|
+
# Set the reference sequence.
|
24
|
+
#
|
25
|
+
# @param source [FASTARangeReader, String, Pathname]
|
26
|
+
def reference=(source)
|
27
|
+
ref = case
|
28
|
+
when source.is_a?(FASTARangeReader)
|
29
|
+
source
|
30
|
+
when source.respond_to?(:seek)
|
31
|
+
# open file
|
32
|
+
FASTARangeReader.new(source)
|
33
|
+
when source.respond_to?(:start_with?) && source.start_with?('>')
|
34
|
+
# FASTA string
|
35
|
+
FASTARangeReader.new(StringIO.new(source))
|
36
|
+
else
|
37
|
+
FASTARangeReader.new(source.to_s)
|
38
|
+
end
|
39
|
+
@reference = ref
|
40
|
+
end
|
41
|
+
|
22
42
|
def ref_data(range)
|
23
43
|
if reference
|
24
44
|
if reference.respond_to? :read_interval
|
@@ -33,8 +53,12 @@ module Bio::MAF
|
|
33
53
|
end
|
34
54
|
end
|
35
55
|
|
56
|
+
def species_to_use
|
57
|
+
species || species_map.keys
|
58
|
+
end
|
59
|
+
|
36
60
|
def tile
|
37
|
-
parser.sequence_filter[:only_species] =
|
61
|
+
parser.sequence_filter[:only_species] = species_to_use
|
38
62
|
# TODO: remove gaps
|
39
63
|
blocks = index.find([interval], parser).sort_by { |b| b.vars[:score] }
|
40
64
|
mask = Array.new(interval.length, :ref)
|
@@ -51,7 +75,7 @@ module Bio::MAF
|
|
51
75
|
(slice_start - i_start)...(slice_end - i_start))
|
52
76
|
end
|
53
77
|
text = []
|
54
|
-
|
78
|
+
species_to_use.each { |s| text << '' }
|
55
79
|
nonref_text = text[1...text.size]
|
56
80
|
runs(mask) do |range, block|
|
57
81
|
g_range = (range.begin + i_start)...(range.end + i_start)
|
@@ -69,7 +93,7 @@ module Bio::MAF
|
|
69
93
|
else
|
70
94
|
# covered by an alignment block
|
71
95
|
t_range = block.ref_seq.text_range(g_range)
|
72
|
-
|
96
|
+
species_to_use.each_with_index do |species, i|
|
73
97
|
sp_text = text[i]
|
74
98
|
seq = block.sequences.find { |s| s.source == species || s.species == species }
|
75
99
|
if seq
|
@@ -86,7 +110,7 @@ module Bio::MAF
|
|
86
110
|
end
|
87
111
|
|
88
112
|
def write_fasta(f)
|
89
|
-
|
113
|
+
species_to_use.zip(tile()) do |species, text|
|
90
114
|
sp_out = species_map[species] || species
|
91
115
|
f.puts ">#{sp_out}"
|
92
116
|
f.puts text
|
@@ -147,10 +171,13 @@ module Bio::MAF
|
|
147
171
|
line = line_raw.strip
|
148
172
|
end_pos = pos + line.size
|
149
173
|
if (! in_region) && pos <= z_start && z_start < end_pos
|
150
|
-
|
174
|
+
offset = z_start - pos
|
175
|
+
end_offset = [(offset + region_size), line.size].min
|
176
|
+
data << line.slice(offset...end_offset)
|
151
177
|
in_region = true
|
152
178
|
elsif in_region
|
153
179
|
need = region_size - data.size
|
180
|
+
raise "should not happen: region #{region_size}, data #{data.size}, need #{need}" if need < 0
|
154
181
|
if need > line.size
|
155
182
|
data << line
|
156
183
|
else
|
data/man/maf_index.1
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
.\" generated with Ronn/v0.7.3
|
2
2
|
.\" http://github.com/rtomayko/ronn/tree/0.7.3
|
3
3
|
.
|
4
|
-
.TH "MAF_INDEX" "1" "
|
4
|
+
.TH "MAF_INDEX" "1" "July 2012" "BioRuby" "BioRuby Manual"
|
5
5
|
.
|
6
6
|
.SH "NAME"
|
7
7
|
\fBmaf_index\fR \- build and examine MAF indexes
|
data/man/maf_tile.1
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
.\" generated with Ronn/v0.7.3
|
2
2
|
.\" http://github.com/rtomayko/ronn/tree/0.7.3
|
3
3
|
.
|
4
|
-
.TH "MAF_TILE" "1" "
|
4
|
+
.TH "MAF_TILE" "1" "July 2012" "BioRuby" "BioRuby Manual"
|
5
5
|
.
|
6
6
|
.SH "NAME"
|
7
7
|
\fBmaf_tile\fR \- synthesize an alignment for a given region
|
8
8
|
.
|
9
9
|
.SH "SYNOPSIS"
|
10
|
-
\fBmaf_tile\fR [\fIoptions\fR] \-i BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR
|
10
|
+
\fBmaf_tile\fR [\fIoptions\fR] \-i [SEQ:]BEGIN:END [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
|
11
11
|
.
|
12
12
|
.P
|
13
|
-
\fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR
|
13
|
+
\fBmaf_tile\fR [\fIoptions\fR] \-\-bed BED \-o BASE [\-s SPECIES[:NAME] \.\.\.] \fImaf\fR [index]
|
14
14
|
.
|
15
15
|
.SH "DESCRIPTION"
|
16
|
-
\fBmaf_tile\fR takes a MAF file with index
|
16
|
+
\fBmaf_tile\fR takes a MAF file, with optional index, or directory of indexed MAF files, extracts alignment blocks overlapping the given genomic interval, and constructs a single alignment block covering the entire interval for the specified species\. Optionally, any gaps in coverage of the MAF file\'s reference sequence can be filled in from a FASTA sequence file\.
|
17
17
|
.
|
18
18
|
.P
|
19
|
-
If a single interval is specified, the output will be written to stdout in FASTA format\. If the \fB\-\-output\-base\fR option is specified, \fB_<
|
19
|
+
If a single interval is specified, the output will be written to stdout in FASTA format\. If a directory of MAF files is supplied as the \fImaf\fR parameter, the interval must include the sequence identifier in the form \fBsequence:begin:end\fR\. If the \fB\-\-output\-base\fR option is specified, \fB_<begin>:<end>\.fa\fR will be appended to the given parameter and used to construct the output path\. If a BED file is specified with \fB\-\-bed\fR, \fB\-\-output\-base\fR is also required\.
|
20
20
|
.
|
21
21
|
.P
|
22
22
|
Species can be renamed for output by specifying them as SPECIES:NAME; the first component will be used to select the species from the MAF file, and the second will be used in the FASTA description line for output\.
|
@@ -84,10 +84,10 @@ $ maf_tile \-\-bed /tmp/mm8\.bed \-\-output\-base /tmp/mm8 \e
|
|
84
84
|
The output is generated in FASTA format, with one sequence per species\.
|
85
85
|
.
|
86
86
|
.P
|
87
|
-
The
|
87
|
+
The \fImaf\fR parameter must specify either a Multiple Alignment Format (MAF) file or a directory of such files, with indexes\.
|
88
88
|
.
|
89
89
|
.P
|
90
|
-
The \fIindex\fR must be a MAF index built with maf_index(1)\.
|
90
|
+
The \fIindex\fR must be a MAF index built with maf_index(1)\. This parameter is ignored if the \fImaf\fR parameter is a directory\. It can be omitted if a single MAF file is given, but in this case the entire file will be parsed to build a temporary index\. For large files which will be reused, this is not advisable\.
|
91
91
|
.
|
92
92
|
.P
|
93
93
|
If \fB\-\-bed\fR \fIbed\fR is specified, its argument must be a BED file\. Only the second and third columns will be used, to specify the zero\-based start and end positions of intervals\.
|
data/man/maf_tile.1.ronn
CHANGED
@@ -3,23 +3,26 @@ maf_tile(1) -- synthesize an alignment for a given region
|
|
3
3
|
|
4
4
|
## SYNOPSIS
|
5
5
|
|
6
|
-
`maf_tile` [<options>] -i BEGIN:END [-s SPECIES[:NAME] ...] <maf>
|
6
|
+
`maf_tile` [<options>] -i [SEQ:]BEGIN:END [-s SPECIES[:NAME] ...] <maf> [index]
|
7
7
|
|
8
|
-
`maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf>
|
8
|
+
`maf_tile` [<options>] --bed BED -o BASE [-s SPECIES[:NAME] ...] <maf> [index]
|
9
9
|
|
10
10
|
## DESCRIPTION
|
11
11
|
|
12
|
-
**maf_tile** takes a MAF file with index
|
13
|
-
extracts alignment blocks overlapping the given
|
14
|
-
constructs a single alignment block covering the
|
15
|
-
the specified species. Optionally, any gaps in
|
16
|
-
file's reference sequence can be filled in from a
|
12
|
+
**maf_tile** takes a MAF file, with optional index, or directory of
|
13
|
+
indexed MAF files, extracts alignment blocks overlapping the given
|
14
|
+
genomic interval, and constructs a single alignment block covering the
|
15
|
+
entire interval for the specified species. Optionally, any gaps in
|
16
|
+
coverage of the MAF file's reference sequence can be filled in from a
|
17
|
+
FASTA sequence file.
|
17
18
|
|
18
19
|
If a single interval is specified, the output will be written to
|
19
|
-
stdout in FASTA format. If
|
20
|
-
|
21
|
-
|
22
|
-
|
20
|
+
stdout in FASTA format. If a directory of MAF files is supplied as the
|
21
|
+
<maf> parameter, the interval must include the sequence identifier in
|
22
|
+
the form `sequence:begin:end`. If the `--output-base` option is
|
23
|
+
specified, `_<begin>:<end>.fa` will be appended to the given <base>
|
24
|
+
parameter and used to construct the output path. If a BED file is
|
25
|
+
specified with `--bed`, `--output-base` is also required.
|
23
26
|
|
24
27
|
Species can be renamed for output by specifying them as SPECIES:NAME;
|
25
28
|
the first component will be used to select the species from the MAF
|
@@ -80,9 +83,14 @@ sequence:
|
|
80
83
|
The output is generated in FASTA format, with one sequence per
|
81
84
|
species.
|
82
85
|
|
83
|
-
The
|
86
|
+
The <maf> parameter must specify either a Multiple Alignment Format
|
87
|
+
(MAF) file or a directory of such files, with indexes.
|
84
88
|
|
85
|
-
The <index> must be a MAF index built with maf_index(1).
|
89
|
+
The <index> must be a MAF index built with maf_index(1). This
|
90
|
+
parameter is ignored if the <maf> parameter is a directory. It can be
|
91
|
+
omitted if a single MAF file is given, but in this case the entire
|
92
|
+
file will be parsed to build a temporary index. For large files which
|
93
|
+
will be reused, this is not advisable.
|
86
94
|
|
87
95
|
If `--bed` <bed> is specified, its argument must be a BED file. Only
|
88
96
|
the second and third columns will be used, to specify the zero-based
|
data/man/maf_to_fasta.1
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
.\" generated with Ronn/v0.7.3
|
2
2
|
.\" http://github.com/rtomayko/ronn/tree/0.7.3
|
3
3
|
.
|
4
|
-
.TH "MAF_TO_FASTA" "1" "
|
4
|
+
.TH "MAF_TO_FASTA" "1" "July 2012" "BioRuby" "BioRuby Manual"
|
5
5
|
.
|
6
6
|
.SH "NAME"
|
7
7
|
\fBmaf_to_fasta\fR \- convert MAF file to FASTA
|
data/spec/bio/maf/index_spec.rb
CHANGED
@@ -3,6 +3,73 @@ require 'spec_helper'
|
|
3
3
|
module Bio
|
4
4
|
module MAF
|
5
5
|
|
6
|
+
describe Access do
|
7
|
+
describe "#tile" do
|
8
|
+
it "gives correct output with a Pathname" do
|
9
|
+
access = Access.maf_dir(TestData)
|
10
|
+
interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
|
11
|
+
buf = StringIO.new
|
12
|
+
access.tile(interval) do |tiler|
|
13
|
+
tiler.reference = TestData + 'gap-sp1.fa'
|
14
|
+
tiler.species = %w(sp1 sp2 sp3)
|
15
|
+
tiler.write_fasta(buf)
|
16
|
+
end
|
17
|
+
buf.string.should == File.read(TestData + 'gap-filled1.fa')
|
18
|
+
end
|
19
|
+
it "gives correct output with only a species map" do
|
20
|
+
access = Access.maf_dir(TestData)
|
21
|
+
interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
|
22
|
+
buf = StringIO.new
|
23
|
+
access.tile(interval) do |tiler|
|
24
|
+
tiler.reference = TestData + 'gap-sp1.fa'
|
25
|
+
tiler.species_map = {
|
26
|
+
'sp1' => 'sp1',
|
27
|
+
'sp2' => 'sp2',
|
28
|
+
'sp3' => 'sp3'
|
29
|
+
}
|
30
|
+
tiler.write_fasta(buf)
|
31
|
+
end
|
32
|
+
buf.string.should == File.read(TestData + 'gap-filled1.fa')
|
33
|
+
end
|
34
|
+
it "gives correct output with no species specified" do
|
35
|
+
pending("issue 88") do
|
36
|
+
access = Access.maf_dir(TestData)
|
37
|
+
interval = GenomicInterval.zero_based('sp1.chr1', 0, 50)
|
38
|
+
buf = StringIO.new
|
39
|
+
access.tile(interval) do |tiler|
|
40
|
+
tiler.reference = TestData + 'gap-sp1.fa'
|
41
|
+
tiler.write_fasta(buf)
|
42
|
+
end
|
43
|
+
buf.string.should == File.read(TestData + 'gap-filled1.fa')
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
describe ".file" do
|
48
|
+
it "accepts a MAF file and index" do
|
49
|
+
access = Access.file(TestData + 'gap-1.maf',
|
50
|
+
TestData + 'gap-1.kct')
|
51
|
+
blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
|
52
|
+
10,
|
53
|
+
23)]).to_a
|
54
|
+
blocks.size.should == 1
|
55
|
+
end
|
56
|
+
it "accepts a MAF file and finds the index" do
|
57
|
+
access = Access.file(TestData + 'gap-1.maf')
|
58
|
+
blocks = access.find([GenomicInterval.zero_based('sp1.chr1',
|
59
|
+
10,
|
60
|
+
23)]).to_a
|
61
|
+
blocks.size.should == 1
|
62
|
+
end
|
63
|
+
it "accepts a MAF file and builds a temp index" do
|
64
|
+
access = Access.file(TestData + 'chrY-1block.maf')
|
65
|
+
blocks = access.find([GenomicInterval.zero_based('hg19.chrY',
|
66
|
+
10501,
|
67
|
+
10544)]).to_a
|
68
|
+
blocks.size.should == 1
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
6
73
|
describe KyotoIndex do
|
7
74
|
def has_at_least_n_with_prefix(n, start)
|
8
75
|
@idx.db.cursor_process do |cur|
|
@@ -87,6 +154,38 @@ module Bio
|
|
87
154
|
l[0].offset.should == 16
|
88
155
|
end
|
89
156
|
|
157
|
+
it "takes a block arg" do
|
158
|
+
called = false
|
159
|
+
@idx.find([GenomicInterval.zero_based('mm8.chr7',
|
160
|
+
80082334,
|
161
|
+
80082338)],
|
162
|
+
@p) do |block|
|
163
|
+
block.offset.should == 16
|
164
|
+
called = true
|
165
|
+
end
|
166
|
+
called.should be_true
|
167
|
+
end
|
168
|
+
|
169
|
+
it "with a block and no match, returns" do
|
170
|
+
called = false
|
171
|
+
@idx.find([GenomicInterval.zero_based('mm8.chr7',
|
172
|
+
20082334,
|
173
|
+
20082338)],
|
174
|
+
@p) do |block|
|
175
|
+
called = true
|
176
|
+
end
|
177
|
+
called.should be_false
|
178
|
+
end
|
179
|
+
|
180
|
+
it "with no block and no match, returns an empty list" do
|
181
|
+
v = @idx.find([GenomicInterval.zero_based('mm8.chr7',
|
182
|
+
20082334,
|
183
|
+
20082338)],
|
184
|
+
@p)
|
185
|
+
v.should_not be_nil
|
186
|
+
v.should respond_to(:count)
|
187
|
+
end
|
188
|
+
|
90
189
|
after(:each) do
|
91
190
|
@idx.db.close
|
92
191
|
@p.f.close
|
@@ -0,0 +1,184 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Bio
|
4
|
+
module MAF
|
5
|
+
|
6
|
+
describe Header do
|
7
|
+
before(:each) do
|
8
|
+
@p = Parser.new(TestData + 't1.maf')
|
9
|
+
end
|
10
|
+
|
11
|
+
it "provides version information" do
|
12
|
+
@p.header.version.should == '1'
|
13
|
+
end
|
14
|
+
it "provides the scoring scheme" do
|
15
|
+
@p.header.scoring.should == 'humor.v4'
|
16
|
+
end
|
17
|
+
it "provides alignment parameters" do
|
18
|
+
@p.header.alignment_params.should =~ /humor.v4 R=30/
|
19
|
+
end
|
20
|
+
|
21
|
+
it "presents multiline parameters correctly" do
|
22
|
+
@p.header.alignment_params.should == "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf"
|
23
|
+
end
|
24
|
+
|
25
|
+
it "provides arbitrary parameters"
|
26
|
+
end
|
27
|
+
|
28
|
+
describe Block do
|
29
|
+
describe "#find_gaps" do
|
30
|
+
it "finds a single 14-base gap" do
|
31
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
32
|
+
p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
|
33
|
+
block = p.parse_block
|
34
|
+
gaps = block.find_gaps
|
35
|
+
gaps.size.should == 1
|
36
|
+
gaps[0][0].should == 34
|
37
|
+
gaps[0][1].should == 14
|
38
|
+
end
|
39
|
+
end
|
40
|
+
describe "#remove_gaps!" do
|
41
|
+
it "removes a single 14-base gap" do
|
42
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
43
|
+
p.sequence_filter = { :only_species => %w(mm8 rn4 hg18 canFam2 loxAfr1) }
|
44
|
+
block = p.parse_block
|
45
|
+
block.sequences.size.should == 5
|
46
|
+
block.text_size.should == 54
|
47
|
+
block.remove_gaps!
|
48
|
+
block.text_size.should == 40
|
49
|
+
end
|
50
|
+
end
|
51
|
+
describe "#joinable_with?" do
|
52
|
+
it "is false for blocks with different sequences" do
|
53
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
54
|
+
sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 dasNov1 loxAfr1 echTel1)
|
55
|
+
p.sequence_filter = { :only_species => sp }
|
56
|
+
b1 = p.parse_block
|
57
|
+
b2 = p.parse_block
|
58
|
+
b1.joinable_with?(b2).should be_false
|
59
|
+
end
|
60
|
+
it "is true for blocks with same sequences" do
|
61
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
62
|
+
sp = %w(mm8 rn4 oryCun1 hg18 panTro2 rheMac2 canFam2 loxAfr1 echTel1)
|
63
|
+
p.sequence_filter = { :only_species => sp }
|
64
|
+
b1 = p.parse_block
|
65
|
+
b2 = p.parse_block
|
66
|
+
b1.joinable_with?(b2).should be_true
|
67
|
+
end
|
68
|
+
end
|
69
|
+
describe "#to_bio_alignment" do
|
70
|
+
it "returns a usable Bio::BioAlignment::Alignment" do
|
71
|
+
p = Parser.new(TestData + 'mm8_chr7_tiny.maf')
|
72
|
+
b = p.parse_block
|
73
|
+
ba = b.to_bio_alignment
|
74
|
+
ba.size.should == 10
|
75
|
+
ba.sequences[0].id.should == "mm8.chr7"
|
76
|
+
ba.sequences[0].seq.should =~ /^GGGCTGAGGGC--/
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
describe Sequence do
|
82
|
+
before(:each) do
|
83
|
+
@parser = DummyParser.new
|
84
|
+
end
|
85
|
+
|
86
|
+
describe "#gapped?" do
|
87
|
+
it "is false for sequences with no gaps" do
|
88
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
89
|
+
s = @parser.parse_seq_line(line, nil)
|
90
|
+
s.gapped?.should be_false
|
91
|
+
end
|
92
|
+
it "is true for sequences with gaps" do
|
93
|
+
line = "s human_unc 9077 8 + 10998 AC-AGTATT"
|
94
|
+
s = @parser.parse_seq_line(line, nil)
|
95
|
+
s.gapped?.should be_true
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
describe "#text_range" do
|
100
|
+
it "returns 0...text.size for a spanning interval" do
|
101
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
102
|
+
s = @parser.parse_seq_line(line, nil)
|
103
|
+
range = s.text_range(9077...(9077 + 8))
|
104
|
+
range.should == (0...(s.text.size))
|
105
|
+
end
|
106
|
+
it "returns 0...text.size for a gapped spanning interval" do
|
107
|
+
line = "s human_unc 9077 8 + 10998 AC--AGTATT"
|
108
|
+
s = @parser.parse_seq_line(line, nil)
|
109
|
+
range = s.text_range(9077...(9077 + 8))
|
110
|
+
range.should == (0...(s.text.size))
|
111
|
+
end
|
112
|
+
it "handles a leading subset" do
|
113
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
114
|
+
s = @parser.parse_seq_line(line, nil)
|
115
|
+
range = s.text_range(9077...(9077 + 2))
|
116
|
+
range.should == (0...2)
|
117
|
+
end
|
118
|
+
it "handles a trailing subset" do
|
119
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
120
|
+
s = @parser.parse_seq_line(line, nil)
|
121
|
+
range = s.text_range(9079...9085)
|
122
|
+
range.should == (2...8)
|
123
|
+
end
|
124
|
+
it "handles a gap in the middle" do
|
125
|
+
line = "s human_unc 9077 8 + 10998 AC--AGTATT"
|
126
|
+
s = @parser.parse_seq_line(line, nil)
|
127
|
+
range = s.text_range(9078...(9077 + 8))
|
128
|
+
range.should == (1...(s.text.size))
|
129
|
+
end
|
130
|
+
it "errors on a range starting before" do
|
131
|
+
expect {
|
132
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
133
|
+
s = @parser.parse_seq_line(line, nil)
|
134
|
+
range = s.text_range(9076...(9077 + 8))
|
135
|
+
}.to raise_error
|
136
|
+
end
|
137
|
+
it "errors on a range ending after" do
|
138
|
+
expect {
|
139
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
140
|
+
s = @parser.parse_seq_line(line, nil)
|
141
|
+
range = s.text_range(9076...(9077 + 9))
|
142
|
+
}.to raise_error
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
describe "synteny data" do
|
147
|
+
it "extracts basic data from i lines" do
|
148
|
+
p = Parser.new(TestData + 'chr22_ieq2.maf',
|
149
|
+
:parse_extended => true)
|
150
|
+
b = p.parse_block
|
151
|
+
b.sequences[0].left_status_char.should be_nil
|
152
|
+
b.sequences[0].left_status.should be_nil
|
153
|
+
b.sequences[0].left_count.should be_nil
|
154
|
+
b.sequences[0].right_status_char.should be_nil
|
155
|
+
b.sequences[0].right_status.should be_nil
|
156
|
+
b.sequences[0].right_count.should be_nil
|
157
|
+
# works but let's not over-specify internal state
|
158
|
+
#b.sequences[1].i_data.should == %w(N 0 C 0)
|
159
|
+
b.sequences[1].left_status_char.should == 'N'
|
160
|
+
b.sequences[1].left_status.should == :first
|
161
|
+
b.sequences[1].right_status_char.should == 'C'
|
162
|
+
b.sequences[1].right_status.should == :contiguous
|
163
|
+
b.sequences[2].left_status.should == :contiguous
|
164
|
+
b.sequences[2].right_status_char.should == 'I'
|
165
|
+
b.sequences[2].right_status.should == :intervening
|
166
|
+
b.sequences[2].right_count.should == 146
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
describe "#to_bioalignment" do
|
171
|
+
it "returns a usable Bio::BioAlignment::Sequence" do
|
172
|
+
@parser = DummyParser.new
|
173
|
+
line = "s human_unc 9077 8 + 10998 ACAGTATT"
|
174
|
+
s = @parser.parse_seq_line(line, nil)
|
175
|
+
as = s.to_bio_alignment
|
176
|
+
as.id.should == "human_unc"
|
177
|
+
as.seq.should == "ACAGTATT"
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
182
|
+
|
183
|
+
end
|
184
|
+
end
|