bio-maf 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.simplecov +1 -0
- data/.travis.yml +16 -0
- data/.yardopts +3 -0
- data/DEVELOPMENT.md +40 -0
- data/Gemfile +23 -0
- data/LICENSE.txt +20 -0
- data/README.md +209 -0
- data/Rakefile +76 -0
- data/VERSION +1 -0
- data/benchmarks/dispatch_bench +53 -0
- data/benchmarks/iter_bench +44 -0
- data/benchmarks/read_bench +40 -0
- data/benchmarks/sort_bench +33 -0
- data/benchmarks/split_bench +33 -0
- data/bin/maf_count +82 -0
- data/bin/maf_dump_blocks +27 -0
- data/bin/maf_extract_ranges_count +44 -0
- data/bin/maf_index +88 -0
- data/bin/maf_parse_bench +94 -0
- data/bin/maf_to_fasta +68 -0
- data/bin/maf_write +84 -0
- data/bin/random_ranges +35 -0
- data/features/maf-indexing.feature +31 -0
- data/features/maf-output.feature +29 -0
- data/features/maf-parsing.feature +44 -0
- data/features/maf-querying.feature +75 -0
- data/features/maf-to-fasta.feature +50 -0
- data/features/step_definitions/convert_steps.rb +45 -0
- data/features/step_definitions/index_steps.rb +20 -0
- data/features/step_definitions/output_steps.rb +27 -0
- data/features/step_definitions/parse_steps.rb +63 -0
- data/features/step_definitions/query_steps.rb +31 -0
- data/features/step_definitions/ucsc_bin_steps.rb +14 -0
- data/features/support/env.rb +16 -0
- data/features/ucsc-bins.feature +24 -0
- data/lib/bio/maf/index.rb +620 -0
- data/lib/bio/maf/parser.rb +888 -0
- data/lib/bio/maf/struct.rb +63 -0
- data/lib/bio/maf/writer.rb +63 -0
- data/lib/bio/maf.rb +4 -0
- data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
- data/lib/bio/ucsc/ucsc_bin.rb +117 -0
- data/lib/bio/ucsc.rb +2 -0
- data/lib/bio-maf/maf.rb +3 -0
- data/lib/bio-maf.rb +12 -0
- data/man/.gitignore +1 -0
- data/man/maf_index.1 +105 -0
- data/man/maf_index.1.markdown +97 -0
- data/man/maf_index.1.ronn +83 -0
- data/man/maf_to_fasta.1 +53 -0
- data/man/maf_to_fasta.1.ronn +51 -0
- data/spec/bio/maf/index_spec.rb +363 -0
- data/spec/bio/maf/parser_spec.rb +354 -0
- data/spec/bio/maf/struct_spec.rb +75 -0
- data/spec/spec_helper.rb +14 -0
- data/test/data/big-block.maf +15999 -0
- data/test/data/chr22_ieq.maf +11 -0
- data/test/data/chrY-1block.maf +6 -0
- data/test/data/empty +0 -0
- data/test/data/empty.db +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf +76 -0
- data/test/data/mm8_mod_a.maf +7 -0
- data/test/data/mm8_single.maf +13 -0
- data/test/data/mm8_subset_a.maf +23 -0
- data/test/data/t1-bad1.maf +15 -0
- data/test/data/t1.fasta +12 -0
- data/test/data/t1.maf +15 -0
- data/test/data/t1a.maf +17 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-maf.rb +7 -0
- data/travis-ci/install_kc +13 -0
- data/travis-ci/install_kc_java +13 -0
- data/travis-ci/report_errors +4 -0
- metadata +181 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
@milestone_3
|
2
|
+
Feature: Filter results from MAF files
|
3
|
+
In order to work with only relevant data from a MAF file
|
4
|
+
Such as only species recognized by PhyloCSF
|
5
|
+
I want to filter the results of MAF queries
|
6
|
+
|
7
|
+
Scenario: Return only specified species
|
8
|
+
Given MAF data:
|
9
|
+
"""
|
10
|
+
##maf version=1
|
11
|
+
a score=10542.0
|
12
|
+
s mm8.chr7 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
|
13
|
+
s rn4.chr1 136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
|
14
|
+
s oryCun1.scaffold_199771 14021 43 - 75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
|
15
|
+
s hg18.chr15 88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
|
16
|
+
s panTro2.chr15 87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
|
17
|
+
s rheMac2.chr7 69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
|
18
|
+
s canFam2.chr3 56030570 39 + 94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
|
19
|
+
s dasNov1.scaffold_106893 7435 34 + 9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
|
20
|
+
s loxAfr1.scaffold_8298 30264 38 + 78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
|
21
|
+
s echTel1.scaffold_304651 594 37 - 10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
|
22
|
+
"""
|
23
|
+
When I open it with a MAF reader
|
24
|
+
And filter for only the species
|
25
|
+
| hg18 |
|
26
|
+
| mm8 |
|
27
|
+
| rheMac2 |
|
28
|
+
Then an alignment block can be obtained
|
29
|
+
And the alignment block has 3 sequences
|
30
|
+
|
31
|
+
Scenario: Return only blocks having all specified species
|
32
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
33
|
+
When I open it with a MAF reader
|
34
|
+
And build an index on the reference sequence
|
35
|
+
And filter for blocks with the species
|
36
|
+
| panTro2 |
|
37
|
+
| loxAfr1 |
|
38
|
+
And search for blocks between positions 80082471 and 80082730 of mm8.chr7
|
39
|
+
Then 1 block is obtained
|
40
|
+
|
41
|
+
Scenario: Return only blocks having a certain number of sequences
|
42
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
43
|
+
When I open it with a MAF reader
|
44
|
+
And build an index on the reference sequence
|
45
|
+
And filter for blocks with at least 6 sequences
|
46
|
+
And search for blocks between positions 80082767 and 80083008 of mm8.chr7
|
47
|
+
Then 1 block is obtained
|
48
|
+
|
49
|
+
# sizes present:
|
50
|
+
# 55 64 128 148 157 163 165 192
|
51
|
+
|
52
|
+
Scenario: Return blocks of at least a given text size
|
53
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
54
|
+
When I open it with a MAF reader
|
55
|
+
And build an index on the reference sequence
|
56
|
+
And filter for blocks with text size at least 150
|
57
|
+
And search for blocks between positions 0 and 80100000 of mm8.chr7
|
58
|
+
Then 4 blocks are obtained
|
59
|
+
|
60
|
+
Scenario: Return blocks of at most a given text size
|
61
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
62
|
+
When I open it with a MAF reader
|
63
|
+
And build an index on the reference sequence
|
64
|
+
And filter for blocks with text size at most 72
|
65
|
+
And search for blocks between positions 0 and 80100000 of mm8.chr7
|
66
|
+
Then 2 blocks are obtained
|
67
|
+
|
68
|
+
Scenario: Return blocks within a text size range
|
69
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
70
|
+
When I open it with a MAF reader
|
71
|
+
And build an index on the reference sequence
|
72
|
+
And filter for blocks with text size between 72 and 160
|
73
|
+
And search for blocks between positions 0 and 80100000 of mm8.chr7
|
74
|
+
Then 3 blocks are obtained
|
75
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
Feature: Convert MAF file to FASTA
|
2
|
+
In order to use multiple alignment data with other tools
|
3
|
+
I want to read a Multiple Alignment Format (MAF) file and write out its data as FASTA
|
4
|
+
|
5
|
+
Scenario: Convert simple MAF file
|
6
|
+
Given a MAF source file "t1.maf"
|
7
|
+
When I select FASTA output
|
8
|
+
And I open it with a MAF reader
|
9
|
+
And process the file
|
10
|
+
Then the output should match "t1.fasta"
|
11
|
+
|
12
|
+
Scenario: Convert simple MAF data
|
13
|
+
Given MAF data:
|
14
|
+
"""
|
15
|
+
##maf version=1 scoring=humor.v4
|
16
|
+
# humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
|
17
|
+
# /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
|
18
|
+
|
19
|
+
a score=0.128
|
20
|
+
s human_hoxa 100 8 + 100257 ACA-TTACT
|
21
|
+
s horse_hoxa 120 9 - 98892 ACAATTGCT
|
22
|
+
s fugu_hoxa 88 7 + 90788 ACA--TGCT
|
23
|
+
|
24
|
+
|
25
|
+
a score=0.071
|
26
|
+
s human_unc 9077 8 + 10998 ACAGTATT
|
27
|
+
# Comment
|
28
|
+
s horse_unc 4555 6 - 5099 ACA--ATT
|
29
|
+
s fugu_unc 4000 4 + 4038 AC----TT
|
30
|
+
"""
|
31
|
+
When I select FASTA output
|
32
|
+
And I open it with a MAF reader
|
33
|
+
And process the file
|
34
|
+
Then the output should be:
|
35
|
+
"""
|
36
|
+
>human_hoxa:100-108
|
37
|
+
ACA-TTACT
|
38
|
+
>horse_hoxa:120-129
|
39
|
+
ACAATTGCT
|
40
|
+
>fugu_hoxa:88-95
|
41
|
+
ACA--TGCT
|
42
|
+
>human_unc:9077-9085
|
43
|
+
ACAGTATT
|
44
|
+
>horse_unc:4555-4561
|
45
|
+
ACA--ATT
|
46
|
+
>fugu_unc:4000-4004
|
47
|
+
AC----TT
|
48
|
+
|
49
|
+
"""
|
50
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'bigbio' # FASTA support

# Step definitions for converting MAF input to FASTA output.
#
# Scenario state is shared through instance variables:
#   @src_f  - the MAF input: a Pathname under $test_data, or a Tempfile
#             holding MAF text given inline in the scenario
#   @dst    - Tempfile whose path receives the generated output
#   @writer - FastaWriter writing to @dst's path
#   @parser - set by the "I open it with a MAF reader" step

# Use a fixture from the test data directory as the MAF input, failing
# immediately if the fixture does not exist.
Given /^a MAF source file "(.*?)"$/ do |src|
  @src_f = $test_data + src
  @src_f.exist?.should be_true
end

# Store the scenario's inline MAF text in a temporary file and use that
# file as the MAF input.
Given /^MAF data:$/ do |string|
  @src_f = Tempfile.new(['rspec', '.maf'])
  @src_f.write(string)
  # Close to flush the buffered text to disk before anything reads the file.
  @src_f.close
end

# Prepare a FASTA writer targeting a fresh temporary file.
When /^I select FASTA output$/ do
  # The Tempfile is closed right away; only its path is handed to the writer.
  @dst = Tempfile.new(['cuke', '.fasta'])
  @dst.close
  @writer = FastaWriter.new(@dst.path)
end

# Write every raw sequence of every parsed block through the FASTA writer,
# then close the writer.
When /^process the file$/ do
  @parser.each_block do |block|
    block.each_raw_seq { |seq| seq.write_fasta(@writer) }
  end
  @writer.close
end

# Compare the generated output byte-for-byte with a reference fixture.
Then /^the output should match "(.*?)"$/ do |ref|
  ref_p = $test_data + ref
  ref_p.exist?.should be_true
  File.read(@dst.path).should == File.read(ref_p)
end

# Compare the generated output byte-for-byte with the scenario's inline text.
Then /^the output should be:$/ do |string|
  File.read(@dst.path).should == string
end

# Remove the temporary files created during the scenario.
After do
  [@dst, @src_f].each do |tmp|
    # @src_f may be a Pathname to a checked-in fixture (Pathname responds to
    # #unlink too), so only genuine Tempfiles are ever deleted here.
    next unless tmp.is_a?(Tempfile)
    tmp.close
    tmp.unlink
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Step definitions for building, opening and querying MAF indexes.
#
# Scenario state: @idx holds the index, @blocks the result of the most
# recent search. @parser and @block_filter are set by other step files.

# Build an index over the currently open parser.
# NOTE(review): '%' looks like Kyoto Cabinet's path syntax for an in-memory
# tree database - confirm against KyotoIndex.build.
When /^build an index on the reference sequence$/ do
  @idx = Bio::MAF::KyotoIndex.build(@parser, '%')
end

# Open an existing index file from the test data directory.
Given /^a Kyoto Cabinet index file "(.*?)"$/ do |index_name|
  index_path = $test_data + index_name
  @idx = Bio::MAF::KyotoIndex.open(index_path)
end

# Check that the index database holds at least the given number of records.
Then /^the index has at least (\d+) entries$/ do |minimum|
  entry_count = @idx.db.count
  entry_count.should be >= minimum.to_i
end

# Query the index for one zero-based interval, applying the block filter
# (nil when no filter step ran), and keep the matches as an Array.
When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |first_pos, last_pos, source|
  interval = Bio::GenomicInterval.zero_based(source, first_pos.to_i, last_pos.to_i)
  @blocks = @idx.find([interval], @parser, @block_filter).to_a
end

# Check how many blocks the most recent search returned.
Then /^(\d+) blocks? (?:is|are) obtained$/ do |expected|
  @blocks.size.should == expected.to_i
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'shellwords'

# Step definitions for writing MAF output and comparing it with reference
# files. @parser is set by the "I open it with a MAF reader" step.

# Create a temporary destination file and a MAF writer on top of it.
When /^open a new MAF writer$/ do
  @dst = Tempfile.new(["cuke", ".maf"])
  @writer = Bio::MAF::Writer.new(@dst)
end

# Copy the header of the parsed MAF file to the writer.
When /^write the header from the original MAF file$/ do
  @writer.write_header(@parser.header)
end

# Write every block the parser yields to the writer.
When /^write all the parsed blocks$/ do
  @writer.write_blocks(@parser.parse_blocks)
end

# Matcher comparing two files with diff(1), ignoring changes in the amount
# of whitespace. `expected` and `actual` are file paths (String or Pathname).
RSpec::Matchers.define :match_except_ws do |expected|
  match do |actual|
    # Paths are shell-escaped so spaces or metacharacters in them cannot
    # split or alter the command line.
    paths = [expected, actual].map { |path| Shellwords.escape(path.to_s) }.join(' ')
    # system returns true only when diff exits 0, i.e. no differences found.
    system("diff --ignore-space-change --brief #{paths} >/dev/null 2>&1")
  end

  failure_message_for_should do |actual|
    paths = [expected, actual].map { |path| Shellwords.escape(path.to_s) }.join(' ')
    msg = "File contents did not match. Diff:\n"
    # Append the unified diff so the failure shows what differed.
    msg << `diff --unified --ignore-space-change #{paths}`
  end
end

# Compare the written output with a reference fixture, ignoring whitespace
# changes.
Then /^the output should match, except whitespace, "(.+)"$/ do |ref|
  @dst.path.should match_except_ws($test_data + ref)
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Step definitions for opening MAF input and inspecting parsed results.
#
# Scenario state: @src_f is set by the "MAF source file"/"MAF data" steps,
# @blocks by the search step. @opts, @parser and @block are set here.
#
# In the "sequence ... has <attribute> <value>" steps, the attribute phrase
# is turned into a method name by replacing spaces with underscores and is
# then invoked on the sequence.

# Open the current MAF source with any parser options enabled so far.
When /^I open it with a MAF reader$/ do
  @parser = Bio::MAF::Parser.new(@src_f, @opts || {})
end

# Switch on a boolean parser option; must run before the reader is opened
# to have any effect on it.
When /^I enable the :(\S+) parser option$/ do |opt_s|
  @opts ||= {}
  @opts[opt_s.to_sym] = true
end

Then /^the MAF version should be "(.*?)"$/ do |v_spec|
  @parser.header.version.to_s.should == v_spec
end

Then /^the scoring scheme should be "(.*?)"$/ do |s_spec|
  @parser.header.scoring.should == s_spec
end

Then /^the alignment parameters should be "(.*?)"$/ do |a_spec|
  @parser.header.alignment_params.should == a_spec
end

# Parse the next block and keep it in @block for the steps that follow.
Then /^an alignment block can be obtained$/ do
  @block = @parser.parse_block
  @block.should_not be_nil
end

Then /^the alignment block has (\d+) sequences$/ do |n_seq|
  @block.sequences.size.should == n_seq.to_i
end

# --- Attributes of a sequence in @block, addressed by index ---

# Expected value given as a quoted string.
Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |i, method, str|
  method_sym = method.tr(' ', '_').to_sym
  @block.raw_seq(i.to_i).send(method_sym).should == str
end

# Expected value given as an integer.
Then /^sequence (\d+) has (\w.*?) (\d+)\s*$/ do |i, method, num|
  method_sym = method.tr(' ', '_').to_sym
  @block.raw_seq(i.to_i).send(method_sym).should == num.to_i
end

# Expected value given as a symbol (written with a leading colon).
Then /^sequence (\d+) has (\w.*?) :(\S+)\s*$/ do |i, method, sym_s|
  method_sym = method.tr(' ', '_').to_sym
  value_sym = sym_s.to_sym
  @block.raw_seq(i.to_i).send(method_sym).should == value_sym
end

# --- Attributes of a sequence in @blocks, addressed by source name ---

# Expected value given as a quoted string.
Then /^sequence (\S+) of block (\d+) has (\w.*?) "(.*?)"$/ do |chr, i, method, str|
  seq = @blocks[i.to_i].sequences.find { |candidate| candidate.source == chr }
  # Fail with a clear expectation error rather than NoMethodError on nil
  # when the block has no sequence from this source.
  seq.should_not be_nil
  method_sym = method.tr(' ', '_').to_sym
  seq.send(method_sym).should == str
end

# Expected value given as an integer.
Then /^sequence (\S+) of block (\d+) has (\w.*?) (\d+)$/ do |chr, i, method, num|
  seq = @blocks[i.to_i].sequences.find { |candidate| candidate.source == chr }
  seq.should_not be_nil
  method_sym = method.tr(' ', '_').to_sym
  seq.send(method_sym).should == num.to_i
end

# Expected value given as a symbol (written with a leading colon).
Then /^sequence (\S+) of block (\d+) has (\w.*?) :(\S+)$/ do |chr, i, method, sym_s|
  seq = @blocks[i.to_i].sequences.find { |candidate| candidate.source == chr }
  seq.should_not be_nil
  method_sym = method.tr(' ', '_').to_sym
  seq.send(method_sym).should == sym_s.to_sym
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Step definitions that configure sequence and block filters.
#
# The species steps receive a Cucumber::Ast::Table with one species per row,
# in the first column. Every block-filter step replaces @block_filter
# outright, so the last such step in a scenario wins.

# Restrict the parser's output to sequences from the listed species.
When /^filter for only the species$/ do |table|
  species = table.raw.map { |cells| cells.first }
  @parser.sequence_filter = { :only_species => species }
end

# Keep only blocks that contain every one of the listed species.
When /^filter for blocks with the species$/ do |table|
  species = table.raw.map { |cells| cells.first }
  @block_filter = { :with_all_species => species }
end

# Keep only blocks with at least the given number of sequences.
When /^filter for blocks with at least (\d+) sequences$/ do |n|
  minimum = n.to_i
  @block_filter = { :at_least_n_sequences => minimum }
end

# Keep only blocks whose text size is at least / at most the given length.
When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
  # Map the wording used in the step onto the matching filter key.
  size_keys = { 'least' => :min_size, 'most' => :max_size }
  raise "bad operator #{op}!" unless size_keys.key?(op)
  @block_filter = { size_keys[op] => len.to_i }
end

# Keep only blocks whose text size lies between the two given bounds.
When /^filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
  lower, upper = min.to_i, max.to_i
  @block_filter = { :min_size => lower, :max_size => upper }
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#require 'bio-ucsc-api'
|
2
|
+
|
3
|
+
# Step definitions for the UCSC bin computation feature.

# Remember the region bounds, converted to integers, for the next step.
Given /^I have a region with start (\d+) and end (\d+)$/ do |start_text, end_text|
  @r_start, @r_end = start_text.to_i, end_text.to_i
end

# Compute the bin for the remembered region and keep it in @bin.
When /^I compute the smallest containing bin$/ do
  @bin = Bio::Ucsc::UcscBin.bin_from_range(@r_start, @r_end)
end

# Check the computed bin against the expected bin number.
Then /^the bin should be (\d+)$/ do |bin_text|
  expected = bin_text.to_i
  @bin.should == expected
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Cucumber support file: optional coverage, load path setup and the shared
# location of the test data used by the step definitions.

# Coverage is skipped when the TRAVIS environment variable is set or when
# RUBY_PLATFORM is 'java'; elsewhere a missing simplecov only warns.
unless ENV.key?('TRAVIS') || RUBY_PLATFORM == 'java'
  begin
    require 'simplecov'
  rescue LoadError => e
    $stderr.puts "WARNING: could not require 'simplecov': #{e}"
  end
end

require 'pathname'
require 'tempfile'

# This file lives in features/support/, so the project root is two
# directories above the directory containing it.
project_root = File.expand_path('../../..', __FILE__)

$LOAD_PATH << File.join(project_root, 'lib')

require 'bio-maf'

# Anchor the test data directory on the project root (like the load path
# above) so the features also work when cucumber is started from a
# directory other than the project root.
$test_data = Pathname.new(File.join(project_root, 'test', 'data'))
|
@@ -0,0 +1,24 @@
|
|
1
|
+
Feature: Computation of UCSC bins
|
2
|
+
In order to efficiently use indexes
|
3
|
+
We will use the UCSC bin indexing system
|
4
|
+
Per http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
|
5
|
+
|
6
|
+
Scenario Outline: Compute smallest containing bin
|
7
|
+
Given I have a region with start <Start> and end <End>
|
8
|
+
When I compute the smallest containing bin
|
9
|
+
Then the bin should be <Bin>
|
10
|
+
|
11
|
+
Examples:
|
12
|
+
| Start | End | Bin |
|
13
|
+
| 25079603 | 25079787 | 776 |
|
14
|
+
| 25128173 | 25128248 | 776 |
|
15
|
+
| 50312474 | 50312703 | 968 |
|
16
|
+
| 41905591 | 41906101 | 904 |
|
17
|
+
| 16670899 | 16673060 | 712 |
|
18
|
+
| 75495356 | 75495494 | 1160 |
|
19
|
+
| 92259501 | 92261053 | 1288 |
|
20
|
+
| 83834063 | 83838132 | 1224 |
|
21
|
+
| 7309597 | 7310411 | 640 |
|
22
|
+
| 6190410 | 6190999 | 632 |
|
23
|
+
# from https://github.com/polyatail/biopython/blob/af34c033d78c4c72dffbb500e513e568a2ba5e29/Tests/test_MafIO_index.py#L48
|
24
|
+
|