bio-maf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.simplecov +1 -0
- data/.travis.yml +16 -0
- data/.yardopts +3 -0
- data/DEVELOPMENT.md +40 -0
- data/Gemfile +23 -0
- data/LICENSE.txt +20 -0
- data/README.md +209 -0
- data/Rakefile +76 -0
- data/VERSION +1 -0
- data/benchmarks/dispatch_bench +53 -0
- data/benchmarks/iter_bench +44 -0
- data/benchmarks/read_bench +40 -0
- data/benchmarks/sort_bench +33 -0
- data/benchmarks/split_bench +33 -0
- data/bin/maf_count +82 -0
- data/bin/maf_dump_blocks +27 -0
- data/bin/maf_extract_ranges_count +44 -0
- data/bin/maf_index +88 -0
- data/bin/maf_parse_bench +94 -0
- data/bin/maf_to_fasta +68 -0
- data/bin/maf_write +84 -0
- data/bin/random_ranges +35 -0
- data/features/maf-indexing.feature +31 -0
- data/features/maf-output.feature +29 -0
- data/features/maf-parsing.feature +44 -0
- data/features/maf-querying.feature +75 -0
- data/features/maf-to-fasta.feature +50 -0
- data/features/step_definitions/convert_steps.rb +45 -0
- data/features/step_definitions/index_steps.rb +20 -0
- data/features/step_definitions/output_steps.rb +27 -0
- data/features/step_definitions/parse_steps.rb +63 -0
- data/features/step_definitions/query_steps.rb +31 -0
- data/features/step_definitions/ucsc_bin_steps.rb +14 -0
- data/features/support/env.rb +16 -0
- data/features/ucsc-bins.feature +24 -0
- data/lib/bio/maf/index.rb +620 -0
- data/lib/bio/maf/parser.rb +888 -0
- data/lib/bio/maf/struct.rb +63 -0
- data/lib/bio/maf/writer.rb +63 -0
- data/lib/bio/maf.rb +4 -0
- data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
- data/lib/bio/ucsc/ucsc_bin.rb +117 -0
- data/lib/bio/ucsc.rb +2 -0
- data/lib/bio-maf/maf.rb +3 -0
- data/lib/bio-maf.rb +12 -0
- data/man/.gitignore +1 -0
- data/man/maf_index.1 +105 -0
- data/man/maf_index.1.markdown +97 -0
- data/man/maf_index.1.ronn +83 -0
- data/man/maf_to_fasta.1 +53 -0
- data/man/maf_to_fasta.1.ronn +51 -0
- data/spec/bio/maf/index_spec.rb +363 -0
- data/spec/bio/maf/parser_spec.rb +354 -0
- data/spec/bio/maf/struct_spec.rb +75 -0
- data/spec/spec_helper.rb +14 -0
- data/test/data/big-block.maf +15999 -0
- data/test/data/chr22_ieq.maf +11 -0
- data/test/data/chrY-1block.maf +6 -0
- data/test/data/empty +0 -0
- data/test/data/empty.db +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chr7_tiny.maf +76 -0
- data/test/data/mm8_mod_a.maf +7 -0
- data/test/data/mm8_single.maf +13 -0
- data/test/data/mm8_subset_a.maf +23 -0
- data/test/data/t1-bad1.maf +15 -0
- data/test/data/t1.fasta +12 -0
- data/test/data/t1.maf +15 -0
- data/test/data/t1a.maf +17 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-maf.rb +7 -0
- data/travis-ci/install_kc +13 -0
- data/travis-ci/install_kc_java +13 -0
- data/travis-ci/report_errors +4 -0
- metadata +181 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
@milestone_3
|
2
|
+
Feature: Filter results from MAF files
|
3
|
+
In order to work with only relevant data from a MAF file
|
4
|
+
Such as only species recognized by PhyloCSF
|
5
|
+
I want to filter the results of MAF queries
|
6
|
+
|
7
|
+
Scenario: Return only specified species
|
8
|
+
Given MAF data:
|
9
|
+
"""
|
10
|
+
##maf version=1
|
11
|
+
a score=10542.0
|
12
|
+
s mm8.chr7 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
|
13
|
+
s rn4.chr1 136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
|
14
|
+
s oryCun1.scaffold_199771 14021 43 - 75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
|
15
|
+
s hg18.chr15 88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
|
16
|
+
s panTro2.chr15 87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
|
17
|
+
s rheMac2.chr7 69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
|
18
|
+
s canFam2.chr3 56030570 39 + 94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
|
19
|
+
s dasNov1.scaffold_106893 7435 34 + 9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
|
20
|
+
s loxAfr1.scaffold_8298 30264 38 + 78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
|
21
|
+
s echTel1.scaffold_304651 594 37 - 10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
|
22
|
+
"""
|
23
|
+
When I open it with a MAF reader
|
24
|
+
And filter for only the species
|
25
|
+
| hg18 |
|
26
|
+
| mm8 |
|
27
|
+
| rheMac2 |
|
28
|
+
Then an alignment block can be obtained
|
29
|
+
And the alignment block has 3 sequences
|
30
|
+
|
31
|
+
Scenario: Return only blocks having all specified species
|
32
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
33
|
+
When I open it with a MAF reader
|
34
|
+
And build an index on the reference sequence
|
35
|
+
And filter for blocks with the species
|
36
|
+
| panTro2 |
|
37
|
+
| loxAfr1 |
|
38
|
+
And search for blocks between positions 80082471 and 80082730 of mm8.chr7
|
39
|
+
Then 1 block is obtained
|
40
|
+
|
41
|
+
Scenario: Return only blocks having a certain number of sequences
|
42
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
43
|
+
When I open it with a MAF reader
|
44
|
+
And build an index on the reference sequence
|
45
|
+
And filter for blocks with at least 6 sequences
|
46
|
+
And search for blocks between positions 80082767 and 80083008 of mm8.chr7
|
47
|
+
Then 1 block is obtained
|
48
|
+
|
49
|
+
# sizes present:
|
50
|
+
# 55 64 128 148 157 163 165 192
|
51
|
+
|
52
|
+
Scenario: Return blocks with a text size of at least a given minimum
|
53
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
54
|
+
When I open it with a MAF reader
|
55
|
+
And build an index on the reference sequence
|
56
|
+
And filter for blocks with text size at least 150
|
57
|
+
And search for blocks between positions 0 and 80100000 of mm8.chr7
|
58
|
+
Then 4 blocks are obtained
|
59
|
+
|
60
|
+
Scenario: Return blocks with a text size of at most a given maximum
|
61
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
62
|
+
When I open it with a MAF reader
|
63
|
+
And build an index on the reference sequence
|
64
|
+
And filter for blocks with text size at most 72
|
65
|
+
And search for blocks between positions 0 and 80100000 of mm8.chr7
|
66
|
+
Then 2 blocks are obtained
|
67
|
+
|
68
|
+
Scenario: Return blocks within a text size range
|
69
|
+
Given a MAF source file "mm8_chr7_tiny.maf"
|
70
|
+
When I open it with a MAF reader
|
71
|
+
And build an index on the reference sequence
|
72
|
+
And filter for blocks with text size between 72 and 160
|
73
|
+
And search for blocks between positions 0 and 80100000 of mm8.chr7
|
74
|
+
Then 3 blocks are obtained
|
75
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
Feature: Convert MAF file to FASTA
|
2
|
+
In order to use multiple alignment data with other tools
|
3
|
+
I want to read a Multiple Alignment Format (MAF) file and write out its data as FASTA
|
4
|
+
|
5
|
+
Scenario: Convert simple MAF file
|
6
|
+
Given a MAF source file "t1.maf"
|
7
|
+
When I select FASTA output
|
8
|
+
And I open it with a MAF reader
|
9
|
+
And process the file
|
10
|
+
Then the output should match "t1.fasta"
|
11
|
+
|
12
|
+
Scenario: Convert simple MAF data
|
13
|
+
Given MAF data:
|
14
|
+
"""
|
15
|
+
##maf version=1 scoring=humor.v4
|
16
|
+
# humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
|
17
|
+
# /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
|
18
|
+
|
19
|
+
a score=0.128
|
20
|
+
s human_hoxa 100 8 + 100257 ACA-TTACT
|
21
|
+
s horse_hoxa 120 9 - 98892 ACAATTGCT
|
22
|
+
s fugu_hoxa 88 7 + 90788 ACA--TGCT
|
23
|
+
|
24
|
+
|
25
|
+
a score=0.071
|
26
|
+
s human_unc 9077 8 + 10998 ACAGTATT
|
27
|
+
# Comment
|
28
|
+
s horse_unc 4555 6 - 5099 ACA--ATT
|
29
|
+
s fugu_unc 4000 4 + 4038 AC----TT
|
30
|
+
"""
|
31
|
+
When I select FASTA output
|
32
|
+
And I open it with a MAF reader
|
33
|
+
And process the file
|
34
|
+
Then the output should be:
|
35
|
+
"""
|
36
|
+
>human_hoxa:100-108
|
37
|
+
ACA-TTACT
|
38
|
+
>horse_hoxa:120-129
|
39
|
+
ACAATTGCT
|
40
|
+
>fugu_hoxa:88-95
|
41
|
+
ACA--TGCT
|
42
|
+
>human_unc:9077-9085
|
43
|
+
ACAGTATT
|
44
|
+
>horse_unc:4555-4561
|
45
|
+
ACA--ATT
|
46
|
+
>fugu_unc:4000-4004
|
47
|
+
AC----TT
|
48
|
+
|
49
|
+
"""
|
50
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'bigbio' # FASTA support
|
2
|
+
|
3
|
+
# Step definitions for the MAF-to-FASTA conversion scenarios.
#
# Instance variables shared between steps:
#   @src_f  - MAF input: a Pathname below $test_data, or a Tempfile holding
#             inline MAF text
#   @dst    - Tempfile whose path is handed to the FASTA writer
#   @writer - FastaWriter created on @dst.path
#   @parser - read here but assigned elsewhere (the "I open it with a MAF
#             reader" step)

# Selects a fixture file from the test data directory and checks it exists.
Given /^a MAF source file "(.*?)"$/ do |src|
  @src_f = $test_data + src
  @src_f.exist?.should be_true
end

# Stores the inline MAF text from the scenario in a temporary .maf file.
Given /^MAF data:$/ do |string|
  @src_f = Tempfile.new(['rspec', '.maf'])
  @src_f.write(string)
  @src_f.close
end

# Creates the output tempfile and a FastaWriter on its path.
When /^I select FASTA output$/ do
  # No step assigns @out_fmt, so the suffix used to degrade to a bare ".".
  # Fall back to :fasta while still honouring @out_fmt if it is ever set.
  @dst = Tempfile.new(['cuke', ".#{@out_fmt || :fasta}"])
  @dst.close
  @writer = FastaWriter.new(@dst.path)
end

# Passes every raw sequence of every parsed block to the FASTA writer.
When /^process the file$/ do
  begin
    @parser.each_block do |block|
      block.each_raw_seq { |seq| seq.write_fasta(@writer) }
    end
  ensure
    # Close the writer even when parsing raises, so it is not left open
    # for the remainder of the run.
    @writer.close
  end
end

# Compares the produced output byte-for-byte with a reference fixture.
Then /^the output should match "(.*?)"$/ do |ref|
  ref_p = $test_data + ref
  ref_p.exist?.should be_true
  File.read(@dst.path).should == File.read(ref_p)
end

# Compares the produced output byte-for-byte with the inline text.
Then /^the output should be:$/ do |string|
  File.read(@dst.path).should == string
end

# Removes the temporary files created during the scenario.
After do
  if @dst
    @dst.close
    @dst.unlink
  end
  # Only sources created by the "MAF data:" step are temporary. Fixture
  # sources are Pathnames, which also respond to #unlink and must never be
  # deleted, hence the explicit class check.
  @src_f.unlink if @src_f.is_a?(Tempfile)
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Step definitions for building, opening and querying MAF indexes.
#
# @idx holds the index under test and @blocks the result of the last search;
# @parser and @block_filter are assigned by other step files.

# Builds an index from the current parser.
When /^build an index on the reference sequence$/ do
  # NOTE(review): '%' is presumably Kyoto Cabinet's name for an in-memory
  # database, so nothing is written to disk -- confirm against KyotoIndex.
  db_name = '%'
  @idx = Bio::MAF::KyotoIndex.build(@parser, db_name)
end

# Opens an existing index file from the test data directory.
Given /^a Kyoto Cabinet index file "(.*?)"$/ do |name|
  index_path = $test_data + name
  @idx = Bio::MAF::KyotoIndex.open(index_path)
end

# Checks the record count of the index database against a lower bound.
Then /^the index has at least (\d+) entries$/ do |size_spec|
  minimum = size_spec.to_i
  @idx.db.count.should be >= minimum
end

# Queries the index for one interval and keeps the resulting blocks.
When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |i_start, i_end, chr|
  first_pos, last_pos = [i_start, i_end].map(&:to_i)
  interval = Bio::GenomicInterval.zero_based(chr, first_pos, last_pos)
  found = @idx.find([interval], @parser, @block_filter)
  @blocks = found.to_a
end

# Checks how many blocks the last search returned.
Then /^(\d+) blocks? (?:is|are) obtained$/ do |num|
  expected_count = num.to_i
  @blocks.size.should == expected_count
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'shellwords'

# Step definitions for writing MAF output and comparing it with fixtures.
#
# @dst is the output Tempfile and @writer the Bio::MAF::Writer created on
# it; @parser is assigned by another step file.

# Creates the output tempfile and a MAF writer on it.
When /^open a new MAF writer$/ do
  @dst = Tempfile.new(["cuke", ".maf"])
  @writer = Bio::MAF::Writer.new(@dst)
end

# Copies the header of the parsed source file to the writer.
When /^write the header from the original MAF file$/ do
  @writer.write_header(@parser.header)
end

# Writes every block produced by the parser.
When /^write all the parsed blocks$/ do
  @writer.write_blocks(@parser.parse_blocks)
end

# Matcher that passes when `diff --ignore-space-change` finds no difference
# between the file at `expected` and the file at `actual`.
RSpec::Matchers.define :match_except_ws do |expected|
  # Both paths are spliced into a shell command line, so quote them: an
  # unquoted path containing spaces or shell metacharacters would be split
  # or interpreted by the shell.
  quoted_paths = lambda do |actual|
    [expected, actual].map { |path| Shellwords.escape(path.to_s) }.join(' ')
  end

  match do |actual|
    # system returns true only when diff exits with status 0.
    system("diff --ignore-space-change --brief #{quoted_paths.call(actual)} >/dev/null 2>&1")
  end

  failure_message_for_should do |actual|
    msg = "File contents did not match. Diff:\n"
    msg << `diff --unified --ignore-space-change #{quoted_paths.call(actual)}`
  end
end

# Compares the written file with a reference fixture, ignoring changes in
# the amount of whitespace.
Then /^the output should match, except whitespace, "(.+)"$/ do |ref|
  @dst.path.should match_except_ws($test_data + ref)
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Step definitions for opening MAF sources and inspecting parsed blocks.
#
# @src_f is assigned by the source-selection steps, @blocks by the index
# search step. @opts collects parser options, @parser is the parser under
# test and @block the most recently parsed block.

# Helpers shared by the "sequence ... has ..." steps below.
module ParseStepHelpers
  # Converts the attribute phrase used in a feature file into the name of
  # the method to call, e.g. "text size" becomes :text_size.
  def attribute_for(phrase)
    phrase.gsub(/ /, '_').to_sym
  end

  # Returns the sequence of @blocks[block_index] whose source equals
  # `source`. Fails the step with an expectation error when there is no
  # such sequence, rather than a NoMethodError on nil further down.
  def block_sequence(block_index, source)
    found = @blocks[block_index.to_i].sequences.find { |s| s.source == source }
    found.should_not be_nil
    found
  end
end
World(ParseStepHelpers)

# Creates the parser on the selected source with any enabled options.
When /^I open it with a MAF reader$/ do
  @parser = Bio::MAF::Parser.new(@src_f, @opts || {})
end

# Records a boolean parser option to be passed when the reader is opened.
When /^I enable the :(\S+) parser option$/ do |opt_s|
  @opts ||= {}
  @opts[opt_s.to_sym] = true
end

Then /^the MAF version should be "(.*?)"$/ do |v_spec|
  @parser.header.version.to_s.should == v_spec
end

Then /^the scoring scheme should be "(.*?)"$/ do |s_spec|
  @parser.header.scoring.should == s_spec
end

Then /^the alignment parameters should be "(.*?)"$/ do |a_spec|
  @parser.header.alignment_params.should == a_spec
end

# Parses the next block and keeps it in @block for the following steps.
Then /^an alignment block can be obtained$/ do
  @block = @parser.parse_block
  @block.should_not be_nil
end

Then /^the alignment block has (\d+) sequences$/ do |n_seq|
  @block.sequences.size.should == n_seq.to_i
end

# The next three steps check one attribute of a sequence of @block, looked
# up with raw_seq, against a string, an integer or a symbol.

Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |i, method, str|
  @block.raw_seq(i.to_i).send(attribute_for(method)).should == str
end

Then /^sequence (\d+) has (\w.*?) (\d+)\s*$/ do |i, method, num|
  @block.raw_seq(i.to_i).send(attribute_for(method)).should == num.to_i
end

Then /^sequence (\d+) has (\w.*?) :(\S+)\s*$/ do |i, method, sym_s|
  @block.raw_seq(i.to_i).send(attribute_for(method)).should == sym_s.to_sym
end

# The next three steps do the same for a sequence selected by its source
# name within one of the blocks returned by the last search.

Then /^sequence (\S+) of block (\d+) has (\w.*?) "(.*?)"$/ do |chr, i, method, str|
  block_sequence(i, chr).send(attribute_for(method)).should == str
end

Then /^sequence (\S+) of block (\d+) has (\w.*?) (\d+)$/ do |chr, i, method, num|
  block_sequence(i, chr).send(attribute_for(method)).should == num.to_i
end

Then /^sequence (\S+) of block (\d+) has (\w.*?) :(\S+)$/ do |chr, i, method, sym_s|
  block_sequence(i, chr).send(attribute_for(method)).should == sym_s.to_sym
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Step definitions that configure sequence and block filters.
#
# @block_filter is read by the index search step; each step below replaces
# it with a new hash rather than merging into an existing one.

# Sets the parser's sequence filter to :only_species with the listed names.
When /^filter for only the species$/ do |table|
  # table is a Cucumber::Ast::Table; the names are its first column
  species = table.raw.map { |cells| cells.first }
  @parser.sequence_filter = { :only_species => species }
end

# Sets the block filter to :with_all_species with the listed names.
When /^filter for blocks with the species$/ do |table|
  # table is a Cucumber::Ast::Table; the names are its first column
  species = table.raw.map { |cells| cells.first }
  @block_filter = { :with_all_species => species }
end

# Sets the block filter to a minimum sequence count.
When /^filter for blocks with at least (\d+) sequences$/ do |n|
  minimum = n.to_i
  @block_filter = { :at_least_n_sequences => minimum }
end

# Sets the block filter to a single text-size bound: "least" maps to
# :min_size and "most" to :max_size.
When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
  bound_keys = { 'least' => :min_size, 'most' => :max_size }
  constraint = bound_keys.fetch(op) { raise "bad operator #{op}!" }
  @block_filter = { constraint => len.to_i }
end

# Sets the block filter to both a lower and an upper text-size bound.
When /^filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
  lower, upper = [min, max].map(&:to_i)
  @block_filter = { :min_size => lower, :max_size => upper }
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#require 'bio-ucsc-api'
|
2
|
+
|
3
|
+
# Step definitions for the UCSC bin computation scenarios.

# Stores the region bounds as integers for the following steps.
Given /^I have a region with start (\d+) and end (\d+)$/ do |r_start, r_end|
  @r_start, @r_end = [r_start, r_end].map(&:to_i)
end

# Asks UcscBin for the bin of the stored region and keeps the result.
When /^I compute the smallest containing bin$/ do
  bin_calc = Bio::Ucsc::UcscBin
  @bin = bin_calc.bin_from_range(@r_start, @r_end)
end

# Compares the computed bin with the expected bin number.
Then /^the bin should be (\d+)$/ do |expected_bin|
  wanted = expected_bin.to_i
  @bin.should == wanted
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Cucumber support environment: optional coverage, load path and test data.

# Coverage is not loaded when the TRAVIS environment variable is set or when
# running on the Java platform; elsewhere a missing simplecov only produces
# a warning.
unless ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java'
  begin
    require 'simplecov'
  rescue LoadError => e
    $stderr.puts "WARNING: could not require 'simplecov': #{e}"
  end
end

require 'pathname'
require 'tempfile'

# This file lives in features/support, so three levels up is the project
# root.
$LOAD_PATH << File.expand_path('../../../lib', __FILE__)

require 'bio-maf'

# Anchor the fixture directory to this file, like the library path above,
# so the features do not depend on the current working directory.
$test_data = Pathname.new(File.expand_path('../../../test/data', __FILE__))
|
@@ -0,0 +1,24 @@
|
|
1
|
+
Feature: Computation of UCSC bins
|
2
|
+
In order to efficiently use indexes
|
3
|
+
We will use the UCSC bin indexing system
|
4
|
+
Per http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
|
5
|
+
|
6
|
+
Scenario Outline: Compute smallest containing bin
|
7
|
+
Given I have a region with start <Start> and end <End>
|
8
|
+
When I compute the smallest containing bin
|
9
|
+
Then the bin should be <Bin>
|
10
|
+
|
11
|
+
Examples:
|
12
|
+
| Start | End | Bin |
|
13
|
+
| 25079603 | 25079787 | 776 |
|
14
|
+
| 25128173 | 25128248 | 776 |
|
15
|
+
| 50312474 | 50312703 | 968 |
|
16
|
+
| 41905591 | 41906101 | 904 |
|
17
|
+
| 16670899 | 16673060 | 712 |
|
18
|
+
| 75495356 | 75495494 | 1160 |
|
19
|
+
| 92259501 | 92261053 | 1288 |
|
20
|
+
| 83834063 | 83838132 | 1224 |
|
21
|
+
| 7309597 | 7310411 | 640 |
|
22
|
+
| 6190410 | 6190999 | 632 |
|
23
|
+
# from https://github.com/polyatail/biopython/blob/af34c033d78c4c72dffbb500e513e568a2ba5e29/Tests/test_MafIO_index.py#L48
|
24
|
+
|