bio-maf 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio/maf/index.rb +620 -0
  38. data/lib/bio/maf/parser.rb +888 -0
  39. data/lib/bio/maf/struct.rb +63 -0
  40. data/lib/bio/maf/writer.rb +63 -0
  41. data/lib/bio/maf.rb +4 -0
  42. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  43. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio-maf/maf.rb +3 -0
  46. data/lib/bio-maf.rb +12 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +181 -0
@@ -0,0 +1,75 @@
1
+ @milestone_3
2
+ Feature: Filter results from MAF files
3
+ In order to work with only relevant data from a MAF file
4
+ Such as only species recognized by PhyloCSF
5
+ I want to filter the results of MAF queries
6
+
7
+ Scenario: Return only specified species
8
+ Given MAF data:
9
+ """
10
+ ##maf version=1
11
+ a score=10542.0
12
+ s mm8.chr7 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
13
+ s rn4.chr1 136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
14
+ s oryCun1.scaffold_199771 14021 43 - 75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
15
+ s hg18.chr15 88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
16
+ s panTro2.chr15 87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
17
+ s rheMac2.chr7 69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
18
+ s canFam2.chr3 56030570 39 + 94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
19
+ s dasNov1.scaffold_106893 7435 34 + 9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
20
+ s loxAfr1.scaffold_8298 30264 38 + 78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
21
+ s echTel1.scaffold_304651 594 37 - 10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
22
+ """
23
+ When I open it with a MAF reader
24
+ And filter for only the species
25
+ | hg18 |
26
+ | mm8 |
27
+ | rheMac2 |
28
+ Then an alignment block can be obtained
29
+ And the alignment block has 3 sequences
30
+
31
+ Scenario: Return only blocks having all specified species
32
+ Given a MAF source file "mm8_chr7_tiny.maf"
33
+ When I open it with a MAF reader
34
+ And build an index on the reference sequence
35
+ And filter for blocks with the species
36
+ | panTro2 |
37
+ | loxAfr1 |
38
+ And search for blocks between positions 80082471 and 80082730 of mm8.chr7
39
+ Then 1 block is obtained
40
+
41
+ Scenario: Return only blocks having at least a certain number of sequences
42
+ Given a MAF source file "mm8_chr7_tiny.maf"
43
+ When I open it with a MAF reader
44
+ And build an index on the reference sequence
45
+ And filter for blocks with at least 6 sequences
46
+ And search for blocks between positions 80082767 and 80083008 of mm8.chr7
47
+ Then 1 block is obtained
48
+
49
+ # block text sizes present in the test data:
50
+ # 55 64 128 148 157 163 165 192
51
+
52
+ Scenario: Return blocks with text size at least a given minimum
53
+ Given a MAF source file "mm8_chr7_tiny.maf"
54
+ When I open it with a MAF reader
55
+ And build an index on the reference sequence
56
+ And filter for blocks with text size at least 150
57
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
58
+ Then 4 blocks are obtained
59
+
60
+ Scenario: Return blocks with text size at most a given maximum
61
+ Given a MAF source file "mm8_chr7_tiny.maf"
62
+ When I open it with a MAF reader
63
+ And build an index on the reference sequence
64
+ And filter for blocks with text size at most 72
65
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
66
+ Then 2 blocks are obtained
67
+
68
+ Scenario: Return blocks within a text size range
69
+ Given a MAF source file "mm8_chr7_tiny.maf"
70
+ When I open it with a MAF reader
71
+ And build an index on the reference sequence
72
+ And filter for blocks with text size between 72 and 160
73
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
74
+ Then 3 blocks are obtained
75
+
@@ -0,0 +1,50 @@
1
+ Feature: Convert MAF file to FASTA
2
+ In order to use multiple alignment data with other tools
3
+ I want to read a Multiple Alignment Format (MAF) file and write out its data as FASTA
4
+
5
+ Scenario: Convert simple MAF file
6
+ Given a MAF source file "t1.maf"
7
+ When I select FASTA output
8
+ And I open it with a MAF reader
9
+ And process the file
10
+ Then the output should match "t1.fasta"
11
+
12
+ Scenario: Convert simple MAF data
13
+ Given MAF data:
14
+ """
15
+ ##maf version=1 scoring=humor.v4
16
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
17
+ # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
18
+
19
+ a score=0.128
20
+ s human_hoxa 100 8 + 100257 ACA-TTACT
21
+ s horse_hoxa 120 9 - 98892 ACAATTGCT
22
+ s fugu_hoxa 88 7 + 90788 ACA--TGCT
23
+
24
+
25
+ a score=0.071
26
+ s human_unc 9077 8 + 10998 ACAGTATT
27
+ # Comment
28
+ s horse_unc 4555 6 - 5099 ACA--ATT
29
+ s fugu_unc 4000 4 + 4038 AC----TT
30
+ """
31
+ When I select FASTA output
32
+ And I open it with a MAF reader
33
+ And process the file
34
+ Then the output should be:
35
+ """
36
+ >human_hoxa:100-108
37
+ ACA-TTACT
38
+ >horse_hoxa:120-129
39
+ ACAATTGCT
40
+ >fugu_hoxa:88-95
41
+ ACA--TGCT
42
+ >human_unc:9077-9085
43
+ ACAGTATT
44
+ >horse_unc:4555-4561
45
+ ACA--ATT
46
+ >fugu_unc:4000-4004
47
+ AC----TT
48
+
49
+ """
50
+
@@ -0,0 +1,45 @@
1
+ require 'bigbio' # FASTA support
2
+
3
+ Given /^a MAF source file "(.*?)"$/ do |src|
4
+ @src_f = $test_data + src
5
+ @src_f.exist?.should be_true
6
+ end
7
+
8
+ Given /^MAF data:$/ do |string|
9
+ @src_f = Tempfile.new(['rspec', '.maf'])
10
+ @src_f.write(string)
11
+ @src_f.close
12
+ end
13
+
14
+ When /^I select FASTA output$/ do
15
+ @dst = Tempfile.new(['cuke', ".#{@out_fmt.to_s}"])
16
+ @dst.close
17
+ @writer = FastaWriter.new(@dst.path)
18
+ end
19
+
20
+ When /^process the file$/ do
21
+ @parser.each_block do |block|
22
+ block.each_raw_seq do |seq|
23
+ seq.write_fasta(@writer)
24
+ end
25
+ end
26
+ @writer.close
27
+ end
28
+
29
+ Then /^the output should match "(.*?)"$/ do |ref|
30
+ ref_p = $test_data + ref
31
+ ref_p.exist?.should be_true
32
+ #system("diff #{ref} #{@dst.path} >/dev/null 2>&1").should be_true
33
+ File.read(@dst.path).should == File.read(ref_p)
34
+ end
35
+
36
+ Then /^the output should be:$/ do |string|
37
+ File.read(@dst.path).should == string
38
+ end
39
+
40
+ After do
41
+ if @dst
42
+ @dst.close
43
+ @dst.unlink
44
+ end
45
+ end
@@ -0,0 +1,20 @@
1
+ When /^build an index on the reference sequence$/ do
2
+ @idx = Bio::MAF::KyotoIndex.build(@parser, '%')
3
+ end
4
+
5
+ Given /^a Kyoto Cabinet index file "(.*?)"$/ do |name|
6
+ @idx = Bio::MAF::KyotoIndex.open($test_data + name)
7
+ end
8
+
9
+ Then /^the index has at least (\d+) entries$/ do |size_spec|
10
+ @idx.db.count.should be >= size_spec.to_i
11
+ end
12
+
13
+ When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |i_start, i_end, chr|
14
+ int = Bio::GenomicInterval.zero_based(chr, i_start.to_i, i_end.to_i)
15
+ @blocks = @idx.find([int], @parser, @block_filter).to_a
16
+ end
17
+
18
+ Then /^(\d+) blocks? (?:is|are) obtained$/ do |num|
19
+ @blocks.size.should == num.to_i
20
+ end
@@ -0,0 +1,27 @@
1
+ When /^open a new MAF writer$/ do
2
+ @dst = Tempfile.new(["cuke", ".maf"])
3
+ @writer = Bio::MAF::Writer.new(@dst)
4
+ end
5
+
6
+ When /^write the header from the original MAF file$/ do
7
+ @writer.write_header(@parser.header)
8
+ end
9
+
10
+ When /^write all the parsed blocks$/ do
11
+ @writer.write_blocks(@parser.parse_blocks)
12
+ end
13
+
14
+ RSpec::Matchers.define :match_except_ws do |expected|
15
+ match do |actual|
16
+ system("diff --ignore-space-change --brief #{expected} #{actual} >/dev/null 2>&1")
17
+ end
18
+
19
+ failure_message_for_should do |actual|
20
+ msg = "File contents did not match. Diff:\n"
21
+ msg << `diff --unified --ignore-space-change #{expected} #{actual}`
22
+ end
23
+ end
24
+
25
+ Then /^the output should match, except whitespace, "(.+)"$/ do |ref|
26
+ @dst.path.should match_except_ws($test_data + ref)
27
+ end
@@ -0,0 +1,63 @@
1
+ When /^I open it with a MAF reader$/ do
2
+ @parser = Bio::MAF::Parser.new(@src_f, @opts || {})
3
+ end
4
+
5
+ When /^I enable the :(\S+) parser option$/ do |opt_s|
6
+ @opts ||= {}
7
+ @opts[opt_s.to_sym] = true
8
+ end
9
+
10
+ Then /^the MAF version should be "(.*?)"$/ do |v_spec|
11
+ @parser.header.version.to_s.should == v_spec
12
+ end
13
+
14
+ Then /^the scoring scheme should be "(.*?)"$/ do |s_spec|
15
+ @parser.header.scoring.should == s_spec
16
+ end
17
+
18
+ Then /^the alignment parameters should be "(.*?)"$/ do |a_spec|
19
+ @parser.header.alignment_params.should == a_spec
20
+ end
21
+
22
+ Then /^an alignment block can be obtained$/ do
23
+ @block = @parser.parse_block
24
+ @block.should_not be_nil
25
+ end
26
+
27
+ Then /^the alignment block has (\d+) sequences$/ do |n_seq|
28
+ @block.sequences.size.should == n_seq.to_i
29
+ end
30
+
31
+ Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |i, method, str|
32
+ method_sym = method.gsub(/ /, '_').to_sym
33
+ @block.raw_seq(i.to_i).send(method_sym).should == str
34
+ end
35
+
36
+ Then /^sequence (\d+) has (\w.*?) (\d+)\s*$/ do |i, method, num|
37
+ method_sym = method.gsub(/ /, '_').to_sym
38
+ @block.raw_seq(i.to_i).send(method_sym).should == num.to_i
39
+ end
40
+
41
+ Then /^sequence (\d+) has (\w.*?) :(\S+)\s*$/ do |i, method, sym_s|
42
+ method_sym = method.gsub(/ /, '_').to_sym
43
+ value_sym = sym_s.to_sym
44
+ @block.raw_seq(i.to_i).send(method_sym).should == value_sym
45
+ end
46
+
47
+ Then /^sequence (\S+) of block (\d+) has (\w.*?) "(.*?)"$/ do |chr, i, method, str|
48
+ seq = @blocks[i.to_i].sequences.find { |seq| seq.source == chr }
49
+ method_sym = method.gsub(/ /, '_').to_sym
50
+ seq.send(method_sym).should == str
51
+ end
52
+
53
+ Then /^sequence (\S+) of block (\d+) has (\w.*?) (\d+)$/ do |chr, i, method, num|
54
+ seq = @blocks[i.to_i].sequences.find { |seq| seq.source == chr }
55
+ method_sym = method.gsub(/ /, '_').to_sym
56
+ seq.send(method_sym).should == num.to_i
57
+ end
58
+
59
+ Then /^sequence (\S+) of block (\d+) has (\w.*?) :(\S+)$/ do |chr, i, method, sym_s|
60
+ seq = @blocks[i.to_i].sequences.find { |seq| seq.source == chr }
61
+ method_sym = method.gsub(/ /, '_').to_sym
62
+ seq.send(method_sym).should == sym_s.to_sym
63
+ end
@@ -0,0 +1,31 @@
1
+ When /^filter for only the species$/ do |table|
2
+ # table is a Cucumber::Ast::Table
3
+ sp = table.raw.collect { |row| row[0] }
4
+ @parser.sequence_filter = { :only_species => sp }
5
+ end
6
+
7
+ When /^filter for blocks with the species$/ do |table|
8
+ # table is a Cucumber::Ast::Table
9
+ sp = table.raw.collect { |row| row[0] }
10
+ @block_filter = { :with_all_species => sp }
11
+ end
12
+
13
+ When /^filter for blocks with at least (\d+) sequences$/ do |n|
14
+ @block_filter = { :at_least_n_sequences => n.to_i }
15
+ end
16
+
17
+ When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
18
+ constraint = case op
19
+ when 'least' then :min_size
20
+ when 'most' then :max_size
21
+ else raise "bad operator #{op}!"
22
+ end
23
+ @block_filter = { constraint => len.to_i}
24
+ end
25
+
26
+ When /^filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
27
+ @block_filter = {
28
+ :min_size => min.to_i,
29
+ :max_size => max.to_i
30
+ }
31
+ end
@@ -0,0 +1,14 @@
1
+ #require 'bio-ucsc-api'
2
+
3
+ Given /^I have a region with start (\d+) and end (\d+)$/ do |r_start, r_end|
4
+ @r_start = r_start.to_i
5
+ @r_end = r_end.to_i
6
+ end
7
+
8
+ When /^I compute the smallest containing bin$/ do
9
+ @bin = Bio::Ucsc::UcscBin.bin_from_range(@r_start, @r_end)
10
+ end
11
+
12
+ Then /^the bin should be (\d+)$/ do |expected_bin|
13
+ @bin.should == expected_bin.to_i
14
+ end
@@ -0,0 +1,16 @@
1
+ unless ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java'
2
+ begin
3
+ require 'simplecov'
4
+ rescue LoadError
5
+ $stderr.puts "WARNING: could not require 'simplecov': #{$!}"
6
+ end
7
+ end
8
+
9
+ require 'pathname'
10
+ require 'tempfile'
11
+
12
+ $LOAD_PATH << File.expand_path('../../../lib', __FILE__)
13
+
14
+ require 'bio-maf'
15
+
16
+ $test_data = Pathname.new 'test/data'
@@ -0,0 +1,24 @@
1
+ Feature: Computation of UCSC bins
2
+ In order to efficiently use indexes
3
+ We will use the UCSC bin indexing system
4
+ Per http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
5
+
6
+ Scenario Outline: Compute smallest containing bin
7
+ Given I have a region with start <Start> and end <End>
8
+ When I compute the smallest containing bin
9
+ Then the bin should be <Bin>
10
+
11
+ Examples:
12
+ | Start | End | Bin |
13
+ | 25079603 | 25079787 | 776 |
14
+ | 25128173 | 25128248 | 776 |
15
+ | 50312474 | 50312703 | 968 |
16
+ | 41905591 | 41906101 | 904 |
17
+ | 16670899 | 16673060 | 712 |
18
+ | 75495356 | 75495494 | 1160 |
19
+ | 92259501 | 92261053 | 1288 |
20
+ | 83834063 | 83838132 | 1224 |
21
+ | 7309597 | 7310411 | 640 |
22
+ | 6190410 | 6190999 | 632 |
23
+ # from https://github.com/polyatail/biopython/blob/af34c033d78c4c72dffbb500e513e568a2ba5e29/Tests/test_MafIO_index.py#L48
24
+