bio-maf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio/maf/index.rb +620 -0
  38. data/lib/bio/maf/parser.rb +888 -0
  39. data/lib/bio/maf/struct.rb +63 -0
  40. data/lib/bio/maf/writer.rb +63 -0
  41. data/lib/bio/maf.rb +4 -0
  42. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  43. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio-maf/maf.rb +3 -0
  46. data/lib/bio-maf.rb +12 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +181 -0
@@ -0,0 +1,75 @@
1
+ @milestone_3
2
+ Feature: Filter results from MAF files
3
+ In order to work with only relevant data from a MAF file
4
+ Such as only species recognized by PhyloCSF
5
+ I want to filter the results of MAF queries
6
+
7
+ Scenario: Return only specified species
8
+ Given MAF data:
9
+ """
10
+ ##maf version=1
11
+ a score=10542.0
12
+ s mm8.chr7 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
13
+ s rn4.chr1 136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
14
+ s oryCun1.scaffold_199771 14021 43 - 75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
15
+ s hg18.chr15 88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
16
+ s panTro2.chr15 87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
17
+ s rheMac2.chr7 69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
18
+ s canFam2.chr3 56030570 39 + 94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
19
+ s dasNov1.scaffold_106893 7435 34 + 9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
20
+ s loxAfr1.scaffold_8298 30264 38 + 78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
21
+ s echTel1.scaffold_304651 594 37 - 10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
22
+ """
23
+ When I open it with a MAF reader
24
+ And filter for only the species
25
+ | hg18 |
26
+ | mm8 |
27
+ | rheMac2 |
28
+ Then an alignment block can be obtained
29
+ And the alignment block has 3 sequences
30
+
31
+ Scenario: Return only blocks having all specified species
32
+ Given a MAF source file "mm8_chr7_tiny.maf"
33
+ When I open it with a MAF reader
34
+ And build an index on the reference sequence
35
+ And filter for blocks with the species
36
+ | panTro2 |
37
+ | loxAfr1 |
38
+ And search for blocks between positions 80082471 and 80082730 of mm8.chr7
39
+ Then 1 block is obtained
40
+
41
+ Scenario: Return only blocks having a certain number of sequences
42
+ Given a MAF source file "mm8_chr7_tiny.maf"
43
+ When I open it with a MAF reader
44
+ And build an index on the reference sequence
45
+ And filter for blocks with at least 6 sequences
46
+ And search for blocks between positions 80082767 and 80083008 of mm8.chr7
47
+ Then 1 block is obtained
48
+
49
+ # sizes present:
50
+ # 55 64 128 148 157 163 165 192
51
+
52
+ Scenario: Return blocks with a minimum text size
53
+ Given a MAF source file "mm8_chr7_tiny.maf"
54
+ When I open it with a MAF reader
55
+ And build an index on the reference sequence
56
+ And filter for blocks with text size at least 150
57
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
58
+ Then 4 blocks are obtained
59
+
60
+ Scenario: Return blocks with a maximum text size
61
+ Given a MAF source file "mm8_chr7_tiny.maf"
62
+ When I open it with a MAF reader
63
+ And build an index on the reference sequence
64
+ And filter for blocks with text size at most 72
65
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
66
+ Then 2 blocks are obtained
67
+
68
+ Scenario: Return blocks within a text size range
69
+ Given a MAF source file "mm8_chr7_tiny.maf"
70
+ When I open it with a MAF reader
71
+ And build an index on the reference sequence
72
+ And filter for blocks with text size between 72 and 160
73
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
74
+ Then 3 blocks are obtained
75
+
@@ -0,0 +1,50 @@
1
+ Feature: Convert MAF file to FASTA
2
+ In order to use multiple alignment data with other tools
3
+ I want to read a Multiple Alignment Format (MAF) file and write out its data as FASTA
4
+
5
+ Scenario: Convert simple MAF file
6
+ Given a MAF source file "t1.maf"
7
+ When I select FASTA output
8
+ And I open it with a MAF reader
9
+ And process the file
10
+ Then the output should match "t1.fasta"
11
+
12
+ Scenario: Convert simple MAF data
13
+ Given MAF data:
14
+ """
15
+ ##maf version=1 scoring=humor.v4
16
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
17
+ # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
18
+
19
+ a score=0.128
20
+ s human_hoxa 100 8 + 100257 ACA-TTACT
21
+ s horse_hoxa 120 9 - 98892 ACAATTGCT
22
+ s fugu_hoxa 88 7 + 90788 ACA--TGCT
23
+
24
+
25
+ a score=0.071
26
+ s human_unc 9077 8 + 10998 ACAGTATT
27
+ # Comment
28
+ s horse_unc 4555 6 - 5099 ACA--ATT
29
+ s fugu_unc 4000 4 + 4038 AC----TT
30
+ """
31
+ When I select FASTA output
32
+ And I open it with a MAF reader
33
+ And process the file
34
+ Then the output should be:
35
+ """
36
+ >human_hoxa:100-108
37
+ ACA-TTACT
38
+ >horse_hoxa:120-129
39
+ ACAATTGCT
40
+ >fugu_hoxa:88-95
41
+ ACA--TGCT
42
+ >human_unc:9077-9085
43
+ ACAGTATT
44
+ >horse_unc:4555-4561
45
+ ACA--ATT
46
+ >fugu_unc:4000-4004
47
+ AC----TT
48
+
49
+ """
50
+
@@ -0,0 +1,45 @@
1
+ require 'bigbio' # FASTA support
2
+
3
+ Given /^a MAF source file "(.*?)"$/ do |src|
4
+ @src_f = $test_data + src
5
+ @src_f.exist?.should be_true
6
+ end
7
+
8
+ Given /^MAF data:$/ do |string| # Writes the inline MAF text from the scenario to a temp file kept in @src_f.
9
+ @src_f = Tempfile.new(['rspec', '.maf']) # NOTE(review): @src_f is a Tempfile here, while the "a MAF source file" step stores the result of $test_data + src; whatever consumes @src_f must accept both.
10
+ @src_f.write(string)
11
+ @src_f.close # Closed after writing; the Tempfile object itself is kept in @src_f.
12
+ end
13
+
14
+ When /^I select FASTA output$/ do
15
+ @dst = Tempfile.new(['cuke', ".#{@out_fmt.to_s}"])
16
+ @dst.close
17
+ @writer = FastaWriter.new(@dst.path)
18
+ end
19
+
20
+ When /^process the file$/ do
21
+ @parser.each_block do |block|
22
+ block.each_raw_seq do |seq|
23
+ seq.write_fasta(@writer)
24
+ end
25
+ end
26
+ @writer.close
27
+ end
28
+
29
+ Then /^the output should match "(.*?)"$/ do |ref| # Compares the full contents of the output file with a reference file under $test_data.
30
+ ref_p = $test_data + ref
31
+ ref_p.exist?.should be_true # Fail early if the reference fixture is missing.
32
+ #system("diff #{ref} #{@dst.path} >/dev/null 2>&1").should be_true
33
+ File.read(@dst.path).should == File.read(ref_p)
34
+ end
35
+
36
+ Then /^the output should be:$/ do |string|
37
+ File.read(@dst.path).should == string
38
+ end
39
+
40
+ After do # Scenario teardown: removes the temp output file when a step created one.
41
+ if @dst # Only set by steps that open an output file.
42
+ @dst.close
43
+ @dst.unlink
44
+ end
45
+ end
@@ -0,0 +1,20 @@
1
+ When /^build an index on the reference sequence$/ do
2
+ @idx = Bio::MAF::KyotoIndex.build(@parser, '%')
3
+ end
4
+
5
+ Given /^a Kyoto Cabinet index file "(.*?)"$/ do |name|
6
+ @idx = Bio::MAF::KyotoIndex.open($test_data + name)
7
+ end
8
+
9
+ Then /^the index has at least (\d+) entries$/ do |size_spec|
10
+ @idx.db.count.should be >= size_spec.to_i
11
+ end
12
+
13
+ When /^search for blocks between positions (\d+) and (\d+) of (\S+)$/ do |i_start, i_end, chr| # Queries the index for a single interval and stores the results in @blocks.
14
+ int = Bio::GenomicInterval.zero_based(chr, i_start.to_i, i_end.to_i)
15
+ @blocks = @idx.find([int], @parser, @block_filter).to_a # @block_filter is nil unless an earlier step in the scenario set it.
16
+ end
17
+
18
+ Then /^(\d+) blocks? (?:is|are) obtained$/ do |num|
19
+ @blocks.size.should == num.to_i
20
+ end
@@ -0,0 +1,27 @@
1
+ When /^open a new MAF writer$/ do
2
+ @dst = Tempfile.new(["cuke", ".maf"])
3
+ @writer = Bio::MAF::Writer.new(@dst)
4
+ end
5
+
6
+ When /^write the header from the original MAF file$/ do
7
+ @writer.write_header(@parser.header)
8
+ end
9
+
10
+ When /^write all the parsed blocks$/ do
11
+ @writer.write_blocks(@parser.parse_blocks)
12
+ end
13
+
14
+ RSpec::Matchers.define :match_except_ws do |expected| # Custom matcher: runs diff on two file paths with --ignore-space-change.
15
+ match do |actual|
16
+ system("diff --ignore-space-change --brief #{expected} #{actual} >/dev/null 2>&1") # NOTE(review): both paths are interpolated into a shell command unquoted; a path containing spaces or shell metacharacters would break this.
17
+ end
18
+
19
+ failure_message_for_should do |actual|
20
+ msg = "File contents did not match. Diff:\n"
21
+ msg << `diff --unified --ignore-space-change #{expected} #{actual}` # Runs diff again to capture unified output for the failure message; same unquoted interpolation as above.
22
+ end
23
+ end
24
+
25
+ Then /^the output should match, except whitespace, "(.+)"$/ do |ref|
26
+ @dst.path.should match_except_ws($test_data + ref)
27
+ end
@@ -0,0 +1,63 @@
1
+ When /^I open it with a MAF reader$/ do
2
+ @parser = Bio::MAF::Parser.new(@src_f, @opts || {})
3
+ end
4
+
5
+ When /^I enable the :(\S+) parser option$/ do |opt_s|
6
+ @opts ||= {}
7
+ @opts[opt_s.to_sym] = true
8
+ end
9
+
10
+ Then /^the MAF version should be "(.*?)"$/ do |v_spec|
11
+ @parser.header.version.to_s.should == v_spec
12
+ end
13
+
14
+ Then /^the scoring scheme should be "(.*?)"$/ do |s_spec|
15
+ @parser.header.scoring.should == s_spec
16
+ end
17
+
18
+ Then /^the alignment parameters should be "(.*?)"$/ do |a_spec|
19
+ @parser.header.alignment_params.should == a_spec
20
+ end
21
+
22
+ Then /^an alignment block can be obtained$/ do
23
+ @block = @parser.parse_block
24
+ @block.should_not be_nil
25
+ end
26
+
27
+ Then /^the alignment block has (\d+) sequences$/ do |n_seq|
28
+ @block.sequences.size.should == n_seq.to_i
29
+ end
30
+
31
+ Then /^sequence (\d+) has (\w.*?) "(.*?)"$/ do |i, method, str|
32
+ method_sym = method.gsub(/ /, '_').to_sym
33
+ @block.raw_seq(i.to_i).send(method_sym).should == str
34
+ end
35
+
36
+ Then /^sequence (\d+) has (\w.*?) (\d+)\s*$/ do |i, method, num|
37
+ method_sym = method.gsub(/ /, '_').to_sym
38
+ @block.raw_seq(i.to_i).send(method_sym).should == num.to_i
39
+ end
40
+
41
+ Then /^sequence (\d+) has (\w.*?) :(\S+)\s*$/ do |i, method, sym_s|
42
+ method_sym = method.gsub(/ /, '_').to_sym
43
+ value_sym = sym_s.to_sym
44
+ @block.raw_seq(i.to_i).send(method_sym).should == value_sym
45
+ end
46
+
47
+ Then /^sequence (\S+) of block (\d+) has (\w.*?) "(.*?)"$/ do |chr, i, method, str| # Checks a string-valued attribute of the sequence in @blocks[i] whose source equals chr.
48
+ seq = @blocks[i.to_i].sequences.find { |seq| seq.source == chr } # NOTE(review): the block parameter shadows the outer local seq; find yields nil when no source matches, which would make the send below raise.
49
+ method_sym = method.gsub(/ /, '_').to_sym # Spaces in the step phrase become underscores to form the method name.
50
+ seq.send(method_sym).should == str
51
+ end
52
+
53
+ Then /^sequence (\S+) of block (\d+) has (\w.*?) (\d+)$/ do |chr, i, method, num| # Checks an integer-valued attribute of the sequence in @blocks[i] whose source equals chr.
54
+ seq = @blocks[i.to_i].sequences.find { |seq| seq.source == chr } # NOTE(review): the block parameter shadows the outer local seq; find yields nil when no source matches.
55
+ method_sym = method.gsub(/ /, '_').to_sym # Spaces in the step phrase become underscores to form the method name.
56
+ seq.send(method_sym).should == num.to_i
57
+ end
58
+
59
+ Then /^sequence (\S+) of block (\d+) has (\w.*?) :(\S+)$/ do |chr, i, method, sym_s| # Checks a symbol-valued attribute of the sequence in @blocks[i] whose source equals chr.
60
+ seq = @blocks[i.to_i].sequences.find { |seq| seq.source == chr } # NOTE(review): the block parameter shadows the outer local seq; find yields nil when no source matches.
61
+ method_sym = method.gsub(/ /, '_').to_sym # Spaces in the step phrase become underscores to form the method name.
62
+ seq.send(method_sym).should == sym_s.to_sym
63
+ end
@@ -0,0 +1,31 @@
1
+ When /^filter for only the species$/ do |table|
2
+ # table is a Cucumber::Ast::Table
3
+ sp = table.raw.collect { |row| row[0] }
4
+ @parser.sequence_filter = { :only_species => sp }
5
+ end
6
+
7
+ When /^filter for blocks with the species$/ do |table|
8
+ # table is a Cucumber::Ast::Table
9
+ sp = table.raw.collect { |row| row[0] }
10
+ @block_filter = { :with_all_species => sp }
11
+ end
12
+
13
+ When /^filter for blocks with at least (\d+) sequences$/ do |n|
14
+ @block_filter = { :at_least_n_sequences => n.to_i }
15
+ end
16
+
17
+ When /^filter for blocks with text size at (least|most) (\d+)$/ do |op, len|
18
+ constraint = case op
19
+ when 'least' then :min_size
20
+ when 'most' then :max_size
21
+ else raise "bad operator #{op}!"
22
+ end
23
+ @block_filter = { constraint => len.to_i}
24
+ end
25
+
26
+ When /^filter for blocks with text size between (\d+) and (\d+)$/ do |min, max|
27
+ @block_filter = {
28
+ :min_size => min.to_i,
29
+ :max_size => max.to_i
30
+ }
31
+ end
@@ -0,0 +1,14 @@
1
+ #require 'bio-ucsc-api'
2
+
3
+ Given /^I have a region with start (\d+) and end (\d+)$/ do |r_start, r_end|
4
+ @r_start = r_start.to_i
5
+ @r_end = r_end.to_i
6
+ end
7
+
8
+ When /^I compute the smallest containing bin$/ do
9
+ @bin = Bio::Ucsc::UcscBin.bin_from_range(@r_start, @r_end)
10
+ end
11
+
12
+ Then /^the bin should be (\d+)$/ do |expected_bin|
13
+ @bin.should == expected_bin.to_i
14
+ end
@@ -0,0 +1,16 @@
1
+ unless ENV.has_key?('TRAVIS') || RUBY_PLATFORM == 'java' # Coverage is attempted only when TRAVIS is unset and RUBY_PLATFORM is not 'java'.
2
+ begin
3
+ require 'simplecov'
4
+ rescue LoadError
5
+ $stderr.puts "WARNING: could not require 'simplecov': #{$!}" # simplecov is optional: warn and carry on without coverage.
6
+ end
7
+ end
8
+
9
+ require 'pathname'
10
+ require 'tempfile'
11
+
12
+ $LOAD_PATH << File.expand_path('../../../lib', __FILE__) # Adds the lib directory, resolved relative to this file, to the load path.
13
+
14
+ require 'bio-maf'
15
+
16
+ $test_data = Pathname.new 'test/data' # NOTE(review): relative path, so it resolves against the current working directory - presumably the project root; confirm.
@@ -0,0 +1,24 @@
1
+ Feature: Computation of UCSC bins
2
+ In order to efficiently use indexes
3
+ We will use the UCSC bin indexing system
4
+ Per http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
5
+
6
+ Scenario Outline: Compute smallest containing bin
7
+ Given I have a region with start <Start> and end <End>
8
+ When I compute the smallest containing bin
9
+ Then the bin should be <Bin>
10
+
11
+ Examples:
12
+ | Start | End | Bin |
13
+ | 25079603 | 25079787 | 776 |
14
+ | 25128173 | 25128248 | 776 |
15
+ | 50312474 | 50312703 | 968 |
16
+ | 41905591 | 41906101 | 904 |
17
+ | 16670899 | 16673060 | 712 |
18
+ | 75495356 | 75495494 | 1160 |
19
+ | 92259501 | 92261053 | 1288 |
20
+ | 83834063 | 83838132 | 1224 |
21
+ | 7309597 | 7310411 | 640 |
22
+ | 6190410 | 6190999 | 632 |
23
+ # from https://github.com/polyatail/biopython/blob/af34c033d78c4c72dffbb500e513e568a2ba5e29/Tests/test_MafIO_index.py#L48
24
+