bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,68 @@
#!/usr/bin/env ruby

# maf_to_fasta: read a MAF file block by block and write every raw
# sequence of every block to a FASTA file.
#
# Usage: maf_to_fasta [options] <maf> <fasta>
#
# Optional profiling hooks (PerfTools, ruby-prof, GC::Profiler) wrap the
# conversion. The profiler libraries are only required when the matching
# command-line option is given.

require 'bio-maf'
require 'bigbio'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.parser = Bio::MAF::Parser

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: maf_to_fasta [options] <maf> <fasta>"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
    options.prof = prof
  end
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |path|
    options.ruby_prof = path
  end
  opts.on("--profile-gc", "Profile GC") do
    options.profile_gc = true
  end
  opts.on("--parser PARSER", "parser") do |name|
    # Looks up the parser class by constant name inside Bio::MAF.
    options.parser = Bio::MAF.const_get(name)
  end
end
option_parser.parse!(ARGV)

src_path = ARGV.shift
dst_path = ARGV.shift

# Both positional arguments are mandatory; print the usage text to stderr
# and exit non-zero instead of passing nil paths to the parser and writer.
abort(option_parser.help) unless src_path && dst_path

# At most one CPU profiler is started; PerfTools takes precedence.
if options.prof
  require 'perftools'
  PerfTools::CpuProfiler.start(options.prof)
elsif options.ruby_prof
  require 'ruby-prof'
  RubyProf.start
end

GC::Profiler.enable if options.profile_gc

parser = options.parser.new(src_path)
writer = FastaWriter.new(dst_path)

begin
  parser.each_block do |block|
    block.each_raw_seq do |seq|
      seq.write_fasta(writer)
    end
  end
ensure
  # Close the output even when parsing fails part-way through.
  writer.close
end

if options.profile_gc
  $stderr.puts GC::Profiler.result
  GC::Profiler.disable
end

if options.prof
  PerfTools::CpuProfiler.stop
elsif options.ruby_prof
  res = RubyProf.stop
  printer = RubyProf::FlatPrinter.new(res)
  File.open(options.ruby_prof, 'w') do |f|
    printer.print(f)
  end
end
@@ -0,0 +1,84 @@
#!/usr/bin/env ruby

# maf_write: parse a MAF file and write its header and all parsed blocks
# back out to standard output through Bio::MAF::Writer.
#
# Usage: maf_write [options] <maf>
#
# Optional profiling hooks (PerfTools, ruby-prof, GC::Profiler) wrap the
# run. The ruby-prof option accepts either a bare output path (flat
# printer) or "PRINTER:PATH", where PRINTER is a key of PRINTERS.

require 'bio-maf'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.parser = Bio::MAF::Parser
options.opts = {
  :chunk_reader => Bio::MAF::ChunkReader,
  :parse_extended => false
}

# Maps the printer names accepted on the command line to the names of the
# corresponding RubyProf printer classes.
PRINTERS = {
  'flat' => :FlatPrinter,
  'stack' => :CallStackPrinter
}.freeze

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: maf_write [options] <maf>"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
    options.prof = prof
  end
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
    # Load ruby-prof up front: both the "PRINTER:PATH" form and the bare
    # path form reference RubyProf constants below.
    require 'ruby-prof'
    printer_name = 'flat'
    path = pspec
    # Anchored so that only a leading "word:" prefix is read as a printer
    # name; a colon elsewhere in the path is left alone.
    if pspec =~ /\A(\w+):(.+)\z/
      printer_name = $1
      path = $2
    end
    unless PRINTERS.key?(printer_name)
      raise OptionParser::InvalidArgument,
            "unknown ruby-prof printer '#{printer_name}' " \
            "(expected one of: #{PRINTERS.keys.join(', ')})"
    end
    options.ruby_prof_printer = RubyProf.const_get(PRINTERS[printer_name])
    options.ruby_prof_path = path
  end
  opts.on("--profile-gc", "Profile GC") do
    options.profile_gc = true
  end
  opts.on("--parser PARSER", "parser") do |name|
    # Looks up the parser class by constant name inside Bio::MAF.
    options.parser = Bio::MAF.const_get(name)
  end
  opts.on("-t", "--threaded") do
    options.opts[:chunk_reader] = Bio::MAF::ThreadedChunkReader
    options.opts[:threads] = 1
  end
  opts.on("-e", "--extended") do
    options.opts[:parse_extended] = true
    options.opts[:parse_empty] = true
  end
end
option_parser.parse!(ARGV)

src_path = ARGV.shift

# The input path is mandatory; print the usage text to stderr and exit
# non-zero instead of handing nil to the parser.
abort(option_parser.help) unless src_path

# At most one CPU profiler is started; PerfTools takes precedence.
if options.prof
  require 'perftools'
  PerfTools::CpuProfiler.start(options.prof)
elsif options.ruby_prof_path
  require 'ruby-prof'
  RubyProf.start
end

GC::Profiler.enable if options.profile_gc

parser = options.parser.new(src_path, options.opts)
writer = Bio::MAF::Writer.new($stdout)
writer.write_header(parser.header)
writer.write_blocks(parser.parse_blocks)

if options.profile_gc
  $stderr.puts GC::Profiler.result
  GC::Profiler.disable
end

if options.prof
  PerfTools::CpuProfiler.stop
elsif options.ruby_prof_path
  res = RubyProf.stop
  printer = options.ruby_prof_printer.new(res)
  File.open(options.ruby_prof_path, 'w') do |f|
    printer.print(f)
  end
end
@@ -0,0 +1,35 @@
#!/usr/bin/env ruby

# random_ranges: print NUM randomly placed intervals of length LEN inside
# START:END, one per line, as tab-separated
#
#   <sequence> <start> <end> x
#
# The range is divided into NUM equal slots and one interval is placed at
# a random offset inside each slot, so intervals never overlap.

require 'optparse'
require 'ostruct'

options = OpenStruct.new

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: random_ranges [options]"
  opts.on("-r", "--range START:END", "range") do |range|
    unless range =~ /\A(\d+):(\d+)\z/
      raise OptionParser::InvalidArgument, "expected START:END, got '#{range}'"
    end
    options.start = $1.to_i
    options.end = $2.to_i
  end
  # Integer coercion makes OptionParser reject non-numeric values instead
  # of silently turning them into 0.
  opts.on("-l", "--length LEN", Integer, "block length") do |len|
    options.length = len
  end
  opts.on("-n", "--number NUM", Integer, "number of blocks") do |num|
    options.num = num
  end
  opts.on("-s", "--sequence SEQ", "sequence") do |seq|
    options.seq = seq
  end
end
option_parser.parse!(ARGV)

# --range, --length and --number are needed for the arithmetic below;
# report what is missing rather than failing on nil.
missing = []
missing << "--range" unless options.start && options.end
missing << "--length" unless options.length
missing << "--number" unless options.num
unless missing.empty?
  abort("Missing required option(s): #{missing.join(', ')}\n\n#{option_parser.help}")
end

abort("--number must be greater than zero") unless options.num > 0

span = options.end - options.start
slot_size = span / options.num
# Number of distinct start offsets available inside one slot.
offset_count = slot_size - options.length
unless offset_count > 0
  abort("Range #{options.start}:#{options.end} is too small: each of the " \
        "#{options.num} slots is #{slot_size} long, which must exceed the " \
        "block length #{options.length}")
end

rng = Random.new
options.num.times do |n|
  offset = rng.rand(offset_count)
  b_start = options.start + (slot_size * n) + offset
  b_end = b_start + options.length
  puts "#{options.seq}\t#{b_start}\t#{b_end}\tx"
end
@@ -0,0 +1,31 @@
1
+ @milestone_2
2
+ Feature: Indexed access to MAF files
3
+ In order to extract alignment blocks from MAF files
4
+ By chromosomal ranges matching a source sequence
5
+ I want to have a way to build indexes on MAF files
6
+ And use indexes to efficiently find alignment blocks
7
+ Because linear searches of a 200 GB file are impractical
8
+
9
+ Scenario: Index a MAF file
10
+ Given a MAF source file "mm8_chr7_tiny.maf"
11
+ When I open it with a MAF reader
12
+ And build an index on the reference sequence
13
+ Then the index has at least 8 entries
14
+
15
+ Scenario: Extract alignment blocks by chromosomal range
16
+ Given a MAF source file "mm8_chr7_tiny.maf"
17
+ When I open it with a MAF reader
18
+ And build an index on the reference sequence
19
+ And search for blocks between positions 80082592 and 80082766 of mm8.chr7
20
+ Then 2 blocks are obtained
21
+ And sequence mm8.chr7 of block 0 has start 80082592
22
+ And sequence mm8.chr7 of block 1 has start 80082713
23
+
24
+ Scenario: Extract alignment blocks by chromosomal range from index file
25
+ Given a MAF source file "mm8_chr7_tiny.maf"
26
+ And a Kyoto Cabinet index file "mm8_chr7_tiny.kct"
27
+ When I open it with a MAF reader
28
+ And search for blocks between positions 80082592 and 80082766 of mm8.chr7
29
+ Then 2 blocks are obtained
30
+ And sequence mm8.chr7 of block 0 has start 80082592
31
+ And sequence mm8.chr7 of block 1 has start 80082713
@@ -0,0 +1,29 @@
1
+ Feature: MAF output
2
+ In order to output modified MAF files or subsets of them
3
+ I want to be able to write out parsed MAF data
4
+
5
+ Scenario: Reproduce simple test data
6
+ Given a MAF source file "mm8_single.maf"
7
+ When I open it with a MAF reader
8
+ And open a new MAF writer
9
+ And write the header from the original MAF file
10
+ And write all the parsed blocks
11
+ Then the output should match, except whitespace, "mm8_single.maf"
12
+
13
+ Scenario: Reproduce longer test data
14
+ Given a MAF source file "mm8_chr7_tiny.maf"
15
+ When I open it with a MAF reader
16
+ And open a new MAF writer
17
+ And write the header from the original MAF file
18
+ And write all the parsed blocks
19
+ Then the output should match, except whitespace, "mm8_chr7_tiny.maf"
20
+
21
+ Scenario: Reproduce test data with i, e, q lines
22
+ Given a MAF source file "chr22_ieq.maf"
23
+ When I enable the :parse_extended parser option
24
+ And I enable the :parse_empty parser option
25
+ And I open it with a MAF reader
26
+ And open a new MAF writer
27
+ And write the header from the original MAF file
28
+ And write all the parsed blocks
29
+ Then the output should match, except whitespace, "chr22_ieq.maf"
@@ -0,0 +1,44 @@
1
+ Feature: Parse MAF files
2
+ In order to extract information from a MAF file
3
+ I want to read it and pull out information
4
+
5
+ Scenario: Read MAF header
6
+ Given MAF data:
7
+ """
8
+ ##maf version=1 scoring=humor.v4
9
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet25/chr22.maf /cluster/data/hg15/bed/blastz.rn3/axtNet25/chr22.maf
10
+
11
+ a score=0.128
12
+ s human_hoxa 100 8 + 100257 ACA-TTACT
13
+ s horse_hoxa 120 9 - 98892 ACAATTGCT
14
+ s fugu_hoxa 88 7 + 90788 ACA--TGCT
15
+ """
16
+ When I open it with a MAF reader
17
+ Then the MAF version should be "1"
18
+ And the scoring scheme should be "humor.v4"
19
+ # third line a continuation
20
+ And the alignment parameters should be "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet25/chr22.maf /cluster/data/hg15/bed/blastz.rn3/axtNet25/chr22.maf"
21
+
22
+ Scenario: Read alignment block
23
+ Given MAF data:
24
+ """
25
+ ##maf version=1 scoring=humor.v4
26
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
27
+ # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
28
+
29
+ a score=0.128
30
+ s human_hoxa 100 8 + 100257 ACA-TTACT
31
+ s horse_hoxa 120 9 - 98892 ACAATTGCT
32
+ s fugu_hoxa 88 7 + 90788 ACA--TGCT
33
+ """
34
+ When I open it with a MAF reader
35
+ Then an alignment block can be obtained
36
+ And the alignment block has 3 sequences
37
+ And sequence 0 has source "human_hoxa"
38
+ And sequence 0 has start 100
39
+ And sequence 0 has size 8
40
+ And sequence 0 has strand :+
41
+ And sequence 0 has source size 100257
42
+ And sequence 0 has text "ACA-TTACT"
43
+ And sequence 1 has strand :-
44
+
@@ -0,0 +1,75 @@
1
+ @milestone_3
2
+ Feature: Filter results from MAF files
3
+ In order to work with only relevant data from a MAF file
4
+ Such as only species recognized by PhyloCSF
5
+ I want to filter the results of MAF queries
6
+
7
+ Scenario: Return only specified species
8
+ Given MAF data:
9
+ """
10
+ ##maf version=1
11
+ a score=10542.0
12
+ s mm8.chr7 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
13
+ s rn4.chr1 136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
14
+ s oryCun1.scaffold_199771 14021 43 - 75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
15
+ s hg18.chr15 88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
16
+ s panTro2.chr15 87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
17
+ s rheMac2.chr7 69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
18
+ s canFam2.chr3 56030570 39 + 94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
19
+ s dasNov1.scaffold_106893 7435 34 + 9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
20
+ s loxAfr1.scaffold_8298 30264 38 + 78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
21
+ s echTel1.scaffold_304651 594 37 - 10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
22
+ """
23
+ When I open it with a MAF reader
24
+ And filter for only the species
25
+ | hg18 |
26
+ | mm8 |
27
+ | rheMac2 |
28
+ Then an alignment block can be obtained
29
+ And the alignment block has 3 sequences
30
+
31
+ Scenario: Return only blocks having all specified species
32
+ Given a MAF source file "mm8_chr7_tiny.maf"
33
+ When I open it with a MAF reader
34
+ And build an index on the reference sequence
35
+ And filter for blocks with the species
36
+ | panTro2 |
37
+ | loxAfr1 |
38
+ And search for blocks between positions 80082471 and 80082730 of mm8.chr7
39
+ Then 1 block is obtained
40
+
41
+ Scenario: Return only blocks having a certain number of sequences
42
+ Given a MAF source file "mm8_chr7_tiny.maf"
43
+ When I open it with a MAF reader
44
+ And build an index on the reference sequence
45
+ And filter for blocks with at least 6 sequences
46
+ And search for blocks between positions 80082767 and 80083008 of mm8.chr7
47
+ Then 1 block is obtained
48
+
49
+ # sizes present:
50
+ # 55 64 128 148 157 163 165 192
51
+
52
+ Scenario: Return blocks with at least a minimum text size
53
+ Given a MAF source file "mm8_chr7_tiny.maf"
54
+ When I open it with a MAF reader
55
+ And build an index on the reference sequence
56
+ And filter for blocks with text size at least 150
57
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
58
+ Then 4 blocks are obtained
59
+
60
+ Scenario: Return blocks with at most a maximum text size
61
+ Given a MAF source file "mm8_chr7_tiny.maf"
62
+ When I open it with a MAF reader
63
+ And build an index on the reference sequence
64
+ And filter for blocks with text size at most 72
65
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
66
+ Then 2 blocks are obtained
67
+
68
+ Scenario: Return blocks within a text size range
69
+ Given a MAF source file "mm8_chr7_tiny.maf"
70
+ When I open it with a MAF reader
71
+ And build an index on the reference sequence
72
+ And filter for blocks with text size between 72 and 160
73
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
74
+ Then 3 blocks are obtained
75
+
@@ -0,0 +1,50 @@
1
+ Feature: Convert MAF file to FASTA
2
+ In order to use multiple alignment data with other tools
3
+ I want to read a Multiple Alignment Format (MAF) file and write out its data as FASTA
4
+
5
+ Scenario: Convert simple MAF file
6
+ Given a MAF source file "t1.maf"
7
+ When I select FASTA output
8
+ And I open it with a MAF reader
9
+ And process the file
10
+ Then the output should match "t1.fasta"
11
+
12
+ Scenario: Convert simple MAF data
13
+ Given MAF data:
14
+ """
15
+ ##maf version=1 scoring=humor.v4
16
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
17
+ # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
18
+
19
+ a score=0.128
20
+ s human_hoxa 100 8 + 100257 ACA-TTACT
21
+ s horse_hoxa 120 9 - 98892 ACAATTGCT
22
+ s fugu_hoxa 88 7 + 90788 ACA--TGCT
23
+
24
+
25
+ a score=0.071
26
+ s human_unc 9077 8 + 10998 ACAGTATT
27
+ # Comment
28
+ s horse_unc 4555 6 - 5099 ACA--ATT
29
+ s fugu_unc 4000 4 + 4038 AC----TT
30
+ """
31
+ When I select FASTA output
32
+ And I open it with a MAF reader
33
+ And process the file
34
+ Then the output should be:
35
+ """
36
+ >human_hoxa:100-108
37
+ ACA-TTACT
38
+ >horse_hoxa:120-129
39
+ ACAATTGCT
40
+ >fugu_hoxa:88-95
41
+ ACA--TGCT
42
+ >human_unc:9077-9085
43
+ ACAGTATT
44
+ >horse_unc:4555-4561
45
+ ACA--ATT
46
+ >fugu_unc:4000-4004
47
+ AC----TT
48
+
49
+ """
50
+
@@ -0,0 +1,45 @@
# Cucumber step definitions for converting MAF input to FASTA output.
#
# State shared between steps:
#   @src_f  - the MAF input (a path under $test_data, or a Tempfile)
#   @dst    - Tempfile receiving the FASTA output
#   @writer - FastaWriter writing to @dst's path
#   @parser - set by a step defined outside this file

require 'bigbio' # FASTA support
require 'tempfile'

# Uses a file from the shared test-data directory as the MAF input.
Given /^a MAF source file "(.*?)"$/ do |src|
  @src_f = $test_data + src
  @src_f.exist?.should be_true
end

# Writes the inline MAF text to a temporary file and uses that as the input.
Given /^MAF data:$/ do |string|
  @src_f = Tempfile.new(['rspec', '.maf'])
  @src_f.write(string)
  @src_f.close
end

# Creates the temporary output file and a FastaWriter on its path.
When /^I select FASTA output$/ do
  # @out_fmt is not assigned anywhere in this file; fall back to "fasta"
  # so the temp file does not end in a bare "." when no other step sets it.
  suffix = ".#{@out_fmt || 'fasta'}"
  @dst = Tempfile.new(['cuke', suffix])
  # Only the path is needed; the writer opens the file itself.
  @dst.close
  @writer = FastaWriter.new(@dst.path)
end

# Writes every raw sequence of every parsed block as FASTA, then closes
# the writer.
When /^process the file$/ do
  @parser.each_block do |block|
    block.each_raw_seq do |seq|
      seq.write_fasta(@writer)
    end
  end
  @writer.close
end

# Compares the generated output with a reference file from the test data.
Then /^the output should match "(.*?)"$/ do |ref|
  ref_p = $test_data + ref
  ref_p.exist?.should be_true
  File.read(@dst.path).should == File.read(ref_p)
end

# Compares the generated output with the inline expected text.
Then /^the output should be:$/ do |string|
  File.read(@dst.path).should == string
end

# Removes the temporary output file, if one was created.
After do
  # Tempfile#close! closes the file and unlinks it in one call.
  @dst.close! if @dst
end