bio-maf 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,68 @@
#!/usr/bin/env ruby

# maf_to_fasta: reads a MAF file with the selected Bio::MAF parser class and
# writes every raw sequence of every alignment block to a FASTA file.
#
# Usage: maf_to_fasta [options] <maf> <fasta>
#
# Optional flags turn on PerfTools, ruby-prof or GC profiling for the run.

require 'bio-maf'
require 'bigbio'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.parser = Bio::MAF::Parser

op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_to_fasta [options] <maf> <fasta>"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
    options.prof = prof
  end
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |path|
    options.ruby_prof = path
  end
  opts.on("--profile-gc", "Profile GC") do
    options.profile_gc = true
  end
  opts.on("--parser PARSER", "parser") do |name|
    # Looks up a parser class by name inside the Bio::MAF namespace.
    options.parser = Bio::MAF.const_get(name)
  end
end

begin
  op.parse!(ARGV)
rescue OptionParser::ParseError => e
  # Report bad options with the usage text instead of a backtrace.
  abort "#{e.message}\n\n#{op}"
end

src_path = ARGV.shift
dst_path = ARGV.shift

# Both positional arguments are required; without this check a nil path
# would be handed to the parser / writer and fail with an unrelated error.
abort op.to_s unless src_path && dst_path

# CPU profiling: PerfTools takes precedence over ruby-prof when both are given.
if options.prof
  require 'perftools'
  PerfTools::CpuProfiler.start(options.prof)
elsif options.ruby_prof
  require 'ruby-prof'
  RubyProf.start
end

GC::Profiler.enable if options.profile_gc

parser = options.parser.new(src_path)
# NOTE(review): FastaWriter is presumably provided by bigbio — confirm.
writer = FastaWriter.new(dst_path)

begin
  parser.each_block do |block|
    block.each_raw_seq do |seq|
      seq.write_fasta(writer)
    end
  end
ensure
  # Close the output even when parsing or writing raises part-way through.
  writer.close
end

if options.profile_gc
  $stderr.puts GC::Profiler.result
  GC::Profiler.disable
end

if options.prof
  PerfTools::CpuProfiler.stop
elsif options.ruby_prof
  res = RubyProf.stop
  printer = RubyProf::FlatPrinter.new(res)
  # The flat profile report goes to the path given with --ruby-prof.
  File.open(options.ruby_prof, 'w') do |f|
    printer.print(f)
  end
end
@@ -0,0 +1,84 @@
#!/usr/bin/env ruby

# maf_write: parses a MAF file and writes its header and all parsed blocks
# back out to standard output through Bio::MAF::Writer.
#
# Usage: maf_write [options] <maf>
#
# --ruby-prof accepts either a plain output path (flat printer) or a
# "printer:path" spec, where printer is one of the PRINTERS keys below.

require 'bio-maf'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.parser = Bio::MAF::Parser
options.opts = {
  :chunk_reader => Bio::MAF::ChunkReader,
  :parse_extended => false
}

# Short printer names accepted in a --ruby-prof "printer:path" spec, mapped
# to the names of the RubyProf printer classes.
PRINTERS = {
  'flat' => :FlatPrinter,
  'stack' => :CallStackPrinter
}.freeze

op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_write [options] <maf>"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
    options.prof = prof
  end
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
    # Load ruby-prof before either branch: both reference RubyProf constants,
    # and the plain-path branch previously hit an uninitialized constant.
    require 'ruby-prof'
    # Anchored so that only a leading "name:" is treated as a printer name.
    spec = /\A(\w+):(.+)\z/.match(pspec)
    if spec
      printer_name = PRINTERS.fetch(spec[1]) do
        raise OptionParser::InvalidArgument,
              "unknown printer '#{spec[1]}' (expected one of: #{PRINTERS.keys.join(', ')})"
      end
      options.ruby_prof_printer = RubyProf.const_get(printer_name)
      options.ruby_prof_path = spec[2]
    else
      options.ruby_prof_printer = RubyProf::FlatPrinter
      options.ruby_prof_path = pspec
    end
  end
  opts.on("--profile-gc", "Profile GC") do
    options.profile_gc = true
  end
  opts.on("--parser PARSER", "parser") do |name|
    # Looks up a parser class by name inside the Bio::MAF namespace.
    options.parser = Bio::MAF.const_get(name)
  end
  opts.on("-t", "--threaded") do
    options.opts[:chunk_reader] = Bio::MAF::ThreadedChunkReader
    options.opts[:threads] = 1
  end
  opts.on("-e", "--extended") do
    options.opts[:parse_extended] = true
    options.opts[:parse_empty] = true
  end
end

begin
  op.parse!(ARGV)
rescue OptionParser::ParseError => e
  # Report bad options with the usage text instead of a backtrace.
  abort "#{e.message}\n\n#{op}"
end

src_path = ARGV.shift

# The input file is required; fail with the usage text rather than passing
# nil to the parser.
abort op.to_s unless src_path

# CPU profiling: PerfTools takes precedence over ruby-prof when both are given.
if options.prof
  require 'perftools'
  PerfTools::CpuProfiler.start(options.prof)
elsif options.ruby_prof_path
  require 'ruby-prof'
  RubyProf.start
end

GC::Profiler.enable if options.profile_gc

parser = options.parser.new(src_path, options.opts)
writer = Bio::MAF::Writer.new($stdout)
writer.write_header(parser.header)
writer.write_blocks(parser.parse_blocks)

if options.profile_gc
  $stderr.puts GC::Profiler.result
  GC::Profiler.disable
end

if options.prof
  PerfTools::CpuProfiler.stop
elsif options.ruby_prof_path
  res = RubyProf.stop
  printer = options.ruby_prof_printer.new(res)
  # The profile report goes to the path given with --ruby-prof.
  File.open(options.ruby_prof_path, 'w') do |f|
    printer.print(f)
  end
end
@@ -0,0 +1,35 @@
#!/usr/bin/env ruby

# random_ranges: prints NUM random fixed-length ranges inside START:END, one
# per line, as tab-separated "sequence, start, end, x" records.
#
# The overall range is divided into NUM equal slots and one range of LEN
# positions is placed at a random offset inside each slot, so the generated
# ranges are in ascending order and never overlap.

require 'optparse'
require 'ostruct'

# Builds the list of random ranges.
#
# @param seq [String, nil] sequence name copied into every record
# @param range_start [Integer] start of the overall range
# @param range_end [Integer] end of the overall range
# @param length [Integer] length of each generated range
# @param num [Integer] number of ranges (and slots) to generate
# @param rng [Random] random number source; pass a seeded one for
#   reproducible output
# @return [Array<Array>] one [seq, start, end] triple per slot
# @raise [ArgumentError] if num is not positive, length is negative, or a
#   range of the given length does not fit inside one slot
def random_ranges(seq, range_start, range_end, length, num, rng = Random.new)
  raise ArgumentError, "number of blocks must be positive (got #{num})" unless num > 0
  raise ArgumentError, "block length must not be negative (got #{length})" if length < 0

  slot_size = (range_end - range_start) / num
  # Exclusive upper bound for the random offset of a range within its slot.
  max_offset = slot_size - length
  unless max_offset > 0
    raise ArgumentError,
          "block length #{length} does not fit into #{num} slot(s) of size #{slot_size}"
  end

  (0...num).map do |n|
    b_start = range_start + (slot_size * n) + rng.rand(max_offset)
    [seq, b_start, b_start + length]
  end
end

# Parses the command line and prints the generated ranges to standard output.
# With no arguments at all the usage text is printed instead.
#
# @param argv [Array<String>] command-line arguments (consumed by parsing)
# @return [void]
def main(argv)
  options = OpenStruct.new

  op = OptionParser.new do |opts|
    opts.banner = "Usage: random_ranges [options]"
    opts.on("-r", "--range START:END", "range") do |range|
      bounds = /\A(-?\d+):(-?\d+)\z/.match(range)
      # Reject malformed specs instead of silently reading them as 0.
      raise OptionParser::InvalidArgument, range unless bounds
      options.range_start = bounds[1].to_i
      options.range_end = bounds[2].to_i
    end
    opts.on("-l", "--length LEN", OptionParser::DecimalInteger, "block length") do |len|
      options.length = len
    end
    opts.on("-n", "--number NUM", OptionParser::DecimalInteger, "number of blocks") do |num|
      options.num = num
    end
    opts.on("-s", "--sequence SEQ", "sequence") do |seq|
      options.seq = seq
    end
  end

  if argv.empty?
    puts op
    return
  end

  begin
    op.parse!(argv)
  rescue OptionParser::ParseError => e
    abort "#{e.message}\n\n#{op}"
  end

  required = {
    "--range" => options.range_start,
    "--length" => options.length,
    "--number" => options.num
  }
  missing = required.select { |_, value| value.nil? }.keys
  abort "missing required option(s): #{missing.join(', ')}\n\n#{op}" unless missing.empty?

  begin
    rows = random_ranges(options.seq, options.range_start, options.range_end,
                         options.length, options.num)
  rescue ArgumentError => e
    abort "random_ranges: #{e.message}"
  end

  rows.each do |seq, b_start, b_end|
    puts "#{seq}\t#{b_start}\t#{b_end}\tx"
  end
end

main(ARGV)
@@ -0,0 +1,31 @@
1
+ @milestone_2
2
+ Feature: Indexed access to MAF files
3
+ In order to extract alignment blocks from MAF files
4
+ By chromosomal ranges matching a source sequence
5
+ I want to have a way to build indexes on MAF files
6
+ And use indexes to efficiently find alignment blocks
7
+ Because linear searches of a 200 GB file are impractical
8
+
9
+ Scenario: Index a MAF file
10
+ Given a MAF source file "mm8_chr7_tiny.maf"
11
+ When I open it with a MAF reader
12
+ And build an index on the reference sequence
13
+ Then the index has at least 8 entries
14
+
15
+ Scenario: Extract alignment blocks by chromosomal range
16
+ Given a MAF source file "mm8_chr7_tiny.maf"
17
+ When I open it with a MAF reader
18
+ And build an index on the reference sequence
19
+ And search for blocks between positions 80082592 and 80082766 of mm8.chr7
20
+ Then 2 blocks are obtained
21
+ And sequence mm8.chr7 of block 0 has start 80082592
22
+ And sequence mm8.chr7 of block 1 has start 80082713
23
+
24
+ Scenario: Extract alignment blocks by chromosomal range from index file
25
+ Given a MAF source file "mm8_chr7_tiny.maf"
26
+ And a Kyoto Cabinet index file "mm8_chr7_tiny.kct"
27
+ When I open it with a MAF reader
28
+ And search for blocks between positions 80082592 and 80082766 of mm8.chr7
29
+ Then 2 blocks are obtained
30
+ And sequence mm8.chr7 of block 0 has start 80082592
31
+ And sequence mm8.chr7 of block 1 has start 80082713
@@ -0,0 +1,29 @@
1
+ Feature: MAF output
2
+ In order to output modified MAF files or subsets of them
3
+ I want to be able to write out parsed MAF data
4
+
5
+ Scenario: Reproduce simple test data
6
+ Given a MAF source file "mm8_single.maf"
7
+ When I open it with a MAF reader
8
+ And open a new MAF writer
9
+ And write the header from the original MAF file
10
+ And write all the parsed blocks
11
+ Then the output should match, except whitespace, "mm8_single.maf"
12
+
13
+ Scenario: Reproduce longer test data
14
+ Given a MAF source file "mm8_chr7_tiny.maf"
15
+ When I open it with a MAF reader
16
+ And open a new MAF writer
17
+ And write the header from the original MAF file
18
+ And write all the parsed blocks
19
+ Then the output should match, except whitespace, "mm8_chr7_tiny.maf"
20
+
21
+ Scenario: Reproduce test data with i, e, q lines
22
+ Given a MAF source file "chr22_ieq.maf"
23
+ When I enable the :parse_extended parser option
24
+ And I enable the :parse_empty parser option
25
+ And I open it with a MAF reader
26
+ And open a new MAF writer
27
+ And write the header from the original MAF file
28
+ And write all the parsed blocks
29
+ Then the output should match, except whitespace, "chr22_ieq.maf"
@@ -0,0 +1,44 @@
1
+ Feature: Parse MAF files
2
+ In order to extract information from a MAF file
3
+ I want to read it and pull out information
4
+
5
+ Scenario: Read MAF header
6
+ Given MAF data:
7
+ """
8
+ ##maf version=1 scoring=humor.v4
9
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet25/chr22.maf /cluster/data/hg15/bed/blastz.rn3/axtNet25/chr22.maf
10
+
11
+ a score=0.128
12
+ s human_hoxa 100 8 + 100257 ACA-TTACT
13
+ s horse_hoxa 120 9 - 98892 ACAATTGCT
14
+ s fugu_hoxa 88 7 + 90788 ACA--TGCT
15
+ """
16
+ When I open it with a MAF reader
17
+ Then the MAF version should be "1"
18
+ And the scoring scheme should be "humor.v4"
19
+ # third line a continuation
20
+ And the alignment parameters should be "humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet25/chr22.maf /cluster/data/hg15/bed/blastz.rn3/axtNet25/chr22.maf"
21
+
22
+ Scenario: Read alignment block
23
+ Given MAF data:
24
+ """
25
+ ##maf version=1 scoring=humor.v4
26
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
27
+ # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
28
+
29
+ a score=0.128
30
+ s human_hoxa 100 8 + 100257 ACA-TTACT
31
+ s horse_hoxa 120 9 - 98892 ACAATTGCT
32
+ s fugu_hoxa 88 7 + 90788 ACA--TGCT
33
+ """
34
+ When I open it with a MAF reader
35
+ Then an alignment block can be obtained
36
+ And the alignment block has 3 sequences
37
+ And sequence 0 has source "human_hoxa"
38
+ And sequence 0 has start 100
39
+ And sequence 0 has size 8
40
+ And sequence 0 has strand :+
41
+ And sequence 0 has source size 100257
42
+ And sequence 0 has text "ACA-TTACT"
43
+ And sequence 1 has strand :-
44
+
@@ -0,0 +1,75 @@
1
+ @milestone_3
2
+ Feature: Filter results from MAF files
3
+ In order to work with only relevant data from a MAF file
4
+ Such as only species recognized by PhyloCSF
5
+ I want to filter the results of MAF queries
6
+
7
+ Scenario: Return only specified species
8
+ Given MAF data:
9
+ """
10
+ ##maf version=1
11
+ a score=10542.0
12
+ s mm8.chr7 80082334 34 + 145134094 GGGCTGAGGGC--AGGGATGG---AGGGCGGTCC--------------CAGCA-
13
+ s rn4.chr1 136011785 34 + 267910886 GGGCTGAGGGC--AGGGACGG---AGGGCGGTCC--------------CAGCA-
14
+ s oryCun1.scaffold_199771 14021 43 - 75077 -----ATGGGC--AAGCGTGG---AGGGGAACCTCTCCTCCCCTCCGACAAAG-
15
+ s hg18.chr15 88557580 27 + 100338915 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
16
+ s panTro2.chr15 87959837 27 + 100063422 --------GGC--AAGTGTGGA--AGGGAAGCCC--------------CAGAA-
17
+ s rheMac2.chr7 69864714 28 + 169801366 -------GGGC--AAGTATGGA--AGGGAAGCCC--------------CAGAA-
18
+ s canFam2.chr3 56030570 39 + 94715083 AGGTTTAGGGCAGAGGGATGAAGGAGGAGAATCC--------------CTATG-
19
+ s dasNov1.scaffold_106893 7435 34 + 9831 GGAACGAGGGC--ATGTGTGG---AGGGGGCTGC--------------CCACA-
20
+ s loxAfr1.scaffold_8298 30264 38 + 78952 ATGATGAGGGG--AAGCGTGGAGGAGGGGAACCC--------------CTAGGA
21
+ s echTel1.scaffold_304651 594 37 - 10007 -TGCTATGGCT--TTGTGTCTAGGAGGGGAATCC--------------CCAGGA
22
+ """
23
+ When I open it with a MAF reader
24
+ And filter for only the species
25
+ | hg18 |
26
+ | mm8 |
27
+ | rheMac2 |
28
+ Then an alignment block can be obtained
29
+ And the alignment block has 3 sequences
30
+
31
+ Scenario: Return only blocks having all specified species
32
+ Given a MAF source file "mm8_chr7_tiny.maf"
33
+ When I open it with a MAF reader
34
+ And build an index on the reference sequence
35
+ And filter for blocks with the species
36
+ | panTro2 |
37
+ | loxAfr1 |
38
+ And search for blocks between positions 80082471 and 80082730 of mm8.chr7
39
+ Then 1 block is obtained
40
+
41
+ Scenario: Return only blocks having a certain number of sequences
42
+ Given a MAF source file "mm8_chr7_tiny.maf"
43
+ When I open it with a MAF reader
44
+ And build an index on the reference sequence
45
+ And filter for blocks with at least 6 sequences
46
+ And search for blocks between positions 80082767 and 80083008 of mm8.chr7
47
+ Then 1 block is obtained
48
+
49
+ # sizes present:
50
+ # 55 64 128 148 157 163 165 192
51
+
52
+ Scenario: Return blocks with a minimum text size
53
+ Given a MAF source file "mm8_chr7_tiny.maf"
54
+ When I open it with a MAF reader
55
+ And build an index on the reference sequence
56
+ And filter for blocks with text size at least 150
57
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
58
+ Then 4 blocks are obtained
59
+
60
+ Scenario: Return blocks with a maximum text size
61
+ Given a MAF source file "mm8_chr7_tiny.maf"
62
+ When I open it with a MAF reader
63
+ And build an index on the reference sequence
64
+ And filter for blocks with text size at most 72
65
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
66
+ Then 2 blocks are obtained
67
+
68
+ Scenario: Return blocks within a text size range
69
+ Given a MAF source file "mm8_chr7_tiny.maf"
70
+ When I open it with a MAF reader
71
+ And build an index on the reference sequence
72
+ And filter for blocks with text size between 72 and 160
73
+ And search for blocks between positions 0 and 80100000 of mm8.chr7
74
+ Then 3 blocks are obtained
75
+
@@ -0,0 +1,50 @@
1
+ Feature: Convert MAF file to FASTA
2
+ In order to use multiple alignment data with other tools
3
+ I want to read a Multiple Alignment Format (MAF) file and write out its data as FASTA
4
+
5
+ Scenario: Convert simple MAF file
6
+ Given a MAF source file "t1.maf"
7
+ When I select FASTA output
8
+ And I open it with a MAF reader
9
+ And process the file
10
+ Then the output should match "t1.fasta"
11
+
12
+ Scenario: Convert simple MAF data
13
+ Given MAF data:
14
+ """
15
+ ##maf version=1 scoring=humor.v4
16
+ # humor.v4 R=30 M=10 /cluster/data/hg15/bed/blastz.mm3/axtNet300/chr1.maf
17
+ # /cluster/data/hg15/bed/blastz.rn3/axtNet300/chr1.maf
18
+
19
+ a score=0.128
20
+ s human_hoxa 100 8 + 100257 ACA-TTACT
21
+ s horse_hoxa 120 9 - 98892 ACAATTGCT
22
+ s fugu_hoxa 88 7 + 90788 ACA--TGCT
23
+
24
+
25
+ a score=0.071
26
+ s human_unc 9077 8 + 10998 ACAGTATT
27
+ # Comment
28
+ s horse_unc 4555 6 - 5099 ACA--ATT
29
+ s fugu_unc 4000 4 + 4038 AC----TT
30
+ """
31
+ When I select FASTA output
32
+ And I open it with a MAF reader
33
+ And process the file
34
+ Then the output should be:
35
+ """
36
+ >human_hoxa:100-108
37
+ ACA-TTACT
38
+ >horse_hoxa:120-129
39
+ ACAATTGCT
40
+ >fugu_hoxa:88-95
41
+ ACA--TGCT
42
+ >human_unc:9077-9085
43
+ ACAGTATT
44
+ >horse_unc:4555-4561
45
+ ACA--ATT
46
+ >fugu_unc:4000-4004
47
+ AC----TT
48
+
49
+ """
50
+
@@ -0,0 +1,45 @@
# Cucumber step definitions for converting MAF input to FASTA output.

require 'bigbio' # FASTA support
require 'tempfile' # Tempfile backs the inline MAF data and the FASTA output

# Points @src_f at an existing file under $test_data.
Given /^a MAF source file "(.*?)"$/ do |src|
  @src_f = $test_data + src
  @src_f.exist?.should be_true
end

# Stores MAF data given inline in the feature in a temporary .maf file.
Given /^MAF data:$/ do |string|
  @src_f = Tempfile.new(['rspec', '.maf'])
  @src_f.write(string)
  @src_f.close
end

# Creates a temporary output file and a FastaWriter writing to its path.
When /^I select FASTA output$/ do
  # NOTE(review): @out_fmt is not assigned anywhere in this file; if it is
  # still nil here the temp file suffix is just "." — confirm whether another
  # step file sets it.
  @dst = Tempfile.new(['cuke', ".#{@out_fmt.to_s}"])
  @dst.close
  @writer = FastaWriter.new(@dst.path)
end

# Writes every raw sequence of every block from @parser to the FASTA writer.
When /^process the file$/ do
  @parser.each_block do |block|
    block.each_raw_seq do |seq|
      seq.write_fasta(@writer)
    end
  end
  @writer.close
end

# Compares the generated output with a reference file under $test_data.
Then /^the output should match "(.*?)"$/ do |ref|
  ref_p = $test_data + ref
  ref_p.exist?.should be_true
  #system("diff #{ref} #{@dst.path} >/dev/null 2>&1").should be_true
  File.read(@dst.path).should == File.read(ref_p)
end

# Compares the generated output with text given inline in the feature.
Then /^the output should be:$/ do |string|
  File.read(@dst.path).should == string
end

# Removes the temporary files created during the scenario.
After do
  if @dst
    @dst.close
    @dst.unlink
  end
  # @src_f is a Tempfile only when the scenario used inline "MAF data:";
  # source files taken from $test_data must never be deleted.
  if @src_f.is_a?(Tempfile)
    @src_f.close
    @src_f.unlink
  end
end