bio-maf 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ n = 1000000
6
+ data = <<EOF
7
+ a score=28680.000000
8
+ s hg19.chr22 16050711 61 + 51304566 atctccaagagggcataaaacac-tgagtaaacagctcttttatatgtgtttcctggatgag
9
+ s panTro2.chrUn 7681110 59 + 58616431 atctccaagagggcataaaacac-tgagtaaacagctctt--atatgtgtttcctggatgag
10
+ q panTro2.chrUn 99999999999999999999999-9999999999999999--99999999999999999999
11
+ i panTro2.chrUn C 0 C 0
12
+ s tarSyr1.scaffold_75923 2859 50 - 8928 atctccaagagggctgaaaatgc-caaatga-----------tcacacgtttcctggacaag
13
+ q tarSyr1.scaffold_75923 79295966999999999999998-9999799-----------99999999997657759999
14
+ i tarSyr1.scaffold_75923 N 0 C 0
15
+ s micMur1.scaffold_22105 5493 59 - 10683 acctccgagagggctcaaaacgc-cgagtgatcagctctt--atgcgcgtttcctggacgag
16
+ q micMur1.scaffold_22105 99999999999999999999999-9999999999999999--99999999999999999999
17
+ i micMur1.scaffold_22105 C 0 C 0
18
+ s tupBel1.scaffold_3803.1-85889 33686 61 + 85889 ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactggatggg
19
+ q tupBel1.scaffold_3803.1-85889 79648579699867994997775679665662767577569-69987455976776322888
20
+ i tupBel1.scaffold_3803.1-85889 I 1 C 0
21
+ s vicPac1.scaffold_12713 6831 55 - 10681 actgccatgggggctcagcgtac-tgaatggttaattact------gtggtccccgaatgag
22
+ q vicPac1.scaffold_12713 99999999999999999999999-9999999999999999------9999999999999999
23
+ EOF
24
+
25
+ Benchmark.bmbm do |x|
26
+ x.report("split/each") do
27
+ n.times do
28
+ i = 0
29
+ data.split("\n").each do |line|
30
+ i += line.size
31
+ end
32
+ end
33
+ end
34
+ x.report("until/shift") do
35
+ n.times do
36
+ i = 0
37
+ lines = data.split("\n")
38
+ until lines.empty?
39
+ line = lines.shift
40
+ i += line.size
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ FILE = ARGV[0] || '/Users/csw/maf/chr22.maf' # file read by every report below; first command-line argument overrides the built-in default path
6
+
7
+ Benchmark.bm do |x|
8
+ x.report("8k") do
9
+ File.open(FILE) do |f|
10
+ while true
11
+ r = f.read(8192)
12
+ break unless r
13
+ end
14
+ end
15
+ end
16
+ x.report("128k") do
17
+ File.open(FILE) do |f|
18
+ while true
19
+ r = f.read(128 * 1024)
20
+ break unless r
21
+ end
22
+ end
23
+ end
24
+ x.report("1M") do
25
+ File.open(FILE) do |f|
26
+ while true
27
+ r = f.read(1024 * 1024)
28
+ break unless r
29
+ end
30
+ end
31
+ end
32
+ x.report("8M") do
33
+ File.open(FILE) do |f|
34
+ while true
35
+ r = f.read(8 * 1024 * 1024)
36
+ break unless r
37
+ end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ class Thing # minimal object wrapping a single value, used as the element type of the array being sorted
6
+ attr_reader :part # the value the sort blocks compare on
7
+
8
+ def initialize(part) # part: value stored as-is and exposed through #part
9
+ @part = part
10
+ end
11
+ end
12
+
13
+ prng = Random.new
14
+ v_max = 1 << 31
15
+ ary = []
16
+ 1000.times do
17
+ ary << Thing.new(rand(v_max))
18
+ end
19
+
20
+ Benchmark.bmbm do |x|
21
+ x.report("sort!") do
22
+ 1000.times do
23
+ ary2 = ary.dup
24
+ ary2.sort! { |a, b| a.part <=> b.part }
25
+ end
26
+ end
27
+ x.report("sort_by!") do
28
+ 1000.times do
29
+ ary2 = ary.dup
30
+ ary2.sort_by! { |i| i.part }
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ n = 2000000
6
+ line = 's tupBel1.scaffold_3803.1-85889 33686 61 + 85889 ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactggatggg'
7
+
8
+ Benchmark.bmbm do |x|
9
+ x.report("basic String#split") do
10
+ n.times do
11
+ parts = line.split
12
+ end
13
+ end
14
+ x.report("regex split") do
15
+ n.times do
16
+ parts = line.split(/\s+/)
17
+ end
18
+ end
19
+ x.report("regex fields") do
20
+ n.times do
21
+ if m = /^s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\S+)/.match(line)
22
+ parts = m.captures
23
+ end
24
+ end
25
+ end
26
+ x.report("regex fields") do
27
+ n.times do
28
+ if m = /^s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\S+)/.match(line)
29
+ parts = m.captures
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio-maf'
4
+ require 'bigbio'
5
+ require 'optparse'
6
+ require 'ostruct'
7
+
8
+ options = OpenStruct.new
9
+ options.parser = Bio::MAF::Parser
10
+ options.reader = Bio::MAF::ChunkReader
11
+
12
+ PRINTERS = {
13
+ 'flat' => :FlatPrinter,
14
+ 'stack' => :CallStackPrinter
15
+ }
16
+
17
+ OptionParser.new do |opts|
18
+ opts.banner = "Usage: maf_count [options] <maf>"
19
+ opts.separator ""
20
+ opts.separator "Options:"
21
+ opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
22
+ options.prof = prof
23
+ end
24
+ opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
25
+ if pspec =~ /(\w+):(.+)/
26
+ require 'ruby-prof'
27
+ options.ruby_prof_printer = RubyProf.const_get(PRINTERS.fetch($1))
28
+ options.ruby_prof_path = $2
29
+ else
30
+ options.ruby_prof_printer = RubyProf::FlatPrinter
31
+ options.ruby_prof_path = pspec
32
+ end
33
+ end
34
+ opts.on("--profile-gc", "Profile GC") do |prof|
35
+ options.profile_gc = true
36
+ end
37
+ opts.on("--parser PARSER", "parser") do |name|
38
+ options.parser = Bio::MAF.const_get(name)
39
+ end
40
+ opts.on("-t", "--threaded") do
41
+ options.reader = Bio::MAF::ThreadedChunkReader
42
+ end
43
+ end.parse!(ARGV)
44
+
45
+ src_path = ARGV.shift
46
+
47
+ if options.prof
48
+ require 'perftools'
49
+ PerfTools::CpuProfiler.start(options.prof)
50
+ elsif options.ruby_prof_path
51
+ require 'ruby-prof'
52
+ RubyProf.start
53
+ end
54
+
55
+ if options.profile_gc
56
+ GC::Profiler.enable
57
+ end
58
+
59
+ parser = options.parser.new(src_path,
60
+ :chunk_reader => options.reader,
61
+ :parse_extended => false)
62
+
63
+ n = 0
64
+ parser.each_block do |block|
65
+ n += 1
66
+ end
67
+ puts "Parsed #{n} MAF alignment blocks."
68
+
69
+ if options.profile_gc
70
+ $stderr.puts GC::Profiler.result
71
+ GC::Profiler.disable
72
+ end
73
+
74
+ if options.prof
75
+ PerfTools::CpuProfiler.stop
76
+ elsif options.ruby_prof_path
77
+ res = RubyProf.stop
78
+ printer = options.ruby_prof_printer.new(res)
79
+ File.open(options.ruby_prof_path, 'w') do |f|
80
+ printer.print(f)
81
+ end
82
+ end
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio-maf'
4
+ require 'bigbio'
5
+ require 'optparse'
6
+ require 'ostruct'
7
+
8
+ options = OpenStruct.new
9
+ options.parser = Bio::MAF::Parser
10
+
11
+ OptionParser.new do |opts|
12
+ opts.banner = "Usage: maf_dump_blocks [options] <maf>"
13
+ opts.separator ""
14
+ opts.separator "Options:"
15
+ opts.on("--parser PARSER", "parser") do |name|
16
+ options.parser = Bio::MAF.const_get(name)
17
+ end
18
+ end.parse!(ARGV)
19
+
20
+ src_path = ARGV.shift
21
+
22
+ parser = options.parser.new(src_path)
23
+
24
+ parser.each_block do |block|
25
+ $stdout.printf("%12d\t%7d\n", block.offset, block.size)
26
+ end
27
+
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+ require 'ostruct'
5
+
6
+ require 'bio-maf'
7
+ require 'bio-genomic-interval'
8
+
9
+ options = OpenStruct.new
10
+ options.p = { :threads => 1 }
11
+ options.passes = 1
12
+
13
+ OptionParser.new do |opts|
14
+ opts.banner = "Usage: maf_extract_ranges_count [options] <maf> <index>"
15
+ opts.separator ""
16
+ opts.separator "Options:"
17
+ opts.on("-t", "--threads N", "Parser threads") do |n|
18
+ options.p[:threads] = n.to_i
19
+ end
20
+ opts.on("-p", "--passes N", "Number of passes") do |n|
21
+ options.passes = n.to_i
22
+ end
23
+ end.parse!(ARGV)
24
+
25
+ maf_p = ARGV.shift
26
+ index_p = ARGV.shift
27
+
28
+ parser = Bio::MAF::Parser.new(maf_p, options.p)
29
+ index = Bio::MAF::KyotoIndex.open(index_p)
30
+
31
+ def parse_interval(line) # builds a Bio::GenomicInterval from one whitespace-separated line of input
32
+ src, r_start_s, r_end_s, _ = line.split(nil, 4) # first three fields are source name, start and end; whatever follows is discarded
33
+ r_start = r_start_s.to_i # to_i yields 0 for missing or non-numeric text instead of raising
34
+ r_end = r_end_s.to_i
35
+ return Bio::GenomicInterval.zero_based(src, r_start, r_end) # NOTE(review): presumably interprets start/end as zero-based coordinates; confirm against bio-genomic-interval
36
+ end
37
+
38
+ intervals = []
39
+ $stdin.each_line { |line| intervals << parse_interval(line) }
40
+
41
+ options.passes.times do
42
+ blocks = index.find(intervals, parser)
43
+ puts "TOTAL: #{blocks.count} blocks parsed."
44
+ end
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+ require 'bio-maf'
5
+ require 'optparse'
6
+ require 'ostruct'
7
+
8
+ PRINTERS = {
9
+ 'flat' => :FlatPrinter,
10
+ 'stack' => :CallStackPrinter,
11
+ 'graph' => :GraphHtmlPrinter
12
+ }
13
+
14
+ $options = OpenStruct.new
15
+ $options.mode = :build
16
+ $options.reader = Bio::MAF::ChunkReader
17
+
18
+ def build_index(maf, index) # creates a parser for maf, passes it with index to KyotoIndex.build, then closes the result
19
+ parser = Bio::MAF::Parser.new(maf,
20
+ :chunk_reader => $options.reader, # reader class held in the global $options (ChunkReader unless --threaded was given)
21
+ :parse_extended => false)
22
+ idx = Bio::MAF::KyotoIndex.build(parser, index) # NOTE(review): index is presumably the path of the index to create; confirm against KyotoIndex.build
23
+ idx.close
24
+ end
25
+
26
+ op = OptionParser.new do |opts|
27
+ opts.banner = "Usage: maf_index [options] <maf> <index>"
28
+ #opts.separator ""
29
+ #opts.separator "Options:"
30
+ opts.on("--time", "print elapsed time") do
31
+ $options.bench = true
32
+ end
33
+ opts.on("-d", "--dump") do
34
+ $options.mode = :dump
35
+ end
36
+ opts.on("-t", "--threaded") do
37
+ $options.reader = Bio::MAF::ThreadedChunkReader
38
+ end
39
+ opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
40
+ require 'ruby-prof'
41
+ if pspec =~ /(\w+):(.+)/
42
+ $options.ruby_prof_printer = RubyProf.const_get(PRINTERS.fetch($1))
43
+ $options.ruby_prof_path = $2
44
+ else
45
+ $options.ruby_prof_printer = Ruby_Prof::FlatPrinter
46
+ $options.ruby_prof_path = pspec
47
+ end
48
+ end
49
+ end
50
+
51
+ op.parse!(ARGV)
52
+
53
+ maf_p = ARGV.shift if $options.mode == :build
54
+ index_p = ARGV.shift
55
+
56
+ unless (maf_p || $options.mode == :dump) && index_p
57
+ $stderr.puts op
58
+ exit 1
59
+ end
60
+
61
+ if $options.ruby_prof_path
62
+ RubyProf.start
63
+ end
64
+
65
+ case $options.mode
66
+ when :build
67
+ if ! $options.bench
68
+ build_index(maf_p, index_p)
69
+ else
70
+ bm_res = Benchmark.measure do
71
+ build_index(maf_p, index_p)
72
+ end
73
+ puts bm_res
74
+ end
75
+ when :dump
76
+ idx = Bio::MAF::KyotoIndex.open(index_p)
77
+ idx.dump
78
+ else
79
+ raise "Unsupported mode: #{$options.mode}"
80
+ end
81
+
82
+ if $options.ruby_prof_path
83
+ res = RubyProf.stop
84
+ printer = $options.ruby_prof_printer.new(res)
85
+ File.open($options.ruby_prof_path, 'w') do |f|
86
+ printer.print(f)
87
+ end
88
+ end
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+ require 'bio-maf'
5
+ require 'optparse'
6
+ require 'ostruct'
7
+
8
+ options = OpenStruct.new
9
+ options.parser = Bio::MAF::Parser
10
+ options.runs = 100_000
11
+ options.warmup = false
12
+
13
+ PRINTERS = {
14
+ 'flat' => :FlatPrinter,
15
+ 'stack' => :CallStackPrinter
16
+ }
17
+
18
+ OptionParser.new do |opts|
19
+ opts.banner = "Usage: maf_parse_bench [options] <maf>"
20
+ opts.separator ""
21
+ opts.separator "Options:"
22
+ opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
23
+ options.prof = prof
24
+ end
25
+ opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
26
+ if pspec =~ /(\w+):(.+)/
27
+ require 'ruby-prof'
28
+ options.ruby_prof_printer = RubyProf.const_get(PRINTERS.fetch($1))
29
+ options.ruby_prof_path = $2
30
+ else
31
+ options.ruby_prof_printer = :FlatPrinter
32
+ options.ruby_prof_path = pspec
33
+ end
34
+ end
35
+ opts.on("--profile-gc", "Profile GC") do |prof|
36
+ options.profile_gc = true
37
+ end
38
+ opts.on("--parser PARSER", "parser") do |name|
39
+ options.parser = Bio::MAF.const_get(name)
40
+ end
41
+ opts.on("-w", "--warmup", "perform warmup run") do
42
+ options.warmup = true
43
+ end
44
+ end.parse!(ARGV)
45
+
46
+ src_path = ARGV.shift
47
+
48
+ if options.prof
49
+ require 'perftools'
50
+ PerfTools::CpuProfiler.start(options.prof)
51
+ elsif options.ruby_prof_path
52
+ require 'ruby-prof'
53
+ RubyProf.start
54
+ end
55
+
56
+ if options.profile_gc
57
+ GC::Profiler.enable
58
+ end
59
+
60
+ parser = options.parser.new(src_path)
61
+ parser.parse_block
62
+ parser.parse_block
63
+ pos = parser.s.pos
64
+
65
+ if options.warmup
66
+ options.runs.times do
67
+ parser.parse_block
68
+ parser.s.pos = pos
69
+ end
70
+ end
71
+
72
+ bm_res = Benchmark.measure do
73
+ options.runs.times do
74
+ parser.parse_block
75
+ parser.s.pos = pos
76
+ end
77
+ end
78
+
79
+ if options.profile_gc
80
+ $stderr.puts GC::Profiler.result
81
+ GC::Profiler.disable
82
+ end
83
+
84
+ if options.prof
85
+ PerfTools::CpuProfiler.stop
86
+ elsif options.ruby_prof_path
87
+ res = RubyProf.stop
88
+ printer = options.ruby_prof_printer.new(res)
89
+ File.open(options.ruby_prof_path, 'w') do |f|
90
+ printer.print(f)
91
+ end
92
+ end
93
+
94
+ puts bm_res / options.runs