bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,44 @@
#!/usr/bin/env ruby

# Micro-benchmark comparing two ways of walking the lines of an in-memory
# MAF alignment block:
#   * "split/each"  - split on newlines and iterate with Array#each
#   * "until/shift" - split on newlines and drain the array with Array#shift
# Each variant sums the line lengths so the loop body does some work.

require 'benchmark'

iterations = 1000000
maf_text = <<MAF_BLOCK
a score=28680.000000
s hg19.chr22 16050711 61 + 51304566 atctccaagagggcataaaacac-tgagtaaacagctcttttatatgtgtttcctggatgag
s panTro2.chrUn 7681110 59 + 58616431 atctccaagagggcataaaacac-tgagtaaacagctctt--atatgtgtttcctggatgag
q panTro2.chrUn 99999999999999999999999-9999999999999999--99999999999999999999
i panTro2.chrUn C 0 C 0
s tarSyr1.scaffold_75923 2859 50 - 8928 atctccaagagggctgaaaatgc-caaatga-----------tcacacgtttcctggacaag
q tarSyr1.scaffold_75923 79295966999999999999998-9999799-----------99999999997657759999
i tarSyr1.scaffold_75923 N 0 C 0
s micMur1.scaffold_22105 5493 59 - 10683 acctccgagagggctcaaaacgc-cgagtgatcagctctt--atgcgcgtttcctggacgag
q micMur1.scaffold_22105 99999999999999999999999-9999999999999999--99999999999999999999
i micMur1.scaffold_22105 C 0 C 0
s tupBel1.scaffold_3803.1-85889 33686 61 + 85889 ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactggatggg
q tupBel1.scaffold_3803.1-85889 79648579699867994997775679665662767577569-69987455976776322888
i tupBel1.scaffold_3803.1-85889 I 1 C 0
s vicPac1.scaffold_12713 6831 55 - 10681 actgccatgggggctcagcgtac-tgaatggttaattact------gtggtccccgaatgag
q vicPac1.scaffold_12713 99999999999999999999999-9999999999999999------9999999999999999
MAF_BLOCK

# bmbm runs a rehearsal pass before the measured pass.
Benchmark.bmbm do |bench|
  bench.report("split/each") do
    iterations.times do
      total = 0
      maf_text.split("\n").each { |row| total += row.size }
    end
  end

  bench.report("until/shift") do
    iterations.times do
      total = 0
      pending = maf_text.split("\n")
      until pending.empty?
        row = pending.shift
        total += row.size
      end
    end
  end
end
@@ -0,0 +1,40 @@
#!/usr/bin/env ruby

# Benchmark: time reading a whole file sequentially using different
# File#read request sizes.
#
# Usage: read_bench [path]

require 'benchmark'

# File to read. A path given as the first command-line argument takes
# precedence; otherwise the historical default location is used.
FILE = ARGV.first || '/Users/csw/maf/chr22.maf'

# [report label, bytes requested per File#read call], in report order.
CHUNK_SIZES = [
  ['8k',   8192],
  ['128k', 128 * 1024],
  ['1M',   1024 * 1024],
  ['8M',   8 * 1024 * 1024]
]

Benchmark.bm do |x|
  CHUNK_SIZES.each do |label, size|
    x.report(label) do
      File.open(FILE) do |f|
        # File#read(size) returns nil at end of file, which ends the loop.
        loop do
          break unless f.read(size)
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
#!/usr/bin/env ruby

# Benchmark: Array#sort! with a comparison block versus Array#sort_by!
# with a key block, on 1000 objects carrying a random integer key.

require 'benchmark'

# Minimal value holder; +part+ is the sort key.
class Thing
  attr_reader :part

  # @param part [Integer] the sort key
  def initialize(part)
    @part = part
  end
end

prng = Random.new
v_max = 1 << 31
# Draw the keys from the dedicated generator created above rather than
# from Kernel#rand, which left +prng+ unused.
ary = Array.new(1000) { Thing.new(prng.rand(v_max)) }

Benchmark.bmbm do |x|
  x.report("sort!") do
    1000.times do
      # Sort a fresh copy each time so every run sees the same unsorted input.
      ary2 = ary.dup
      ary2.sort! { |a, b| a.part <=> b.part }
    end
  end
  x.report("sort_by!") do
    1000.times do
      ary2 = ary.dup
      ary2.sort_by! { |i| i.part }
    end
  end
end
@@ -0,0 +1,33 @@
#!/usr/bin/env ruby

# Benchmark: three ways of breaking a MAF "s" (sequence) line into fields:
# whitespace String#split, String#split with a regex, and a single regex
# match with one capture group per field.

require 'benchmark'

n = 2000000
line = 's tupBel1.scaffold_3803.1-85889 33686 61 + 85889 ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactggatggg'

Benchmark.bmbm do |x|
  x.report("basic String#split") do
    n.times do
      parts = line.split
    end
  end
  x.report("regex split") do
    n.times do
      parts = line.split(/\s+/)
    end
  end
  # This report used to be registered twice with an identical body, which
  # doubled its run time and printed two rows with the same label.
  x.report("regex fields") do
    n.times do
      if m = /^s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\S+)/.match(line)
        parts = m.captures
      end
    end
  end
end
@@ -0,0 +1,82 @@
#!/usr/bin/env ruby

# Count the alignment blocks in a MAF file, optionally under a CPU
# profiler (PerfTools or ruby-prof) and/or the GC profiler.
#
# Usage: maf_count [options] <maf>

require 'bio-maf'
require 'bigbio'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.parser = Bio::MAF::Parser
options.reader = Bio::MAF::ChunkReader

# Printer names accepted in "--ruby-prof NAME:PATH", mapped to the
# RubyProf constant to instantiate.
PRINTERS = {
  'flat' => :FlatPrinter,
  'stack' => :CallStackPrinter
}

op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_count [options] <maf>"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
    options.prof = prof
  end
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
    # Load ruby-prof before either branch: the fallback branch references
    # RubyProf::FlatPrinter too, and previously raised NameError because
    # the library was only required in the NAME:PATH branch.
    require 'ruby-prof'
    if pspec =~ /(\w+):(.+)/
      options.ruby_prof_printer = RubyProf.const_get(PRINTERS.fetch($1))
      options.ruby_prof_path = $2
    else
      options.ruby_prof_printer = RubyProf::FlatPrinter
      options.ruby_prof_path = pspec
    end
  end
  opts.on("--profile-gc", "Profile GC") do
    options.profile_gc = true
  end
  opts.on("--parser PARSER", "parser") do |name|
    options.parser = Bio::MAF.const_get(name)
  end
  opts.on("-t", "--threaded") do
    options.reader = Bio::MAF::ThreadedChunkReader
  end
end
op.parse!(ARGV)

src_path = ARGV.shift

# Without an input path there is nothing to parse; show usage instead of
# passing nil to the parser.
unless src_path
  $stderr.puts op
  exit 1
end

# PerfTools takes precedence when both profilers are requested.
if options.prof
  require 'perftools'
  PerfTools::CpuProfiler.start(options.prof)
elsif options.ruby_prof_path
  require 'ruby-prof'
  RubyProf.start
end

if options.profile_gc
  GC::Profiler.enable
end

parser = options.parser.new(src_path,
                            :chunk_reader => options.reader,
                            :parse_extended => false)

n = 0
parser.each_block do |block|
  n += 1
end
puts "Parsed #{n} MAF alignment blocks."

if options.profile_gc
  $stderr.puts GC::Profiler.result
  GC::Profiler.disable
end

if options.prof
  PerfTools::CpuProfiler.stop
elsif options.ruby_prof_path
  res = RubyProf.stop
  printer = options.ruby_prof_printer.new(res)
  File.open(options.ruby_prof_path, 'w') do |f|
    printer.print(f)
  end
end
@@ -0,0 +1,27 @@
#!/usr/bin/env ruby

# Print one line per alignment block of a MAF file: the block's offset
# and size, tab-separated, as reported by the parser.
#
# Usage: maf_dump_blocks [options] <maf>

require 'bio-maf'
require 'bigbio'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.parser = Bio::MAF::Parser

cli = OptionParser.new do |opts|
  opts.banner = "Usage: maf_dump_blocks [options] <maf>"
  opts.separator ""
  opts.separator "Options:"
  # Allows swapping in another parser class from the Bio::MAF namespace.
  opts.on("--parser PARSER", "parser") { |name| options.parser = Bio::MAF.const_get(name) }
end
cli.parse!(ARGV)

maf_path = ARGV.shift

options.parser.new(maf_path).each_block do |blk|
  $stdout.printf("%12d\t%7d\n", blk.offset, blk.size)
end
27
+
@@ -0,0 +1,44 @@
#!/usr/bin/env ruby

# Read genomic intervals from standard input, look them up through a MAF
# index, and print how many blocks were parsed. The lookup can be repeated
# for several passes.
#
# Usage: maf_extract_ranges_count [options] <maf> <index>

require 'optparse'
require 'ostruct'

require 'bio-maf'
require 'bio-genomic-interval'

options = OpenStruct.new
options.p = { :threads => 1 }   # options hash handed to the parser
options.passes = 1

cli = OptionParser.new do |opts|
  opts.banner = "Usage: maf_extract_ranges_count [options] <maf> <index>"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-t", "--threads N", "Parser threads") { |n| options.p[:threads] = n.to_i }
  opts.on("-p", "--passes N", "Number of passes") { |n| options.passes = n.to_i }
end
cli.parse!(ARGV)

maf_p = ARGV.shift
index_p = ARGV.shift

parser = Bio::MAF::Parser.new(maf_p, options.p)
index = Bio::MAF::KyotoIndex.open(index_p)

# Build an interval from one whitespace-separated input line. The first
# three fields are the source name, start and end; anything after them is
# ignored.
def parse_interval(line)
  fields = line.split(nil, 4)
  Bio::GenomicInterval.zero_based(fields[0], fields[1].to_i, fields[2].to_i)
end

intervals = []
$stdin.each_line do |input_line|
  intervals << parse_interval(input_line)
end

options.passes.times do
  found = index.find(intervals, parser)
  puts "TOTAL: #{found.count} blocks parsed."
end
@@ -0,0 +1,88 @@
#!/usr/bin/env ruby

# Build a Kyoto Cabinet index for a MAF file, or dump an existing index.
#
# Usage: maf_index [options] <maf> <index>
#        maf_index --dump <index>

require 'benchmark'
require 'bio-maf'
require 'optparse'
require 'ostruct'

# Printer names accepted in "--ruby-prof NAME:PATH", mapped to the
# RubyProf constant to instantiate.
PRINTERS = {
  'flat' => :FlatPrinter,
  'stack' => :CallStackPrinter,
  'graph' => :GraphHtmlPrinter
}

# Global so that build_index can see the chosen reader class.
$options = OpenStruct.new
$options.mode = :build
$options.reader = Bio::MAF::ChunkReader

# Parse the MAF file at +maf+ and build an index for it at +index+,
# closing the index when done.
def build_index(maf, index)
  parser = Bio::MAF::Parser.new(maf,
                                :chunk_reader => $options.reader,
                                :parse_extended => false)
  idx = Bio::MAF::KyotoIndex.build(parser, index)
  idx.close
end

op = OptionParser.new do |opts|
  opts.banner = "Usage: maf_index [options] <maf> <index>"
  #opts.separator ""
  #opts.separator "Options:"
  opts.on("--time", "print elapsed time") do
    $options.bench = true
  end
  opts.on("-d", "--dump") do
    $options.mode = :dump
  end
  opts.on("-t", "--threaded") do
    $options.reader = Bio::MAF::ThreadedChunkReader
  end
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
    require 'ruby-prof'
    if pspec =~ /(\w+):(.+)/
      $options.ruby_prof_printer = RubyProf.const_get(PRINTERS.fetch($1))
      $options.ruby_prof_path = $2
    else
      # The module is RubyProf; the former spelling "Ruby_Prof" raised
      # NameError whenever a bare PATH was given.
      $options.ruby_prof_printer = RubyProf::FlatPrinter
      $options.ruby_prof_path = pspec
    end
  end
end

op.parse!(ARGV)

# In dump mode the only positional argument is the index path.
maf_p = ARGV.shift if $options.mode == :build
index_p = ARGV.shift

unless (maf_p || $options.mode == :dump) && index_p
  $stderr.puts op
  exit 1
end

if $options.ruby_prof_path
  RubyProf.start
end

case $options.mode
when :build
  if ! $options.bench
    build_index(maf_p, index_p)
  else
    bm_res = Benchmark.measure do
      build_index(maf_p, index_p)
    end
    puts bm_res
  end
when :dump
  idx = Bio::MAF::KyotoIndex.open(index_p)
  idx.dump
else
  raise "Unsupported mode: #{$options.mode}"
end

if $options.ruby_prof_path
  res = RubyProf.stop
  printer = $options.ruby_prof_printer.new(res)
  File.open($options.ruby_prof_path, 'w') do |f|
    printer.print(f)
  end
end
@@ -0,0 +1,94 @@
#!/usr/bin/env ruby

# Benchmark repeated parsing of a single MAF block: after two initial
# parse_block calls the parser's stream position is recorded, then each
# run parses one block and resets the position.
#
# Usage: maf_parse_bench [options] <maf>

require 'benchmark'
require 'bio-maf'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.parser = Bio::MAF::Parser
options.runs = 100_000
options.warmup = false

# Printer names accepted in "--ruby-prof NAME:PATH", mapped to the
# RubyProf constant to instantiate.
PRINTERS = {
  'flat' => :FlatPrinter,
  'stack' => :CallStackPrinter
}

OptionParser.new do |opts|
  opts.banner = "Usage: maf_parse_bench [options] <maf>"
  opts.separator ""
  opts.separator "Options:"
  opts.on("-p", "--profile PROF", "Profile with PerfTools") do |prof|
    options.prof = prof
  end
  opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
    # Needed by both branches below, so load it before either runs.
    require 'ruby-prof'
    if pspec =~ /(\w+):(.+)/
      options.ruby_prof_printer = RubyProf.const_get(PRINTERS.fetch($1))
      options.ruby_prof_path = $2
    else
      # Store the printer class itself. The bare Symbol :FlatPrinter used
      # here before made the later `.new(res)` call raise NoMethodError.
      options.ruby_prof_printer = RubyProf::FlatPrinter
      options.ruby_prof_path = pspec
    end
  end
  opts.on("--profile-gc", "Profile GC") do
    options.profile_gc = true
  end
  opts.on("--parser PARSER", "parser") do |name|
    options.parser = Bio::MAF.const_get(name)
  end
  opts.on("-w", "--warmup", "perform warmup run") do
    options.warmup = true
  end
end.parse!(ARGV)

src_path = ARGV.shift

# PerfTools takes precedence when both profilers are requested.
if options.prof
  require 'perftools'
  PerfTools::CpuProfiler.start(options.prof)
elsif options.ruby_prof_path
  require 'ruby-prof'
  RubyProf.start
end

if options.profile_gc
  GC::Profiler.enable
end

parser = options.parser.new(src_path)
parser.parse_block
parser.parse_block
# Position to rewind to after every benchmarked parse.
# NOTE(review): parser.s is presumably the parser's underlying
# scanner/stream - confirm against Bio::MAF::Parser.
pos = parser.s.pos

if options.warmup
  options.runs.times do
    parser.parse_block
    parser.s.pos = pos
  end
end

bm_res = Benchmark.measure do
  options.runs.times do
    parser.parse_block
    parser.s.pos = pos
  end
end

if options.profile_gc
  $stderr.puts GC::Profiler.result
  GC::Profiler.disable
end

if options.prof
  PerfTools::CpuProfiler.stop
elsif options.ruby_prof_path
  res = RubyProf.stop
  printer = options.ruby_prof_printer.new(res)
  File.open(options.ruby_prof_path, 'w') do |f|
    printer.print(f)
  end
end

# Per-run timings: the measured totals divided by the number of runs.
puts bm_res / options.runs