bio-maf 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio/maf/index.rb +620 -0
  38. data/lib/bio/maf/parser.rb +888 -0
  39. data/lib/bio/maf/struct.rb +63 -0
  40. data/lib/bio/maf/writer.rb +63 -0
  41. data/lib/bio/maf.rb +4 -0
  42. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  43. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio-maf/maf.rb +3 -0
  46. data/lib/bio-maf.rb +12 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +181 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.simplecov ADDED
@@ -0,0 +1 @@
1
+ SimpleCov.start
data/.travis.yml ADDED
@@ -0,0 +1,16 @@
1
+ language: ruby
2
+ before_install:
3
+ #- sudo update-java-alternatives -s java-1.7.0-openjdk-i386
4
+ - sudo update-java-alternatives -s java-1.6.0-openjdk
5
+ - sudo ./travis-ci/install_kc
6
+ bundler_args: --without development
7
+ script: "bundle exec rake test"
8
+ after_script:
9
+ - ./travis-ci/report_errors
10
+ rvm:
11
+ - 1.9.3
12
+ - jruby-19mode # JRuby in 1.9 mode
13
+ - rbx-19mode
14
+ matrix:
15
+ allow_failures:
16
+ - rvm: rbx-19mode
data/.yardopts ADDED
@@ -0,0 +1,3 @@
1
+ --markup markdown
2
+ --markup-provider=redcarpet
3
+ - DEVELOPMENT.md
data/DEVELOPMENT.md ADDED
@@ -0,0 +1,40 @@
1
+ # Development guide
2
+
3
+ Here are notes on less obvious aspects of the development process for
4
+ this library.
5
+
6
+ ## kyotocabinet-java
7
+
8
+ Running `bio-maf` on JRuby requires the [kyotocabinet-java][] gem, a
9
+ wrapper around the Kyoto Cabinet Java interface providing a Ruby API
10
+ compatible with the standard Kyoto Cabinet Ruby API.
11
+
12
+ [kyotocabinet-java]: https://github.com/csw/kyotocabinet-java
13
+
14
+ ## Man pages
15
+
16
+ Man pages are developed with [ronn][] and live in `man/`; see
17
+ [maf_index.1.ronn][] for an example. The generated man pages,
18
+ e.g. `maf_index.1`, are added to Git for [gem-man][] support.
19
+
20
+ [ronn]: https://github.com/rtomayko/ronn
21
+ [gem-man]: https://github.com/defunkt/gem-man
22
+ [maf_index.1.ronn]: https://github.com/csw/bioruby-maf/blob/master/man/maf_index.1.ronn
23
+
24
+ HTML and roff versions are built with:
25
+
26
+ $ rake man
27
+
28
+ The HTML versions are published through Octopress to Github Pages,
29
+ e.g. <http://csw.github.com/bioruby-maf/man/maf_index.1.html>. This is
30
+ a separate step, and necessarily dependent on the local filesystem
31
+ layout. Specifically, there must be an `octopress` directory at the
32
+ same level as `bioruby-maf`, containing a checked-out copy of
33
+ <https://github.com/csw/bioruby-maf-blog>. Then, to publish the man
34
+ pages, run:
35
+
36
+ $ rake man:publish
37
+
38
+ After this, in that Octopress instance, run:
39
+
40
+ $ rake deploy
data/Gemfile ADDED
@@ -0,0 +1,23 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+
4
+ gemspec
5
+
6
+ # Add dependencies to develop your gem here.
7
+ # Include everything needed to run rake, tests, features, etc.
8
+ group :development do
9
+ gem "rdoc", "~> 3.12"
10
+ gem "simplecov", "~> 0.6.4", :platforms => :mri
11
+ gem "yard", "~> 0.8.1"
12
+ gem "kramdown", "~> 0.13.6"
13
+ gem "redcarpet", "~> 2.1.1", :platforms => :mri
14
+ gem "ronn", "~> 0.7.3", :platforms => :mri
15
+ gem "sinatra", "~> 1.3.2" # for ronn --server
16
+ end
17
+
18
+ group :test do
19
+ gem "bundler", ">= 1.0.0"
20
+ gem "rake", ">= 0.9"
21
+ gem "cucumber", ">= 0"
22
+ gem "rspec", "~> 2.10.0"
23
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 csw
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,209 @@
1
+ # bio-maf
2
+
3
+ [![Build Status](https://secure.travis-ci.org/csw/bioruby-maf.png)](http://travis-ci.org/csw/bioruby-maf)
4
+
5
+ This is a plugin for [BioRuby](http://bioruby.open-bio.org/) adding
6
+ support for the
7
+ [Multiple Alignment Format](http://genome.ucsc.edu/FAQ/FAQformat#format5)
8
+ (MAF), used in bioinformatics to store whole-genome sets of multiple
9
+ sequence alignments.
10
+
11
+ Ultimately it will provide indexed and sequential access to MAF data,
12
+ as well as performing various manipulations on it and writing modified
13
+ MAF files. So far, it supports sequential parsing and indexed access.
14
+
15
+ For more information, see the
16
+ [project wiki](https://github.com/csw/bioruby-maf/wiki).
17
+
18
+ Developer documentation generated with YARD is available at
19
+ [rubydoc.info](http://rubydoc.info/github/csw/bioruby-maf/).
20
+
21
+ This is being developed by Clayton Wheeler as
22
+ [part of](http://www.bioruby.org/wiki/Google_Summer_of_Code) the
23
+ Google Summer of Code 2012, under the auspices of the Open
24
+ Bioinformatics Foundation. The development
25
+ [blog](http://csw.github.com/bioruby-maf/) may be of interest.
26
+
27
+ ## Dependencies
28
+
29
+ [Kyoto Cabinet][] is a database library, required for building MAF
30
+ indexes. Install the core library in the appropriate way for your
31
+ platform, as documented [here][].
32
+
33
+ [Kyoto Cabinet]: http://fallabs.com/kyotocabinet/
34
+ [here]: https://github.com/csw/bioruby-maf/wiki/Kyoto-Cabinet
35
+
36
+ If you're using MRI, the [kyotocabinet-ruby][] gem will be used to
37
+ interact with Kyoto Cabinet. For best performance, however, you should
38
+ really consider using JRuby. On JRuby, the [kyotocabinet-java][] gem
39
+ will be used instead; this builds a Java library using JNI to call
40
+ into Kyoto Cabinet. Please file a [bug report][] if you encounter
41
+ problems building or using this gem, which is still fairly new.
42
+
43
+ [kyotocabinet-ruby]: https://rubygems.org/gems/kyotocabinet-ruby
44
+ [kyotocabinet-java]: https://github.com/csw/kyotocabinet-java
45
+ [bug report]: https://github.com/csw/kyotocabinet-java/issues
46
+
47
+
48
+ ## Installation
49
+
50
+ $ gem install bio-maf
51
+
52
+ ## Usage
53
+
54
+ ### Create an index on a MAF file
55
+
56
+ Much of the functionality of this library relies on an index. You can
57
+ create one with [maf_index(1)][], like so:
58
+
59
+ [maf_index(1)]: http://csw.github.com/bioruby-maf/man/maf_index.1.html
60
+
61
+
62
+ $ maf_index test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
63
+
64
+ Or programmatically:
65
+
66
+ require 'bio-maf'
67
+ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
68
+ idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct")
69
+
70
+ ### Extract blocks from an indexed MAF file, by genomic interval
71
+
72
+ Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
73
+
74
+
75
+ require 'bio-maf'
76
+ parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
77
+ idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
78
+
79
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
80
+ idx.find(q, parser).each do |block|
81
+ ref_seq = block.sequences[0]
82
+ puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
83
+ end
84
+
85
+ # => Matched block at 80082592, 121 bases
86
+ # => Matched block at 80082713, 54 bases
87
+
88
+ ### Filter species returned in alignment blocks
89
+
90
+ require 'bio-maf'
91
+ parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
92
+ idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
93
+
94
+ parser.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
95
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
96
+ blocks = idx.find(q, parser)
97
+ block = blocks.first
98
+ puts "Block has #{block.sequences.size} sequences."
99
+
100
+ # => Block has 3 sequences.
101
+
102
+ ### Extract blocks matching certain conditions
103
+
104
+ See also the [Cucumber feature][] and [step definitions][] for this.
105
+
106
+ [Cucumber feature]: https://github.com/csw/bioruby-maf/blob/master/features/maf-querying.feature
107
+ [step definitions]: https://github.com/csw/bioruby-maf/blob/master/features/step_definitions/query_steps.rb
108
+
109
+ #### Match only blocks with all specified species
110
+
111
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
112
+ filter = { :with_all_species => %w(panTro2 loxAfr1) }
113
+ n_blocks = idx.find(q, parser, filter).count
114
+ # => 1
115
+
116
+ #### Match only blocks with a certain number of sequences
117
+
118
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
119
+ filter = { :at_least_n_sequences => 6 }
120
+ n_blocks = idx.find(q, parser, filter).count
121
+ # => 1
122
+
123
+ #### Match only blocks within a text size range
124
+
125
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
126
+ filter = { :min_size => 72, :max_size => 160 }
127
+ n_blocks = idx.find(q, parser, filter).count
128
+ # => 3
129
+
130
+ ### Process each block in a MAF file
131
+
132
+ require 'bio-maf'
133
+ p = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
134
+ puts "MAF version: #{p.header.version}"
135
+ # => MAF version: 1
136
+
137
+ p.parse_blocks.each do |block|
138
+ block.sequences.each do |seq|
139
+ do_something(seq)
140
+ end
141
+ end
142
+
143
+ ### Parse empty ('e') lines
144
+
145
+ Refer to [`chr22_ieq.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/chr22_ieq.maf).
146
+
147
+ require 'bio-maf'
148
+ p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
149
+ :parse_empty => false)
150
+ block = p.parse_block
151
+ block.sequences.size
152
+ # => 3
153
+
154
+ p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
155
+ :parse_empty => true)
156
+ block = p.parse_block
157
+ block.sequences.size
158
+ # => 4
159
+ block.sequences.find { |s| s.empty? }
160
+ # => #<Bio::MAF::EmptySequence:0x007fe1f39882d0
161
+ # @source="turTru1.scaffold_109008", @start=25049,
162
+ # @size=1601, @strand=:+, @src_size=50103, @text=nil,
163
+ # @status="I">
164
+
165
+
166
+ ### Command line tools
167
+
168
+ Man pages for command line tools:
169
+
170
+ * [`maf_index(1)`](http://csw.github.com/bioruby-maf/man/maf_index.1.html)
171
+ * [`maf_to_fasta(1)`](http://csw.github.com/bioruby-maf/man/maf_to_fasta.1.html)
172
+
173
+ ### Other documentation
174
+
175
+ Also see the [API documentation][]. For more code examples see the
176
+ [RSpec][] and [Cucumber][] test files in the source tree.
177
+
178
+ [API documentation]: http://rubydoc.info/github/csw/bioruby-maf/
179
+ [RSpec]: https://github.com/csw/bioruby-maf/tree/master/spec/bio/maf
180
+ [Cucumber]: https://github.com/csw/bioruby-maf/tree/master/features
181
+
182
+ Also, the scripts in the
183
+ [bin](https://github.com/csw/bioruby-maf/tree/master/bin) directory
184
+ provide good worked examples of how to use the existing parsing API.
185
+
186
+ ## Project home page
187
+
188
+ For information on the source tree, documentation, examples, issues
189
+ and how to contribute, see
190
+
191
+ <http://github.com/csw/bioruby-maf>
192
+
193
+ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
194
+
195
+ ## Cite
196
+
197
+ If you use this software, please cite one of
198
+
199
+ * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
200
+ * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
201
+
202
+ ## Biogems.info
203
+
204
+ This Biogem will be published at [#bio-maf](http://biogems.info/index.html)
205
+
206
+ ## Copyright
207
+
208
+ Copyright (c) 2012 Clayton Wheeler. See LICENSE.txt for further details.
209
+
data/Rakefile ADDED
@@ -0,0 +1,76 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+ require 'rubygems/package_task'
14
+
15
+ $gemspec = Gem::Specification.load("bio-maf.gemspec")
16
+ Gem::PackageTask.new($gemspec) { |pkg| }
17
+
18
+ require 'rspec/core'
19
+ require 'rspec/core/rake_task'
20
+ RSpec::Core::RakeTask.new(:spec) do |spec|
21
+ spec.pattern = FileList['spec/**/*_spec.rb']
22
+ end
23
+
24
+ require 'cucumber/rake/task'
25
+ Cucumber::Rake::Task.new do |features|
26
+ end
27
+
28
+ task :test => [ :spec, :cucumber ]
29
+ task :default => :test
30
+
31
+ #### Man pages
32
+ # (borrowed from matthewtodd/shoe)
33
+ ronn_avail = begin
34
+ require 'ronn'
35
+ true
36
+ rescue LoadError
37
+ false
38
+ end
39
+
40
+ if ronn_avail
41
+ RONN_FILES = Rake::FileList["man/*.?.ronn"]
42
+
43
+ desc "Generate man pages"
44
+ task :man do
45
+ file_spec = RONN_FILES.join(' ')
46
+ sh "ronn --roff --html --style toc --date #{$gemspec.date.strftime('%Y-%m-%d')} --manual='BioRuby Manual' --organization='#{$gemspec.author}' #{file_spec}"
47
+ end
48
+
49
+ namespace :man do
50
+ desc "Publish man pages to Octopress source dir"
51
+ task :publish do
52
+ RONN_FILES.map { |path| path.sub(/\.ronn$/, '.html') }.each do |man|
53
+ cp man, "../octopress/source/man/#{File.basename(man)}"
54
+ end
55
+ end
56
+ end
57
+ task 'man:publish' => :man
58
+
59
+ namespace :ronn do
60
+ task :server do
61
+ sh "ronn --server #{RONN_FILES.join(' ')}"
62
+ end
63
+ end
64
+ end # if ronn_avail
65
+
66
+ #### RDoc (not currently used)
67
+
68
+ require 'rdoc/task'
69
+ Rake::RDocTask.new do |rdoc|
70
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
71
+
72
+ rdoc.rdoc_dir = 'rdoc'
73
+ rdoc.title = "bio-maf #{version}"
74
+ rdoc.rdoc_files.include('README*')
75
+ rdoc.rdoc_files.include('lib/**/*.rb')
76
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/benchmarks/dispatch_bench ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ n = 10000000
6
+ line = 's tupBel1.scaffold_3803.1-85889 33686 61 + 85889 ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactggatggg'
7
+
8
+ Benchmark.bmbm do |x|
9
+ x.report("case with strings") do
10
+ n.times do
11
+ i = 0
12
+ case line[0]
13
+ when 's'
14
+ i += 1
15
+ when 'i', 'e', 'q', '#', nil
16
+ next
17
+ else
18
+ raise "foo"
19
+ end
20
+ end
21
+ end
22
+ S = 's'.getbyte(0)
23
+ I = 'i'.getbyte(0)
24
+ E = 'e'.getbyte(0)
25
+ Q = 'q'.getbyte(0)
26
+ COMMENT = '#'.getbyte(0)
27
+ x.report("case with bytes") do
28
+ n.times do
29
+ i = 0
30
+ case line.getbyte(0)
31
+ when S
32
+ i += 1
33
+ when I, E, Q, COMMENT, nil
34
+ next
35
+ else
36
+ raise "foo"
37
+ end
38
+ end
39
+ end
40
+ x.report("if/else with bytes") do
41
+ n.times do
42
+ i = 0
43
+ b = line.getbyte(0)
44
+ if b == S
45
+ i += 1
46
+ elsif [I, E, Q, COMMENT, nil].include?(b)
47
+ next
48
+ else
49
+ raise "foo"
50
+ end
51
+ end
52
+ end
53
+ end
data/benchmarks/iter_bench ADDED
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ n = 1000000
6
+ data = <<EOF
7
+ a score=28680.000000
8
+ s hg19.chr22 16050711 61 + 51304566 atctccaagagggcataaaacac-tgagtaaacagctcttttatatgtgtttcctggatgag
9
+ s panTro2.chrUn 7681110 59 + 58616431 atctccaagagggcataaaacac-tgagtaaacagctctt--atatgtgtttcctggatgag
10
+ q panTro2.chrUn 99999999999999999999999-9999999999999999--99999999999999999999
11
+ i panTro2.chrUn C 0 C 0
12
+ s tarSyr1.scaffold_75923 2859 50 - 8928 atctccaagagggctgaaaatgc-caaatga-----------tcacacgtttcctggacaag
13
+ q tarSyr1.scaffold_75923 79295966999999999999998-9999799-----------99999999997657759999
14
+ i tarSyr1.scaffold_75923 N 0 C 0
15
+ s micMur1.scaffold_22105 5493 59 - 10683 acctccgagagggctcaaaacgc-cgagtgatcagctctt--atgcgcgtttcctggacgag
16
+ q micMur1.scaffold_22105 99999999999999999999999-9999999999999999--99999999999999999999
17
+ i micMur1.scaffold_22105 C 0 C 0
18
+ s tupBel1.scaffold_3803.1-85889 33686 61 + 85889 ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactggatggg
19
+ q tupBel1.scaffold_3803.1-85889 79648579699867994997775679665662767577569-69987455976776322888
20
+ i tupBel1.scaffold_3803.1-85889 I 1 C 0
21
+ s vicPac1.scaffold_12713 6831 55 - 10681 actgccatgggggctcagcgtac-tgaatggttaattact------gtggtccccgaatgag
22
+ q vicPac1.scaffold_12713 99999999999999999999999-9999999999999999------9999999999999999
23
+ EOF
24
+
25
+ Benchmark.bmbm do |x|
26
+ x.report("split/each") do
27
+ n.times do
28
+ i = 0
29
+ data.split("\n").each do |line|
30
+ i += line.size
31
+ end
32
+ end
33
+ end
34
+ x.report("until/shift") do
35
+ n.times do
36
+ i = 0
37
+ lines = data.split("\n")
38
+ until lines.empty?
39
+ line = lines.shift
40
+ i += line.size
41
+ end
42
+ end
43
+ end
44
+ end
data/benchmarks/read_bench ADDED
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ FILE = '/Users/csw/maf/chr22.maf'
6
+
7
+ Benchmark.bm do |x|
8
+ x.report("8k") do
9
+ File.open(FILE) do |f|
10
+ while true
11
+ r = f.read(8192)
12
+ break unless r
13
+ end
14
+ end
15
+ end
16
+ x.report("128k") do
17
+ File.open(FILE) do |f|
18
+ while true
19
+ r = f.read(128 * 1024)
20
+ break unless r
21
+ end
22
+ end
23
+ end
24
+ x.report("1M") do
25
+ File.open(FILE) do |f|
26
+ while true
27
+ r = f.read(1024 * 1024)
28
+ break unless r
29
+ end
30
+ end
31
+ end
32
+ x.report("8M") do
33
+ File.open(FILE) do |f|
34
+ while true
35
+ r = f.read(8 * 1024 * 1024)
36
+ break unless r
37
+ end
38
+ end
39
+ end
40
+ end
data/benchmarks/sort_bench ADDED
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ class Thing
6
+ attr_reader :part
7
+
8
+ def initialize(part)
9
+ @part = part
10
+ end
11
+ end
12
+
13
+ prng = Random.new
14
+ v_max = 1 << 31
15
+ ary = []
16
+ 1000.times do
17
+ ary << Thing.new(prng.rand(v_max))
18
+ end
19
+
20
+ Benchmark.bmbm do |x|
21
+ x.report("sort!") do
22
+ 1000.times do
23
+ ary2 = ary.dup
24
+ ary2.sort! { |a, b| a.part <=> b.part }
25
+ end
26
+ end
27
+ x.report("sort_by!") do
28
+ 1000.times do
29
+ ary2 = ary.dup
30
+ ary2.sort_by! { |i| i.part }
31
+ end
32
+ end
33
+ end
data/benchmarks/split_bench ADDED
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'benchmark'
4
+
5
+ n = 2000000
6
+ line = 's tupBel1.scaffold_3803.1-85889 33686 61 + 85889 ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactggatggg'
7
+
8
+ Benchmark.bmbm do |x|
9
+ x.report("basic String#split") do
10
+ n.times do
11
+ parts = line.split
12
+ end
13
+ end
14
+ x.report("regex split") do
15
+ n.times do
16
+ parts = line.split(/\s+/)
17
+ end
18
+ end
19
+ x.report("regex fields") do
20
+ n.times do
21
+ if m = /^s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\S+)/.match(line)
22
+ parts = m.captures
23
+ end
24
+ end
25
+ end
26
+ x.report("regex fields") do
27
+ n.times do
28
+ if m = /^s\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\S+)/.match(line)
29
+ parts = m.captures
30
+ end
31
+ end
32
+ end
33
+ end