bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
@@ -0,0 +1 @@
1
+ SimpleCov.start
@@ -0,0 +1,16 @@
1
+ language: ruby
2
+ before_install:
3
+ #- sudo update-java-alternatives -s java-1.7.0-openjdk-i386
4
+ - sudo update-java-alternatives -s java-1.6.0-openjdk
5
+ - sudo ./travis-ci/install_kc
6
+ bundler_args: --without development
7
+ script: "bundle exec rake test"
8
+ after_script:
9
+ - ./travis-ci/report_errors
10
+ rvm:
11
+ - 1.9.3
12
+ - jruby-19mode # JRuby in 1.9 mode
13
+ - rbx-19mode
14
+ matrix:
15
+ allow_failures:
16
+ - rvm: rbx-19mode
@@ -0,0 +1,3 @@
1
+ --markup markdown
2
+ --markup-provider=redcarpet
3
+ - DEVELOPMENT.md
@@ -0,0 +1,40 @@
1
+ # Development guide
2
+
3
+ Here are notes on less obvious aspects of the development process for
4
+ this library.
5
+
6
+ ## kyotocabinet-java
7
+
8
+ Running `bio-maf` on JRuby requires the [kyotocabinet-java][] gem, a
9
+ wrapper around the Kyoto Cabinet Java interface providing a Ruby API
10
+ compatible with the standard Kyoto Cabinet Ruby API.
11
+
12
+ [kyotocabinet-java]: https://github.com/csw/kyotocabinet-java
13
+
14
+ ## Man pages
15
+
16
+ Man pages are developed with [ronn][] and live in `man/`; see
17
+ [maf_index.1.ronn][] for an example. The generated man pages,
18
+ e.g. `maf_index.1`, are added to Git for [gem-man][] support.
19
+
20
+ [ronn]: https://github.com/rtomayko/ronn
21
+ [gem-man]: https://github.com/defunkt/gem-man
22
+ [maf_index.1.ronn]: https://github.com/csw/bioruby-maf/blob/master/man/maf_index.1.ronn
23
+
24
+ HTML and roff versions are built with:
25
+
26
+ $ rake man
27
+
28
+ The HTML versions are published through Octopress to GitHub Pages,
29
+ e.g. <http://csw.github.com/bioruby-maf/man/maf_index.1.html>. This is
30
+ a separate step, and necessarily dependent on the local filesystem
31
+ layout. Specifically, there must be an `octopress` directory at the
32
+ same level as `bioruby-maf`, containing a checked-out copy of
33
+ <https://github.com/csw/bioruby-maf-blog>. Then, to publish the man
34
+ pages, run:
35
+
36
+ $ rake man:publish
37
+
38
+ After this, in that Octopress instance, run:
39
+
40
+ $ rake deploy
data/Gemfile ADDED
@@ -0,0 +1,23 @@
# Gemfile for bio-maf. Runtime dependencies are taken from the gemspec;
# the groups below add the tooling used for development and testing.

# Fetch gems over HTTPS rather than plain HTTP.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
gemspec

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rdoc", "~> 3.12"
  gem "simplecov", "~> 0.6.4", :platforms => :mri
  gem "yard", "~> 0.8.1"
  gem "kramdown", "~> 0.13.6"
  gem "redcarpet", "~> 2.1.1", :platforms => :mri
  gem "ronn", "~> 0.7.3", :platforms => :mri
  gem "sinatra", "~> 1.3.2" # for ronn --server
end

# Gems used by the `rake test` task (RSpec specs and Cucumber features).
group :test do
  gem "bundler", ">= 1.0.0"
  gem "rake", ">= 0.9"
  gem "cucumber", ">= 0"
  gem "rspec", "~> 2.10.0"
end
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 csw
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,209 @@
1
+ # bio-maf
2
+
3
+ [![Build Status](https://secure.travis-ci.org/csw/bioruby-maf.png)](http://travis-ci.org/csw/bioruby-maf)
4
+
5
+ This is a plugin for [BioRuby](http://bioruby.open-bio.org/) adding
6
+ support for the
7
+ [Multiple Alignment Format](http://genome.ucsc.edu/FAQ/FAQformat#format5)
8
+ (MAF), used in bioinformatics to store whole-genome sets of multiple
9
+ sequence alignments.
10
+
11
+ Ultimately it will provide indexed and sequential access to MAF data,
12
+ as well as performing various manipulations on it and writing modified
13
+ MAF files. So far, it supports sequential parsing, indexing, and indexed queries by genomic interval.
14
+
15
+ For more information, see the
16
+ [project wiki](https://github.com/csw/bioruby-maf/wiki).
17
+
18
+ Developer documentation generated with YARD is available at
19
+ [rubydoc.info](http://rubydoc.info/github/csw/bioruby-maf/).
20
+
21
+ This is being developed by Clayton Wheeler as
22
+ [part of](http://www.bioruby.org/wiki/Google_Summer_of_Code) the
23
+ Google Summer of Code 2012, under the auspices of the Open
24
+ Bioinformatics Foundation. The development
25
+ [blog](http://csw.github.com/bioruby-maf/) may be of interest.
26
+
27
+ ## Dependencies
28
+
29
+ [Kyoto Cabinet][] is a database library, required for building MAF
30
+ indexes. Install the core library in the appropriate way for your
31
+ platform, as documented [here][].
32
+
33
+ [Kyoto Cabinet]: http://fallabs.com/kyotocabinet/
34
+ [here]: https://github.com/csw/bioruby-maf/wiki/Kyoto-Cabinet
35
+
36
+ If you're using MRI, the [kyotocabinet-ruby][] gem will be used to
37
+ interact with Kyoto Cabinet. For best performance, however, you should
38
+ really consider using JRuby. On JRuby, the [kyotocabinet-java][] gem
39
+ will be used instead; this builds a Java library using JNI to call
40
+ into Kyoto Cabinet. Please file a [bug report][] if you encounter
41
+ problems building or using this gem, which is still fairly new.
42
+
43
+ [kyotocabinet-ruby]: https://rubygems.org/gems/kyotocabinet-ruby
44
+ [kyotocabinet-java]: https://github.com/csw/kyotocabinet-java
45
+ [bug report]: https://github.com/csw/kyotocabinet-java/issues
46
+
47
+
48
+ ## Installation
49
+
50
+ $ gem install bio-maf
51
+
52
+ ## Usage
53
+
54
+ ### Create an index on a MAF file
55
+
56
+ Much of the functionality of this library relies on an index. You can
57
+ create one with [maf_index(1)][], like so:
58
+
59
+ [maf_index(1)]: http://csw.github.com/bioruby-maf/man/maf_index.1.html
60
+
61
+
62
+ $ maf_index test/data/mm8_chr7_tiny.maf /tmp/mm8_chr7_tiny.kct
63
+
64
+ Or programmatically:
65
+
66
+ require 'bio-maf'
67
+ parser = Bio::MAF::Parser.new("test/data/mm8_chr7_tiny.maf")
68
+ idx = Bio::MAF::KyotoIndex.build(parser, "/tmp/mm8_chr7_tiny.kct")
69
+
70
+ ### Extract blocks from an indexed MAF file, by genomic interval
71
+
72
+ Refer to [`mm8_chr7_tiny.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/mm8_chr7_tiny.maf).
73
+
74
+
75
+ require 'bio-maf'
76
+ parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
77
+ idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
78
+
79
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
80
+ idx.find(q, parser).each do |block|
81
+ ref_seq = block.sequences[0]
82
+ puts "Matched block at #{ref_seq.start}, #{ref_seq.size} bases"
83
+ end
84
+
85
+ # => Matched block at 80082592, 121 bases
86
+ # => Matched block at 80082713, 54 bases
87
+
88
+ ### Filter species returned in alignment blocks
89
+
90
+ require 'bio-maf'
91
+ parser = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
92
+ idx = Bio::MAF::KyotoIndex.open('test/data/mm8_chr7_tiny.kct')
93
+
94
+ parser.sequence_filter = { :only_species => %w(hg18 mm8 rheMac2) }
95
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082592, 80082766)]
96
+ blocks = idx.find(q, parser)
97
+ block = blocks.first
98
+ puts "Block has #{block.sequences.size} sequences."
99
+
100
+ # => Block has 3 sequences.
101
+
102
+ ### Extract blocks matching certain conditions
103
+
104
+ See also the [Cucumber feature][] and [step definitions][] for this.
105
+
106
+ [Cucumber feature]: https://github.com/csw/bioruby-maf/blob/master/features/maf-querying.feature
107
+ [step definitions]: https://github.com/csw/bioruby-maf/blob/master/features/step_definitions/query_steps.rb
108
+
109
+ #### Match only blocks with all specified species
110
+
111
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082471, 80082730)]
112
+ filter = { :with_all_species => %w(panTro2 loxAfr1) }
113
+ n_blocks = idx.find(q, parser, filter).count
114
+ # => 1
115
+
116
+ #### Match only blocks with a certain number of sequences
117
+
118
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 80082767, 80083008)]
119
+ filter = { :at_least_n_sequences => 6 }
120
+ n_blocks = idx.find(q, parser, filter).count
121
+ # => 1
122
+
123
+ #### Match only blocks within a text size range
124
+
125
+ q = [Bio::GenomicInterval.zero_based('mm8.chr7', 0, 80100000)]
126
+ filter = { :min_size => 72, :max_size => 160 }
127
+ n_blocks = idx.find(q, parser, filter).count
128
+ # => 3
129
+
130
+ ### Process each block in a MAF file
131
+
132
+ require 'bio-maf'
133
+ p = Bio::MAF::Parser.new('test/data/mm8_chr7_tiny.maf')
134
+ puts "MAF version: #{p.header.version}"
135
+ # => MAF version: 1
136
+
137
+ p.parse_blocks.each do |block|
138
+ block.sequences.each do |seq|
139
+ do_something(seq)
140
+ end
141
+ end
142
+
143
+ ### Parse empty ('e') lines
144
+
145
+ Refer to [`chr22_ieq.maf`](https://github.com/csw/bioruby-maf/blob/master/test/data/chr22_ieq.maf).
146
+
147
+ require 'bio-maf'
148
+ p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
149
+ :parse_empty => false)
150
+ block = p.parse_block
151
+ block.sequences.size
152
+ # => 3
153
+
154
+ p = Bio::MAF::Parser.new('test/data/chr22_ieq.maf',
155
+ :parse_empty => true)
156
+ block = p.parse_block
157
+ block.sequences.size
158
+ # => 4
159
+ block.sequences.find { |s| s.empty? }
160
+ # => #<Bio::MAF::EmptySequence:0x007fe1f39882d0
161
+ # @source="turTru1.scaffold_109008", @start=25049,
162
+ # @size=1601, @strand=:+, @src_size=50103, @text=nil,
163
+ # @status="I">
164
+
165
+
166
+ ### Command line tools
167
+
168
+ Man pages for command line tools:
169
+
170
+ * [`maf_index(1)`](http://csw.github.com/bioruby-maf/man/maf_index.1.html)
171
+ * [`maf_to_fasta(1)`](http://csw.github.com/bioruby-maf/man/maf_to_fasta.1.html)
172
+
173
+ ### Other documentation
174
+
175
+ Also see the [API documentation][]. For more code examples see the
176
+ [RSpec][] and [Cucumber][] test files in the source tree.
177
+
178
+ [API documentation]: http://rubydoc.info/github/csw/bioruby-maf/
179
+ [RSpec]: https://github.com/csw/bioruby-maf/tree/master/spec/bio/maf
180
+ [Cucumber]: https://github.com/csw/bioruby-maf/tree/master/features
181
+
182
+ Also, the scripts in the
183
+ [bin](https://github.com/csw/bioruby-maf/tree/master/bin) directory
184
+ provide good worked examples of how to use the existing parsing API.
185
+
186
+ ## Project home page
187
+
188
+ For information on the source tree, documentation, examples, issues
189
+ and how to contribute, see
190
+
191
+ <http://github.com/csw/bioruby-maf>
192
+
193
+ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
194
+
195
+ ## Cite
196
+
197
+ If you use this software, please cite one of
198
+
199
+ * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
200
+ * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
201
+
202
+ ## Biogems.info
203
+
204
+ This Biogem will be published at [#bio-maf](http://biogems.info/index.html)
205
+
206
+ ## Copyright
207
+
208
+ Copyright (c) 2012 Clayton Wheeler. See LICENSE.txt for further details.
209
+
@@ -0,0 +1,76 @@
# encoding: utf-8
#
# Rake tasks for bio-maf: gem packaging, RSpec and Cucumber test runs,
# ronn-based man page generation, and RDoc.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Print Bundler's message plus a hint, then exit with Bundler's status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'
require 'rubygems/package_task'

# Kept in a global because the man page task below also reads it.
$gemspec = Gem::Specification.load("bio-maf.gemspec")
# NOTE(review): the empty block looks redundant, but Gem::PackageTask appears
# to define its tasks only when a block is given -- confirm before removing.
Gem::PackageTask.new($gemspec) { |pkg| }

#### Tests

require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

require 'cucumber/rake/task'
Cucumber::Rake::Task.new do |features|
end

# `rake test` (also the default task) runs the specs, then the features.
task :test => [ :spec, :cucumber ]
task :default => :test

#### Man pages
# (borrowed from matthewtodd/shoe)

# The man page tasks are only defined when ronn can be loaded.
ronn_avail = begin
               require 'ronn'
               true
             rescue LoadError
               false
             end

if ronn_avail
  RONN_FILES = Rake::FileList["man/*.?.ronn"]

  desc "Generate man pages"
  task :man do
    file_spec = RONN_FILES.join(' ')
    sh "ronn --roff --html --style toc --date #{$gemspec.date.strftime('%Y-%m-%d')} --manual='BioRuby Manual' --organization='#{$gemspec.author}' #{file_spec}"
  end

  namespace :man do
    desc "Publish man pages to Octopress source dir"
    task :publish do
      # Copy the generated HTML file for each ronn source into the sibling
      # ../octopress checkout.
      RONN_FILES.map { |path| path.sub(/\.ronn$/, '.html') }.each do |man|
        cp man, "../octopress/source/man/#{File.basename(man)}"
      end
    end
  end
  # Publishing requires freshly generated man pages.
  task 'man:publish' => :man

  namespace :ronn do
    task :server do
      sh "ronn --server #{RONN_FILES.join(' ')}"
    end
  end
end # if ronn_avail

#### RDoc (not currently used)

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline of the VERSION file so that it does not end
  # up inside the generated documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-maf #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,53 @@
#!/usr/bin/env ruby

# Micro-benchmark comparing three ways of dispatching on the first
# character of a MAF line:
#
#   1. `case` on a one-character String (`line[0]`)
#   2. `case` on the first byte (`line.getbyte(0)`)
#   3. `if`/`elsif` on the first byte
#
# Each variant runs `n` times against the same sample 's' line, and the
# timings are reported by Benchmark.bmbm.

require 'benchmark'

n = 10_000_000
line = 's tupBel1.scaffold_3803.1-85889 33686 61 + 85889 ttcaggaagggggcccaaaacgcttgagtggtcagctctta-ttttgcgtttactggatggg'

# Byte values of the line-type characters, computed once up front so the
# byte-based variants compare plain integers.
S = 's'.getbyte(0)
I = 'i'.getbyte(0)
E = 'e'.getbyte(0)
Q = 'q'.getbyte(0)
COMMENT = '#'.getbyte(0)

Benchmark.bmbm do |x|
  x.report("case with strings") do
    n.times do
      i = 0
      case line[0]
      when 's'
        i += 1
      when 'i', 'e', 'q', '#', nil
        next
      else
        raise "foo"
      end
    end
  end

  x.report("case with bytes") do
    n.times do
      i = 0
      case line.getbyte(0)
      when S
        i += 1
      when I, E, Q, COMMENT, nil
        next
      else
        raise "foo"
      end
    end
  end

  x.report("if/else with bytes") do
    n.times do
      i = 0
      b = line.getbyte(0)
      if b == S
        i += 1
      # Array has no #contain?; #include? is the membership test. The old
      # call would have raised NoMethodError for any non-'s' line.
      elsif [I, E, Q, COMMENT, nil].include?(b)
        next
      else
        raise "foo"
      end
    end
  end
end