bio-maf 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/.simplecov +1 -0
  3. data/.travis.yml +16 -0
  4. data/.yardopts +3 -0
  5. data/DEVELOPMENT.md +40 -0
  6. data/Gemfile +23 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +209 -0
  9. data/Rakefile +76 -0
  10. data/VERSION +1 -0
  11. data/benchmarks/dispatch_bench +53 -0
  12. data/benchmarks/iter_bench +44 -0
  13. data/benchmarks/read_bench +40 -0
  14. data/benchmarks/sort_bench +33 -0
  15. data/benchmarks/split_bench +33 -0
  16. data/bin/maf_count +82 -0
  17. data/bin/maf_dump_blocks +27 -0
  18. data/bin/maf_extract_ranges_count +44 -0
  19. data/bin/maf_index +88 -0
  20. data/bin/maf_parse_bench +94 -0
  21. data/bin/maf_to_fasta +68 -0
  22. data/bin/maf_write +84 -0
  23. data/bin/random_ranges +35 -0
  24. data/features/maf-indexing.feature +31 -0
  25. data/features/maf-output.feature +29 -0
  26. data/features/maf-parsing.feature +44 -0
  27. data/features/maf-querying.feature +75 -0
  28. data/features/maf-to-fasta.feature +50 -0
  29. data/features/step_definitions/convert_steps.rb +45 -0
  30. data/features/step_definitions/index_steps.rb +20 -0
  31. data/features/step_definitions/output_steps.rb +27 -0
  32. data/features/step_definitions/parse_steps.rb +63 -0
  33. data/features/step_definitions/query_steps.rb +31 -0
  34. data/features/step_definitions/ucsc_bin_steps.rb +14 -0
  35. data/features/support/env.rb +16 -0
  36. data/features/ucsc-bins.feature +24 -0
  37. data/lib/bio-maf.rb +12 -0
  38. data/lib/bio-maf/maf.rb +3 -0
  39. data/lib/bio/maf.rb +4 -0
  40. data/lib/bio/maf/index.rb +620 -0
  41. data/lib/bio/maf/parser.rb +888 -0
  42. data/lib/bio/maf/struct.rb +63 -0
  43. data/lib/bio/maf/writer.rb +63 -0
  44. data/lib/bio/ucsc.rb +2 -0
  45. data/lib/bio/ucsc/genomic-interval-bin.rb +13 -0
  46. data/lib/bio/ucsc/ucsc_bin.rb +117 -0
  47. data/man/.gitignore +1 -0
  48. data/man/maf_index.1 +105 -0
  49. data/man/maf_index.1.markdown +97 -0
  50. data/man/maf_index.1.ronn +83 -0
  51. data/man/maf_to_fasta.1 +53 -0
  52. data/man/maf_to_fasta.1.ronn +51 -0
  53. data/spec/bio/maf/index_spec.rb +363 -0
  54. data/spec/bio/maf/parser_spec.rb +354 -0
  55. data/spec/bio/maf/struct_spec.rb +75 -0
  56. data/spec/spec_helper.rb +14 -0
  57. data/test/data/big-block.maf +15999 -0
  58. data/test/data/chr22_ieq.maf +11 -0
  59. data/test/data/chrY-1block.maf +6 -0
  60. data/test/data/empty +0 -0
  61. data/test/data/empty.db +0 -0
  62. data/test/data/mm8_chr7_tiny.kct +0 -0
  63. data/test/data/mm8_chr7_tiny.maf +76 -0
  64. data/test/data/mm8_mod_a.maf +7 -0
  65. data/test/data/mm8_single.maf +13 -0
  66. data/test/data/mm8_subset_a.maf +23 -0
  67. data/test/data/t1-bad1.maf +15 -0
  68. data/test/data/t1.fasta +12 -0
  69. data/test/data/t1.maf +15 -0
  70. data/test/data/t1a.maf +17 -0
  71. data/test/helper.rb +18 -0
  72. data/test/test_bio-maf.rb +7 -0
  73. data/travis-ci/install_kc +13 -0
  74. data/travis-ci/install_kc_java +13 -0
  75. data/travis-ci/report_errors +4 -0
  76. metadata +182 -0
@@ -0,0 +1,63 @@
1
+ ## NOTE: this is probably not the best place for this, ultimately.
2
+ ## If it works, think about moving it.
3
+
4
module Bio

  module MAF

    # Layout description for a fixed-width binary record.
    #
    # Constructed from an ordered list of [name, type] pairs. Each
    # member is placed at a byte offset equal to the summed sizes of
    # the members that precede it.
    # NOTE(review): the format strings look like String#unpack /
    # Array#pack directives -- confirm against the callers.
    class Struct
      # spec:: ordered collection of [name, type] pairs; every type
      #        must be a key of TYPE_PROPS (KeyError otherwise).
      def initialize(spec)
        @members = []
        @by_name = {}
        # Thread the running byte offset through the fold.
        spec.inject(0) do |next_offset, member_spec|
          member = Member.new(next_offset, *member_spec)
          @members.push(member)
          @by_name[member.name] = member
          next_offset + member.size
        end
      end

      # Format string covering every member, in declaration order.
      def fmt
        @members.map(&:fmt).join('')
      end

      # Format string that picks out only the named members.
      #
      # Members are emitted in offset order regardless of the order of
      # +names+. Whenever the next wanted member does not start where
      # the previous one ended, an absolute-position "@<offset>"
      # directive is emitted first. Raises KeyError for unknown names.
      def extractor_fmt(*names)
        wanted = names.map { |name| @by_name.fetch(name) }
        wanted = wanted.sort_by(&:offset)
        cursor = 0
        wanted.each_with_object('') do |member, directives|
          unless member.offset == cursor
            directives << "@#{member.offset}"
            cursor = member.offset
          end
          directives << member.fmt
          cursor += member.size
        end
      end
    end

    # Byte width and format directive for each supported member type.
    TYPE_PROPS = {
      :uint8  => { :size => 1, :fmt => 'C' },
      :uint16 => { :size => 2, :fmt => 'S>' },
      :uint32 => { :size => 4, :fmt => 'L>' },
      :uint64 => { :size => 8, :fmt => 'Q>' }
    }

    # A single named field of a Struct: where it starts, how wide it
    # is, and which format directive reads it.
    class Member
      attr_reader :offset, :name, :type, :size, :fmt

      # offset:: byte offset of the field within the record
      # name::   identifier used to look the field up
      # type::   key of TYPE_PROPS (KeyError if unknown)
      def initialize(offset, name, type)
        @offset, @name, @type = offset, name, type
        type_info = TYPE_PROPS.fetch(type)
        @size = type_info.fetch(:size)
        @fmt = type_info.fetch(:fmt)
      end
    end

  end

end
@@ -0,0 +1,63 @@
1
module Bio

  module MAF

    # Writes MAF header and alignment-block text to a stream.
    #
    # The destination is either an object that responds to +write+
    # (used as-is) or a path, which is opened for writing. Call #close
    # when finished so a file opened here is flushed and released.
    class Writer
      # f::    the underlying output stream
      # path:: the destination path, when one is known (else nil)
      attr_reader :f, :path

      # fspec:: an IO-like object responding to +write+, or a file path.
      def initialize(fspec)
        if fspec.respond_to?(:write)
          @f = fspec
          @path = fspec.path if fspec.respond_to?(:path)
        else
          @path = fspec
          @f = File.open(fspec, 'w')
        end
      end

      # Closes the underlying stream.
      def close
        f.close
      end

      # Renders key/value pairs as space-separated "key=value" text.
      def flatten_vars(vars)
        vars.to_a.collect { |k, v| "#{k}=#{v}" }.join(" ")
      end

      # Writes the "##maf" line built from +header.vars+, followed by
      # the alignment parameters line when the header has one.
      def write_header(header)
        f.puts "##maf #{flatten_vars(header.vars)}"
        # "##{...}" is a literal '#' followed by the interpolated value.
        f.puts "##{header.alignment_params}" if header.alignment_params
      end

      # Writes every block in +blocks+, then flushes the stream.
      def write_blocks(blocks)
        blocks.each { |block| write_block(block) }
        f.flush
      end

      # Writes one block: an "a" line with the block's variables, the
      # lines for each of its sequences, and a separator line.
      def write_block(block)
        lines = ["a #{flatten_vars(block.vars)}"]
        block.sequences.each { |seq| write_seq(seq, lines) }
        lines << " "
        f.puts lines.join("\n")
      end

      # Appends the text lines for sequence +s+ to +lines+.
      #
      # Always adds an "s" line, or an "e" line carrying +s.status+ in
      # place of the text when +s.empty?+ is true. Adds a "q" line when
      # +s.quality+ is set and an "i" line when +s.i_data+ is set
      # (+i_data+ must supply four values).
      def write_seq(s, lines)
        lines << sprintf("%s %-20s %12d %2d %s %9d %s",
                         s.empty? ? "e" : "s",
                         s.source,
                         s.start,
                         s.size,
                         s.strand,
                         s.src_size,
                         s.empty? ? s.status : s.text)
        if s.quality
          lines << sprintf("q %-20s %s",
                           s.source, s.quality)
        end
        if s.i_data
          lines << sprintf("i %-20s %s %s %s %s",
                           s.source, *s.i_data)
        end
      end
    end

  end

end
@@ -0,0 +1,2 @@
1
+ require 'bio/ucsc/ucsc_bin'
2
+ require 'bio/ucsc/genomic-interval-bin'
@@ -0,0 +1,13 @@
1
+ require "bio-genomic-interval"
2
+
3
module Bio
  # Adds UCSC bin lookups to GenomicInterval, computed from the
  # interval's zero_start / zero_end coordinates.
  class GenomicInterval
    # Result of Bio::Ucsc::UcscBin.bin for this interval.
    def bin
      Bio::Ucsc::UcscBin.bin(zero_start, zero_end)
    end

    # Result of Bio::Ucsc::UcscBin.bin_all for this interval.
    def bin_all
      Bio::Ucsc::UcscBin.bin_all(zero_start, zero_end)
    end
  end
end
@@ -0,0 +1,117 @@
1
+ # = UCSCBin
2
+ # Author:: MISHIMA, Hiroyuki
3
+ # Copyright:: MISHIMA, Hiroyuki, 2010-2011
4
+ # License:: The Ruby licence (Ruby's / GPLv2 dual)
5
+ #
6
+ # Original program in C by Jim Kent, 2002
7
+ # See also http://genomewiki.ucsc.edu/index.php/Bin_indexing_system;
8
+ # a paper Kent, et al. Genome Research 2002.12:996-1006;
9
+ # and src/lib/binRange.c in the kent source tree.
10
+ #
11
+ # Bio::Ucsc::UcscBin -
12
+ # 1) convert between 0-based half-open interval and
13
+ # 1-based full-close intervals.
14
+ # 2) Calculate Bin number from genomic physical position
15
+ # according to UCSC's Bin Indexing System.
16
+ #
17
module Bio
  module Ucsc
    # Bin-number calculations following UCSC's Bin Indexing System.
    # Only the standard scheme (interval end <= 512M) is implemented.
    class UcscBin
      #    Version = "0.1.0" # 20100714
      #    Version = "0.2.0" # 20101028
      #    Version = "0.2.1" # 20110408
      Version = "0.2.2" # 20110418 the licence is changed
                        # embedded in BioRubyUcscApi
                        # handle the case, start==end in [start, end)

      BINRANGE_MAXEND_512M = (512*1024*1024)
      BIN_OFFSETS_EXTENDED = [4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0]
      # BIN_OFFSETS_EXTENDED = [4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1] (to omit BIN=0)
      BIN_OFFSETS = [512+64+8+1, 64+8+1, 8+1, 1, 0]
      # BIN_OFFSETS = [512+64+8+1, 64+8+1, 8+1, 1] (to omit BIN=0)
      BIN_OFFSET_OLD_TO_EXTENDED = 4681
      # How much to shift to get to finest bin.
      BIN_FIRST_SHIFT = 17
      # How much to shift to get to next larger bin.
      BIN_NEXT_SHIFT = 3

      # Return the Integer bin number of the smallest/finest bin that
      # contains the whole interval/range.
      #
      # Extended bin index for positions >= 512M is not supported yet
      # (NotImplementedError is raised when bin_end exceeds 512M).
      # Raises RangeError when no standard bin contains the range.
      def self.bin_from_range(bin_start, bin_end)
        if bin_end <= BINRANGE_MAXEND_512M
          bin_from_range_standard(bin_start, bin_end)
        else
          bin_from_range_extended(bin_start, bin_end)
        end
      end

      class << self; alias bin bin_from_range; end

      # Return an Array of the bins at every level that the
      # interval/range touches, finest level first. For ranges inside
      # the standard limit it therefore always ends with 0, the bin
      # covering the whole chromosome.
      #
      # Extended bin index for positions >= 512M is not supported yet
      # (NotImplementedError is raised when p_end exceeds 512M).
      def self.bin_all(p_start, p_end)
        if p_end <= BINRANGE_MAXEND_512M
          bin_all_standard(p_start, p_end)
        else
          bin_all_extended(p_start, p_end)
        end
      end

      # Implementation helpers follow. A bare `private` does not apply
      # to `def self.` methods, so these have always been publicly
      # callable; their visibility is intentionally left unchanged.

      # Given start,end in chromosome coordinates assign it
      # a bin.  There's a bin for each 128k segment, for each
      # 1M segment, for each 8M segment, for each 64M segment,
      # and for each chromosome (which is assumed to be less than
      # 512M.)  A range goes into the smallest bin it will fit in.
      def self.bin_from_range_standard(bin_start, bin_end)
        # Work on copies so the error message can report the
        # coordinates the caller actually passed.
        first = bin_start >> BIN_FIRST_SHIFT
        last = (bin_end - 1) >> BIN_FIRST_SHIFT

        BIN_OFFSETS.each do |offset|
          return offset + first if first == last
          first >>= BIN_NEXT_SHIFT
          last >>= BIN_NEXT_SHIFT
        end
        raise RangeError,
              "start #{bin_start}, end #{bin_end} out of range in findBin (max is 512M)"
      end

      def self.bin_from_range_extended(bin_start, bin_end)
        raise NotImplementedError, "Extended bins are not supported yet"
      end

      # Collects, level by level (finest first), every bin number
      # between the bin holding the start and the bin holding end - 1.
      def self.bin_all_standard(bin_start, bin_end)
        first = bin_start >> BIN_FIRST_SHIFT
        last = (bin_end - 1) >> BIN_FIRST_SHIFT

        BIN_OFFSETS.each_with_object([]) do |offset, results|
          results.concat(((offset + first)..(offset + last)).to_a)
          first >>= BIN_NEXT_SHIFT
          last >>= BIN_NEXT_SHIFT
        end
      end

      def self.bin_all_extended(bin_start, bin_end)
        raise NotImplementedError, "Extended bins are not supported yet"
      end

    end # class UcscBin
  end # module Ucsc
end # module Bio
@@ -0,0 +1 @@
1
+ *.html
@@ -0,0 +1,105 @@
1
+ .\" generated with Ronn/v0.7.3
2
+ .\" http://github.com/rtomayko/ronn/tree/0.7.3
3
+ .
4
+ .TH "MAF_INDEX" "1" "June 2012" "Clayton Wheeler" "BioRuby Manual"
5
+ .
6
+ .SH "NAME"
7
+ \fBmaf_index\fR \- build and examine MAF indexes
8
+ .
9
+ .SH "SYNOPSIS"
10
+ \fBmaf_index\fR [\-t] \fImaf\fR \fIindex\fR
11
+ .
12
+ .br
13
+ \fBmaf_index\fR \fB\-d\fR|\fB\-\-dump\fR \fIindex\fR
14
+ .
15
+ .SH "DESCRIPTION"
16
+ \fBmaf_index\fR is part of the bioruby\-maf library and creates Kyoto Cabinet indexes for Multiple Alignment Format (MAF) files\. These indexes enable other MAF tools to selectively extract alignment blocks of interest\.
17
+ .
18
+ .P
19
+ In its default mode, \fBmaf_index\fR parses the \fImaf\fR file given as an argument and creates an index in \fIindex\fR\.
20
+ .
21
+ .P
22
+ The index data is stored in binary form, so with the \fB\-\-dump\fR argument, \fBmaf_index\fR can dump out the index data in human\-readable form for debugging\.
23
+ .
24
+ .SH "FILES"
25
+ The \fImaf\fR input file must be a valid MAF file of any length\.
26
+ .
27
+ .P
28
+ The index created is a Kyoto Cabinet TreeDB (B+ tree) database; \fIindex\fR must have a \fB\.kct\fR extension\.
29
+ .
30
+ .SH "OPTIONS"
31
+ TODO
32
+ .
33
+ .TP
34
+ \fB\-d\fR, \fB\-\-dump\fR
35
+ Instead of creating an index, dump out the given \fIindex\fR in human\-readable form\. Index records will appear like:
36
+ .
37
+ .IP "" 4
38
+ .
39
+ .nf
40
+
41
+ 0 [bin 1195] 80082334:80082368
42
+ offset 16, length 1087
43
+ text size: 54
44
+ sequences in block: 10
45
+ species vector: 00000000000003ff
46
+ .
47
+ .fi
48
+ .
49
+ .IP "" 0
50
+
51
+ .
52
+ .TP
53
+ \fB\-t\fR, \fB\-\-threaded\fR
54
+ Use a separate reader thread to do I/O in parallel with parsing\. Only useful on JRuby\.
55
+ .
56
+ .TP
57
+ \fB\-\-time\fR
58
+ Print elapsed time for index creation\. Mainly useful for measuring performance with different Ruby implementations, I/O subsystems, etc\.
59
+ .
60
+ .SH "EXAMPLES"
61
+ Build an index on a MAF file:
62
+ .
63
+ .IP "" 4
64
+ .
65
+ .nf
66
+
67
+ $ maf_index chr22\.maf chr22\.kct
68
+ .
69
+ .fi
70
+ .
71
+ .IP "" 0
72
+ .
73
+ .P
74
+ Dump out an index:
75
+ .
76
+ .IP "" 4
77
+ .
78
+ .nf
79
+
80
+ $ maf_index \-d chr22\.kct > /tmp/chr22\.dump
81
+ .
82
+ .fi
83
+ .
84
+ .IP "" 0
85
+ .
86
+ .SH "ENVIRONMENT"
87
+ \fBmaf_index\fR is a Ruby program and relies on ordinary Ruby environment variables\.
88
+ .
89
+ .SH "BUGS"
90
+ \fBmaf_index\fR does not currently allow Kyoto Cabinet database parameters to be set\.
91
+ .
92
+ .SH "COPYRIGHT"
93
+ \fBmaf_index\fR is copyright (C) 2012 Clayton Wheeler\.
94
+ .
95
+ .SH "SEE ALSO"
96
+ ruby(1), kctreemgr(1)
97
+ .
98
+ .IP "\(bu" 4
99
+ \fIhttps://github\.com/csw/bioruby\-maf/\fR
100
+ .
101
+ .IP "\(bu" 4
102
+ \fIhttp://fallabs\.com/kyotocabinet/\fR
103
+ .
104
+ .IP "" 0
105
+
@@ -0,0 +1,97 @@
1
+ maf_index(1) -- build and examine MAF indexes
2
+ =============================================
3
+
4
+ ## SYNOPSIS
5
+
6
+ `maf_index` [-t] <var>maf</var> <var>index</var><br>
7
+ `maf_index` `-d`|`--dump` <var>index</var>
8
+
9
+ ## DESCRIPTION
10
+
11
+ **maf_index** is part of the bioruby-maf library and creates
12
+ Kyoto Cabinet indexes for Multiple Alignment Format (MAF)
13
+ files. These indexes enable other MAF tools to selectively extract
14
+ alignment blocks of interest.
15
+
16
+ In its default mode, `maf_index` parses the <var>maf</var> file given as an
17
+ argument and creates an index in <var>index</var>.
18
+
19
+ The index data is stored in binary form, so with the `--dump`
20
+ argument, `maf_index` can dump out the index data in human-readable
21
+ form for debugging.
22
+
23
+ ## FILES
24
+
25
+ The <var>maf</var> input file must be a valid MAF file of any length.
26
+
27
+ The index created is a Kyoto Cabinet TreeDB (B+ tree) database;
28
+ <var>index</var> must have a `.kct` extension.
29
+
30
+ ## OPTIONS
31
+
32
+ TODO
33
+
34
+ * `-d`, `--dump`:
35
+ Instead of creating an index, dump out the given <var>index</var> in
36
+ human-readable form. Index records will appear like:
37
+
38
+ 0 [bin 1195] 80082334:80082368
39
+ offset 16, length 1087
40
+ text size: 54
41
+ sequences in block: 10
42
+ species vector: 00000000000003ff
43
+
44
+ * `-t`, `--threaded`:
45
+ Use a separate reader thread to do I/O in parallel with
46
+ parsing. Only useful on JRuby.
47
+
48
+ * `--time`:
49
+ Print elapsed time for index creation. Mainly useful for measuring
50
+ performance with different Ruby implementations, I/O subsystems,
51
+ etc.
52
+
53
+ ## EXAMPLES
54
+
55
+ Build an index on a MAF file:
56
+
57
+ $ maf_index chr22.maf chr22.kct
58
+
59
+ Dump out an index:
60
+
61
+ $ maf_index -d chr22.kct > /tmp/chr22.dump
62
+
63
+ ## ENVIRONMENT
64
+
65
+ `maf_index` is a Ruby program and relies on ordinary Ruby environment
66
+ variables.
67
+
68
+ ## BUGS
69
+
70
+ `maf_index` does not currently allow Kyoto Cabinet database parameters
71
+ to be set.
72
+
73
+ ## COPYRIGHT
74
+
75
+ `maf_index` is copyright (C) 2012 Clayton Wheeler.
76
+
77
+ ## SEE ALSO
78
+
79
+ ruby(1), kctreemgr(1)
80
+
81
+ * <https://github.com/csw/bioruby-maf/>
82
+ * <http://fallabs.com/kyotocabinet/>
83
+
84
+
85
+
86
+ [SYNOPSIS]: #SYNOPSIS "SYNOPSIS"
87
+ [DESCRIPTION]: #DESCRIPTION "DESCRIPTION"
88
+ [FILES]: #FILES "FILES"
89
+ [OPTIONS]: #OPTIONS "OPTIONS"
90
+ [EXAMPLES]: #EXAMPLES "EXAMPLES"
91
+ [ENVIRONMENT]: #ENVIRONMENT "ENVIRONMENT"
92
+ [BUGS]: #BUGS "BUGS"
93
+ [COPYRIGHT]: #COPYRIGHT "COPYRIGHT"
94
+ [SEE ALSO]: #SEE-ALSO "SEE ALSO"
95
+
96
+
97
+ [maf_index(1)]: maf_index.1.html