bio-maf 1.0.0-java → 1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/maf_bgzip +140 -12
- data/bin/maf_extract +50 -40
- data/bin/maf_index +11 -2
- data/bin/maf_tile +143 -46
- data/bio-maf.gemspec +3 -3
- data/features/bgzf.feature +45 -0
- data/features/maf-indexing.feature +6 -0
- data/features/maf-parsing.feature +17 -0
- data/features/maf-querying.feature +11 -0
- data/features/slice.feature +11 -0
- data/features/step_definitions/parse_steps.rb +1 -0
- data/features/tiling.feature +23 -5
- data/lib/bio-maf.rb +5 -1
- data/lib/bio/maf.rb +1 -0
- data/lib/bio/maf/index.rb +158 -68
- data/lib/bio/maf/jobs.rb +168 -0
- data/lib/bio/maf/maf.rb +24 -1
- data/lib/bio/maf/parser.rb +90 -35
- data/lib/bio/maf/struct.rb +4 -0
- data/lib/bio/maf/tiler.rb +30 -3
- data/lib/bio/ucsc/ucsc_bin.rb +14 -1
- data/man/maf_bgzip.1 +27 -0
- data/man/maf_bgzip.1.ronn +32 -0
- data/spec/bio/maf/index_spec.rb +3 -1
- data/spec/bio/maf/parser_spec.rb +6 -2
- data/spec/bio/ucsc/ucsc_bin_spec.rb +18 -0
- data/test/data/empty.maf +2 -0
- data/test/data/ext-bin.maf +22 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +380 -184
data/bin/maf_bgzip
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'optparse'
|
3
4
|
require 'ostruct'
|
4
5
|
|
5
6
|
require 'bio-maf'
|
@@ -8,6 +9,9 @@ require 'bio-bgzf'
|
|
8
9
|
$options = OpenStruct.new
|
9
10
|
$options.dir = '.'
|
10
11
|
$options.ref_only = true
|
12
|
+
$options.n_jobs = 1
|
13
|
+
$options.force = false
|
14
|
+
$options.level = 2
|
11
15
|
|
12
16
|
op = OptionParser.new do |opts|
|
13
17
|
opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
|
@@ -26,31 +30,155 @@ op = OptionParser.new do |opts|
|
|
26
30
|
"(has no effect without --index)") do
|
27
31
|
$options.ref_only = false
|
28
32
|
end
|
33
|
+
opts.on("-l", "--level LEVEL", Integer,
|
34
|
+
"gzip compression level for BGZF (1-9)") do |level|
|
35
|
+
unless 1 <= level && level <= 9
|
36
|
+
$stderr.puts "Invalid compression level: #{level}"
|
37
|
+
$stderr.puts opts
|
38
|
+
exit 2
|
39
|
+
end
|
40
|
+
$options.level = level
|
41
|
+
end
|
42
|
+
opts.on("-f", "--force",
|
43
|
+
"Replace output files if they already exist") do
|
44
|
+
$options.force = true
|
45
|
+
end
|
46
|
+
opts.on("-j", "--jobs N", Integer,
|
47
|
+
"Run N concurrent jobs (default 1)") do |n|
|
48
|
+
$options.n_jobs = n
|
49
|
+
end
|
50
|
+
Bio::MAF::handle_logging_options(opts)
|
29
51
|
end
|
30
52
|
|
31
53
|
op.parse!(ARGV)
|
54
|
+
Bio::Log::CLI.configure('bio-maf')
|
32
55
|
|
33
|
-
|
34
|
-
|
35
|
-
|
56
|
+
INTERVAL = 10
|
57
|
+
LOG = Bio::MAF::LOG
|
58
|
+
|
59
|
+
def make_processing_task(maf)
|
60
|
+
maf_base = File.basename(maf)
|
36
61
|
base = maf_base.gsub(/\.maf.*/, '')
|
37
62
|
bgz_path = "#{$options.dir}/#{base}.maf.bgz"
|
63
|
+
if File.exist?(bgz_path) && ! $options.force
|
64
|
+
LOG.error "#{bgz_path} already exists, refusing to overwrite " \
|
65
|
+
"without --force!"
|
66
|
+
exit 1
|
67
|
+
end
|
68
|
+
idx_path = nil
|
69
|
+
if $options.index
|
70
|
+
idx_path = "#{$options.dir}/#{base}.kct"
|
71
|
+
if File.exist?(idx_path) && ! $options.force
|
72
|
+
LOG.error "#{idx_path} already exists, refusing to overwrite " \
|
73
|
+
"without --force!"
|
74
|
+
exit 1
|
75
|
+
end
|
76
|
+
end
|
77
|
+
lambda { process_maf(maf, bgz_path, idx_path) }
|
78
|
+
end
|
79
|
+
|
80
|
+
def process_maf(maf_path, bgz_path, idx_path)
|
81
|
+
maf_base = File.basename(maf_path)
|
82
|
+
LOG.debug { "Processing #{maf_base}." }
|
38
83
|
p = Bio::MAF::Parser.new(maf_path,
|
39
|
-
:
|
40
|
-
|
41
|
-
|
42
|
-
|
84
|
+
:retain_text => true)
|
85
|
+
if idx_path
|
86
|
+
if File.exists?(idx_path)
|
87
|
+
File.unlink(idx_path)
|
88
|
+
end
|
89
|
+
idx = Bio::MAF::KyotoIndex.new(idx_path)
|
90
|
+
idx.prep(bgz_path, :bgzf, $options.ref_only)
|
91
|
+
exec = Bio::MAF::Executor.create
|
92
|
+
end
|
93
|
+
start_t = Time.now
|
94
|
+
last_t = start_t
|
95
|
+
last_pos = 0
|
96
|
+
n_blocks = 0
|
97
|
+
maf_size = File.size(maf_path)
|
98
|
+
File.open(bgz_path, 'wb') do |out_f|
|
99
|
+
Bio::BGZF::Writer.new(out_f, $options.level) do |bgz_w|
|
43
100
|
maf_w = Bio::MAF::Writer.new(bgz_w)
|
44
101
|
maf_w.write_header(p.header)
|
45
102
|
p.each_block do |block|
|
46
|
-
|
103
|
+
bgz_w.write(block.orig_text)
|
104
|
+
if idx
|
105
|
+
block.offset = bgz_w.last_write_pos
|
106
|
+
exec.submit do
|
107
|
+
idx.index_blocks([block])
|
108
|
+
end
|
109
|
+
end
|
110
|
+
n_blocks += 1
|
111
|
+
if n_blocks % 100 == 0
|
112
|
+
cur_t = Time.now
|
113
|
+
delta_t = cur_t - last_t
|
114
|
+
if delta_t > INTERVAL
|
115
|
+
cur_pos = p.phys_f.tell
|
116
|
+
LOG.debug {
|
117
|
+
pos_mb = cur_pos.to_f / 1048576
|
118
|
+
delta_bytes = cur_pos - last_pos
|
119
|
+
rate = delta_bytes.to_f / delta_t
|
120
|
+
mb_rate = rate / 1048576
|
121
|
+
pct = cur_pos.to_f / maf_size * 100
|
122
|
+
elapsed = cur_t - start_t
|
123
|
+
sprintf("%s: processed %.1f MB (%.1f%%) in %ds, %.2f MB/s.",
|
124
|
+
maf_base,
|
125
|
+
pos_mb,
|
126
|
+
pct,
|
127
|
+
elapsed,
|
128
|
+
mb_rate)
|
129
|
+
}
|
130
|
+
last_t = cur_t
|
131
|
+
last_pos = cur_pos
|
132
|
+
end
|
133
|
+
end
|
47
134
|
end
|
48
135
|
end
|
49
136
|
end
|
137
|
+
unc = p.f.tell if p.f != p.phys_f
|
50
138
|
p.close
|
51
|
-
if
|
52
|
-
|
53
|
-
|
54
|
-
|
139
|
+
if idx
|
140
|
+
exec.shutdown
|
141
|
+
idx.db.synchronize(true)
|
142
|
+
end
|
143
|
+
elapsed = Time.now - start_t
|
144
|
+
mb = maf_size.to_f / 1048576
|
145
|
+
mb_rate = mb / elapsed
|
146
|
+
LOG.info { sprintf("Processed %s (%.1f MB) in %ds, %.2f MB/s",
|
147
|
+
maf_base,
|
148
|
+
mb,
|
149
|
+
elapsed,
|
150
|
+
mb_rate) }
|
151
|
+
if unc
|
152
|
+
LOG.info {
|
153
|
+
unc_mb = unc / 1048576
|
154
|
+
unc_rate = unc_mb / elapsed
|
155
|
+
sprintf(" Uncompressed: %.1f MB, %.2f MB/s",
|
156
|
+
unc_mb, unc_rate)
|
157
|
+
}
|
55
158
|
end
|
159
|
+
LOG.info {
|
160
|
+
raw_size = unc || maf_size
|
161
|
+
avg_block_kb = raw_size.to_f / n_blocks / 1024
|
162
|
+
sprintf(" %d alignment blocks, average size %.2f KB",
|
163
|
+
n_blocks, avg_block_kb)
|
164
|
+
}
|
165
|
+
LOG.info {
|
166
|
+
orig_size = unc ? unc : maf_size
|
167
|
+
bgzf_size = File.size(bgz_path).to_f
|
168
|
+
ratio = bgzf_size / orig_size
|
169
|
+
sprintf(" Compressed with BGZF (level=%d) to %.1f MB (%.1fx)",
|
170
|
+
$options.level,
|
171
|
+
bgzf_size / 1048576,
|
172
|
+
ratio)
|
173
|
+
}
|
174
|
+
end
|
175
|
+
|
176
|
+
runner = Bio::MAF::JobRunner.create($options.n_jobs)
|
177
|
+
LOG.debug "Created #{runner.class} set for #{$options.n_jobs} concurrent jobs."
|
178
|
+
ARGV.each do |maf|
|
179
|
+
task = make_processing_task(maf)
|
180
|
+
runner.add(&task)
|
56
181
|
end
|
182
|
+
LOG.debug "Running jobs."
|
183
|
+
runner.run
|
184
|
+
LOG.debug "Finished processing."
|
data/bin/maf_extract
CHANGED
@@ -6,12 +6,13 @@ require 'ostruct'
|
|
6
6
|
|
7
7
|
include Bio::MAF
|
8
8
|
|
9
|
-
options = OpenStruct.new
|
10
|
-
options.mode = :intersect
|
11
|
-
options.format = :maf
|
12
|
-
options.
|
13
|
-
options.
|
14
|
-
options.
|
9
|
+
$options = OpenStruct.new
|
10
|
+
$options.mode = :intersect
|
11
|
+
$options.format = :maf
|
12
|
+
$options.one_based = false
|
13
|
+
$options.seq_filter = {}
|
14
|
+
$options.block_filter = {}
|
15
|
+
$options.parse_options = {}
|
15
16
|
|
16
17
|
def handle_list_spec(spec)
|
17
18
|
if spec =~ /^@(.+)/
|
@@ -23,7 +24,11 @@ end
|
|
23
24
|
|
24
25
|
def handle_interval_spec(int)
|
25
26
|
if int =~ /(.+):(\d+)-(\d+)/
|
26
|
-
|
27
|
+
if $options.one_based
|
28
|
+
Bio::GenomicInterval.new($1, $2.to_i, $3.to_i)
|
29
|
+
else
|
30
|
+
Bio::GenomicInterval.zero_based($1, $2.to_i, $3.to_i)
|
31
|
+
end
|
27
32
|
else
|
28
33
|
raise "Invalid interval specification: #{int}"
|
29
34
|
end
|
@@ -34,13 +39,13 @@ $op = OptionParser.new do |opts|
|
|
34
39
|
opts.separator ""
|
35
40
|
opts.separator "MAF source options (either --maf or --maf-dir must be given):"
|
36
41
|
opts.on("-m", "--maf MAF", "MAF file") do |maf|
|
37
|
-
options.maf = maf
|
42
|
+
$options.maf = maf
|
38
43
|
end
|
39
44
|
opts.on("-i", "--index INDEX", "MAF index") do |idx|
|
40
|
-
options.idx = idx
|
45
|
+
$options.idx = idx
|
41
46
|
end
|
42
47
|
opts.on("-d", "--maf-dir DIR", "MAF directory") do |dir|
|
43
|
-
options.maf_dir = dir
|
48
|
+
$options.maf_dir = dir
|
44
49
|
end
|
45
50
|
opts.separator ""
|
46
51
|
opts.separator "Extraction options:"
|
@@ -49,21 +54,26 @@ $op = OptionParser.new do |opts|
|
|
49
54
|
"blocks intersecting the given region,",
|
50
55
|
"or 'slice' to extract subsets covering ",
|
51
56
|
"given regions") do |mode|
|
52
|
-
options.mode = mode
|
57
|
+
$options.mode = mode
|
53
58
|
end
|
54
59
|
opts.on("--bed BED", "Use intervals from the given BED file") do |bed|
|
55
|
-
options.bed = bed
|
60
|
+
$options.bed = bed
|
56
61
|
end
|
57
62
|
opts.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |int|
|
58
|
-
options.interval = handle_interval_spec(int)
|
63
|
+
$options.interval = handle_interval_spec(int)
|
64
|
+
end
|
65
|
+
opts.on("--one-based",
|
66
|
+
"Treat all intervals as one-based",
|
67
|
+
"(even from BED files, contrary to the standard)") do
|
68
|
+
$options.one_based = true
|
59
69
|
end
|
60
70
|
opts.separator ""
|
61
71
|
opts.separator "Output options:"
|
62
72
|
opts.on("-f", "--format FMT", [:maf, :fasta], "Output format") do |fmt|
|
63
|
-
options.format = fmt
|
73
|
+
$options.format = fmt
|
64
74
|
end
|
65
75
|
opts.on("-o", "--output OUT", "Write output to file OUT") do |out|
|
66
|
-
options.out_path = out
|
76
|
+
$options.out_path = out
|
67
77
|
end
|
68
78
|
opts.separator ""
|
69
79
|
opts.separator "Filtering options:"
|
@@ -71,41 +81,41 @@ $op = OptionParser.new do |opts|
|
|
71
81
|
"Filter out all but the species in the",
|
72
82
|
"given comma-separated list",
|
73
83
|
"(or @FILE to read from a file)") do |spec|
|
74
|
-
options.seq_filter[:only_species] = handle_list_spec(spec)
|
84
|
+
$options.seq_filter[:only_species] = handle_list_spec(spec)
|
75
85
|
end
|
76
86
|
opts.on("--with-all-species SPECIES",
|
77
87
|
"Only match blocks with all the given",
|
78
88
|
"species, comma-separated",
|
79
89
|
"(or @FILE to read from a file)") do |spec|
|
80
|
-
options.block_filter[:with_all_species] = handle_list_spec(spec)
|
90
|
+
$options.block_filter[:with_all_species] = handle_list_spec(spec)
|
81
91
|
end
|
82
92
|
opts.on("--min-sequences N", Integer,
|
83
93
|
"Match only blocks with at least N sequences") do |n|
|
84
|
-
options.block_filter[:at_least_n_sequences] = n
|
94
|
+
$options.block_filter[:at_least_n_sequences] = n
|
85
95
|
end
|
86
96
|
opts.on("--min-text-size N", Integer,
|
87
97
|
"Match only blocks with minimum text size N") do |n|
|
88
|
-
options.block_filter[:min_size] = n
|
98
|
+
$options.block_filter[:min_size] = n
|
89
99
|
end
|
90
100
|
opts.on("--max-text-size N", Integer,
|
91
101
|
"Match only blocks with maximum text size N") do |n|
|
92
|
-
options.block_filter[:max_size] = n
|
102
|
+
$options.block_filter[:max_size] = n
|
93
103
|
end
|
94
104
|
opts.separator ""
|
95
105
|
opts.separator "Block processing options:"
|
96
106
|
opts.on("--join-blocks",
|
97
107
|
"Join blocks if appropriate after filtering",
|
98
108
|
"out sequences") do
|
99
|
-
options.parse_options[:join_blocks] = true
|
109
|
+
$options.parse_options[:join_blocks] = true
|
100
110
|
end
|
101
111
|
opts.on("--remove-gaps", "Remove gaps after filtering out sequences") do
|
102
|
-
options.parse_options[:remove_gaps] = true
|
112
|
+
$options.parse_options[:remove_gaps] = true
|
103
113
|
end
|
104
114
|
opts.on("--parse-extended", "Parse 'extended' MAF data (i, q lines)") do
|
105
|
-
options.parse_options[:parse_extended] = true
|
115
|
+
$options.parse_options[:parse_extended] = true
|
106
116
|
end
|
107
117
|
opts.on("--parse-empty", "Parse empty (e) lines of MAF data") do
|
108
|
-
options.parse_options[:parse_empty] = true
|
118
|
+
$options.parse_options[:parse_empty] = true
|
109
119
|
end
|
110
120
|
opts.separator ""
|
111
121
|
opts.separator "Logging options:"
|
@@ -120,24 +130,24 @@ def usage(msg)
|
|
120
130
|
exit 2
|
121
131
|
end
|
122
132
|
|
123
|
-
if options.maf
|
124
|
-
access = Access.file(options.maf, options.idx, options.parse_options)
|
125
|
-
elsif options.maf_dir
|
126
|
-
access = Access.maf_dir(options.maf_dir, options.parse_options)
|
133
|
+
if $options.maf
|
134
|
+
access = Access.file($options.maf, $options.idx, $options.parse_options)
|
135
|
+
elsif $options.maf_dir
|
136
|
+
access = Access.maf_dir($options.maf_dir, $options.parse_options)
|
127
137
|
else
|
128
138
|
usage "Must supply --maf or --maf-dir!"
|
129
139
|
end
|
130
140
|
|
131
141
|
begin
|
132
|
-
access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
|
133
|
-
access.block_filter = options.block_filter unless options.block_filter.empty?
|
134
|
-
if options.out_path
|
135
|
-
outf = File.open(options.out_path, 'w')
|
142
|
+
access.sequence_filter = $options.seq_filter unless $options.seq_filter.empty?
|
143
|
+
access.block_filter = $options.block_filter unless $options.block_filter.empty?
|
144
|
+
if $options.out_path
|
145
|
+
outf = File.open($options.out_path, 'w')
|
136
146
|
else
|
137
147
|
outf = $stdout
|
138
148
|
end
|
139
149
|
|
140
|
-
case options.format
|
150
|
+
case $options.format
|
141
151
|
when :maf
|
142
152
|
writer = Writer.new(outf)
|
143
153
|
when :fasta
|
@@ -146,20 +156,20 @@ begin
|
|
146
156
|
raise "unsupported output format #{format}!"
|
147
157
|
end
|
148
158
|
|
149
|
-
if options.bed
|
150
|
-
intervals = read_bed_intervals(options.bed)
|
151
|
-
elsif options.interval
|
152
|
-
intervals = [options.interval]
|
159
|
+
if $options.bed
|
160
|
+
intervals = read_bed_intervals($options.bed)
|
161
|
+
elsif $options.interval
|
162
|
+
intervals = [$options.interval]
|
153
163
|
else
|
154
164
|
usage "Must supply --interval or --bed!"
|
155
165
|
end
|
156
166
|
|
157
167
|
# TODO: provide access to original MAF header?
|
158
|
-
if options.format == :maf
|
168
|
+
if $options.format == :maf
|
159
169
|
writer.write_header(Header.default)
|
160
170
|
end
|
161
171
|
|
162
|
-
case options.mode
|
172
|
+
case $options.mode
|
163
173
|
when :intersect
|
164
174
|
access.find(intervals) do |block|
|
165
175
|
writer.write_block(block)
|
@@ -172,7 +182,7 @@ begin
|
|
172
182
|
end
|
173
183
|
end
|
174
184
|
else
|
175
|
-
raise "Unsupported mode #{options.mode}!"
|
185
|
+
raise "Unsupported mode #{$options.mode}!"
|
176
186
|
end
|
177
187
|
|
178
188
|
ensure
|
data/bin/maf_index
CHANGED
@@ -14,10 +14,10 @@ PRINTERS = {
|
|
14
14
|
$options = OpenStruct.new
|
15
15
|
$options.mode = :build
|
16
16
|
$options.ref_only = true
|
17
|
+
$options.parser_opts = { :parse_extended => false }
|
17
18
|
|
18
19
|
def build_index(maf, index)
|
19
|
-
parser = Bio::MAF::Parser.new(maf,
|
20
|
-
:parse_extended => false)
|
20
|
+
parser = Bio::MAF::Parser.new(maf, $options.parser_opts)
|
21
21
|
idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
|
22
22
|
idx.close
|
23
23
|
end
|
@@ -36,6 +36,15 @@ op = OptionParser.new do |opts|
|
|
36
36
|
opts.on("-d", "--dump", "Dump contents of given INDEX") do
|
37
37
|
$options.mode = :dump
|
38
38
|
end
|
39
|
+
opts.on("-O", "--parser-option OPT") do |opt|
|
40
|
+
if opt =~ /(-?)(.+)/
|
41
|
+
val = ! ($1 == "-")
|
42
|
+
option = $2.to_sym
|
43
|
+
$options.parser_opts[option] = val
|
44
|
+
else
|
45
|
+
raise "malformed parser option #{opt}!"
|
46
|
+
end
|
47
|
+
end
|
39
48
|
opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
|
40
49
|
require 'ruby-prof'
|
41
50
|
if pspec =~ /(\w+):(.+)/
|
data/bin/maf_tile
CHANGED
@@ -10,7 +10,16 @@ def parse_interval(line)
|
|
10
10
|
src, r_start_s, r_end_s, _ = line.split(nil, 4)
|
11
11
|
r_start = r_start_s.to_i
|
12
12
|
r_end = r_end_s.to_i
|
13
|
-
|
13
|
+
i_src = if $options.bed_species
|
14
|
+
"#{$options.bed_species}.#{src}"
|
15
|
+
else
|
16
|
+
src
|
17
|
+
end
|
18
|
+
if $options.one_based
|
19
|
+
Bio::GenomicInterval.new(i_src, r_start, r_end)
|
20
|
+
else
|
21
|
+
Bio::GenomicInterval.zero_based(i_src, r_start, r_end)
|
22
|
+
end
|
14
23
|
end
|
15
24
|
|
16
25
|
def target_for(base, interval, &blk)
|
@@ -18,52 +27,96 @@ def target_for(base, interval, &blk)
|
|
18
27
|
File.open(path, 'w', &blk)
|
19
28
|
end
|
20
29
|
|
21
|
-
def apply_options(
|
22
|
-
tiler.reference = options.ref if options.ref
|
23
|
-
tiler.species = options.species
|
24
|
-
tiler.species_map = options.species_map
|
30
|
+
def apply_options(tiler)
|
31
|
+
tiler.reference = $options.ref if $options.ref
|
32
|
+
tiler.species = $options.species
|
33
|
+
tiler.species_map = $options.species_map
|
34
|
+
tiler.fill_char = $options.fill_char if $options.fill_char
|
25
35
|
end
|
26
36
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
37
|
+
def each_tiler(access, intervals)
|
38
|
+
intervals.each do |int|
|
39
|
+
access.tile(int) do |tiler|
|
40
|
+
apply_options(tiler)
|
41
|
+
yield tiler
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
$options = OpenStruct.new
|
47
|
+
$options.p = { :threads => 1 }
|
48
|
+
$options.species = []
|
49
|
+
$options.species_map = {}
|
50
|
+
$options.usage = false
|
32
51
|
|
33
52
|
o_parser = OptionParser.new do |opts|
|
34
53
|
opts.banner = "Usage: maf_tile [options] <maf> [index]"
|
35
54
|
opts.separator ""
|
36
55
|
opts.separator "Options:"
|
37
56
|
opts.on("-r", "--reference SEQ", "FASTA reference sequence") do |ref|
|
38
|
-
options.ref = ref
|
39
|
-
end
|
40
|
-
opts.on("-i", "--interval [CHR:]BEGIN
|
41
|
-
if int =~ /(.+):(\d+)
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
options.interval = ($1.to_i)...($2.to_i)
|
57
|
+
$options.ref = ref
|
58
|
+
end
|
59
|
+
opts.on("-i", "--interval [CHR:]BEGIN-END", "Genomic interval, zero-based") do |int|
|
60
|
+
if int =~ /(.+):(\d+)-(\d+)/
|
61
|
+
$options.genomic_interval_spec = [$1, $2.to_i, $3.to_i]
|
62
|
+
elsif int =~ /(\d+)-(\d+)/
|
63
|
+
$options.interval = ($1.to_i)...($2.to_i)
|
46
64
|
else
|
47
65
|
$stderr.puts "Invalid interval specification #{int}!"
|
48
|
-
options.usage = true
|
66
|
+
$options.usage = true
|
49
67
|
end
|
50
68
|
end
|
51
|
-
opts.on("-
|
69
|
+
opts.on("--one-based",
|
70
|
+
"Treat all intervals as one-based",
|
71
|
+
"(even from BED files, contrary to the standard)") do
|
72
|
+
$options.one_based = true
|
73
|
+
end
|
74
|
+
opts.on("-s", "--species SPECIES[:NAME]",
|
75
|
+
"Species to use (mapped name optional)",
|
76
|
+
"(can be a comma-separated list)") do |sp|
|
52
77
|
if sp =~ /:/
|
53
78
|
species, mapped = sp.split(/:/)
|
54
|
-
options.species << species
|
55
|
-
options.species_map[species] = mapped
|
79
|
+
$options.species << species
|
80
|
+
$options.species_map[species] = mapped
|
81
|
+
elsif sp =~ /,/
|
82
|
+
$options.species.concat(sp.split(/,/))
|
56
83
|
else
|
57
|
-
options.species << sp
|
84
|
+
$options.species << sp
|
85
|
+
end
|
86
|
+
end
|
87
|
+
opts.on("--species-file FILE", "File specifying species and optionally mapped names") do |file|
|
88
|
+
File.open(file) do |f|
|
89
|
+
f.each_line do |line|
|
90
|
+
next if line =~ /^#/
|
91
|
+
parts = line.split
|
92
|
+
next unless parts.size > 0
|
93
|
+
$options.species << parts[0]
|
94
|
+
$options.species_map[parts[0]] = parts[1] if parts[1]
|
95
|
+
end
|
58
96
|
end
|
59
97
|
end
|
60
98
|
opts.on("-o", "--output-base BASE", "Base name for output files",
|
61
99
|
"Use stdout for a single interval if not given") do |base|
|
62
|
-
options.output_base = base
|
100
|
+
$options.output_base = base
|
63
101
|
end
|
64
102
|
opts.on("--bed BED", "BED file specifying intervals",
|
65
103
|
"(requires --output-base)") do |bed|
|
66
|
-
options.bed = bed
|
104
|
+
$options.bed = bed
|
105
|
+
end
|
106
|
+
opts.on("--bed-species SPECIES",
|
107
|
+
"Species to prepend to BED chromosome specs") do |species|
|
108
|
+
$options.bed_species = species
|
109
|
+
end
|
110
|
+
opts.on("--fill-char C",
|
111
|
+
"Fill gaps with character C",
|
112
|
+
"(default is *)") do |char|
|
113
|
+
$options.fill_char = char
|
114
|
+
end
|
115
|
+
opts.on("--upcase", "Fold all sequence data to upper case") do
|
116
|
+
$options.p[:upcase] = true
|
117
|
+
end
|
118
|
+
opts.on("--concat", "Concatenate result blocks") do
|
119
|
+
$options.concat = true
|
67
120
|
end
|
68
121
|
Bio::MAF::handle_logging_options(opts)
|
69
122
|
end
|
@@ -74,52 +127,96 @@ Bio::Log::CLI.configure('bio-maf')
|
|
74
127
|
maf_p = ARGV.shift
|
75
128
|
index_p = ARGV.shift
|
76
129
|
|
77
|
-
unless
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
130
|
+
unless maf_p
|
131
|
+
$stderr.puts "Must specify MAF file to process!"
|
132
|
+
$options.usage = true
|
133
|
+
end
|
134
|
+
|
135
|
+
if $options.species.empty?
|
136
|
+
$stderr.puts "Must specify species to tile with --species!"
|
137
|
+
$options.usage = true
|
138
|
+
end
|
139
|
+
|
140
|
+
unless $options.bed || $options.interval || $options.genomic_interval_spec
|
141
|
+
$stderr.puts "Must specify --bed or --interval!"
|
142
|
+
$options.usage = true
|
143
|
+
end
|
144
|
+
|
145
|
+
if $options.bed && ! ($options.output_base || $options.concat)
|
146
|
+
$stderr.puts "Must specify --output-base or --concat when specifying --bed!"
|
147
|
+
$options.usage = true
|
148
|
+
end
|
149
|
+
|
150
|
+
if (! $options.output_base) && ! ($options.interval || $options.genomic_interval_spec || ($options.bed && $options.concat))
|
151
|
+
$stderr.puts "Must specify --interval or --bed with --concat if --output-base is not given!"
|
152
|
+
$options.usage = true
|
153
|
+
end
|
154
|
+
|
155
|
+
if $options.usage
|
82
156
|
$stderr.puts o_parser
|
83
157
|
exit 2
|
84
158
|
end
|
85
159
|
|
86
160
|
access = if File.directory? maf_p
|
87
|
-
Bio::MAF::Access.maf_dir(maf_p, options.p)
|
161
|
+
Bio::MAF::Access.maf_dir(maf_p, $options.p)
|
88
162
|
else
|
89
|
-
Bio::MAF::Access.file(maf_p, index_p, options.p)
|
163
|
+
Bio::MAF::Access.file(maf_p, index_p, $options.p)
|
90
164
|
end
|
91
165
|
|
92
|
-
if options.bed
|
166
|
+
if $options.bed
|
93
167
|
intervals = []
|
94
|
-
File.open(options.bed) do |bed_f|
|
168
|
+
File.open($options.bed) do |bed_f|
|
95
169
|
bed_f.each_line { |line| intervals << parse_interval(line) }
|
96
170
|
end
|
97
|
-
intervals.sort_by! { |int| int.zero_start }
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
171
|
+
#intervals.sort_by! { |int| int.zero_start }
|
172
|
+
if $options.concat
|
173
|
+
# concatenate, as with exons
|
174
|
+
concat = Array.new($options.species.size)
|
175
|
+
concat.fill { '' }
|
176
|
+
non_fill = nil
|
177
|
+
each_tiler(access, intervals) do |tiler|
|
178
|
+
non_fill = tiler.non_fill_re if ! non_fill
|
179
|
+
concat.zip(tiler.build_bio_alignment) do |buf, seq|
|
180
|
+
buf << seq.to_s
|
181
|
+
end
|
182
|
+
end
|
183
|
+
fh = $options.output_base ? File.open($options.output_base, 'wb') : $stdout
|
184
|
+
$options.species.zip(concat) do |species, seq|
|
185
|
+
if non_fill.match(seq)
|
186
|
+
sp_out_name = $options.species_map[species] || species
|
187
|
+
fh.puts ">#{sp_out_name}", seq.scan(/.{1,70}/)
|
188
|
+
end
|
189
|
+
end
|
190
|
+
else
|
191
|
+
# output each interval separately
|
192
|
+
each_tiler(access, intervals) do |tiler|
|
193
|
+
target_for($options.output_base, tiler.interval) do |target|
|
102
194
|
tiler.write_fasta(target)
|
103
195
|
end
|
104
196
|
end
|
105
197
|
end
|
106
198
|
else
|
107
199
|
# single interval
|
108
|
-
if options.
|
109
|
-
|
200
|
+
if $options.genomic_interval_spec
|
201
|
+
spec = $options.genomic_interval_spec
|
202
|
+
if $options.one_based
|
203
|
+
interval = Bio::GenomicInterval.new(*spec)
|
204
|
+
else
|
205
|
+
interval = Bio::GenomicInterval.zero_based(*spec)
|
206
|
+
end
|
110
207
|
else
|
111
208
|
if access.indices.size != 1
|
112
209
|
raise "Must explicitly specify sequence in --interval argument with multiple candidate MAF files!"
|
113
210
|
end
|
114
211
|
ref_seq = access.indices.keys.first
|
115
212
|
interval = Bio::GenomicInterval.zero_based(ref_seq,
|
116
|
-
options.interval.begin,
|
117
|
-
options.interval.end)
|
213
|
+
$options.interval.begin,
|
214
|
+
$options.interval.end)
|
118
215
|
end
|
119
216
|
access.tile(interval) do |tiler|
|
120
|
-
apply_options(
|
121
|
-
if options.output_base
|
122
|
-
target = target_for(options.output_base, tiler.interval)
|
217
|
+
apply_options(tiler)
|
218
|
+
if $options.output_base
|
219
|
+
target = target_for($options.output_base, tiler.interval)
|
123
220
|
else
|
124
221
|
target = $stdout
|
125
222
|
end
|