bio-maf 1.0.0-java → 1.0.1-java
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/maf_bgzip +140 -12
- data/bin/maf_extract +50 -40
- data/bin/maf_index +11 -2
- data/bin/maf_tile +143 -46
- data/bio-maf.gemspec +3 -3
- data/features/bgzf.feature +45 -0
- data/features/maf-indexing.feature +6 -0
- data/features/maf-parsing.feature +17 -0
- data/features/maf-querying.feature +11 -0
- data/features/slice.feature +11 -0
- data/features/step_definitions/parse_steps.rb +1 -0
- data/features/tiling.feature +23 -5
- data/lib/bio-maf.rb +5 -1
- data/lib/bio/maf.rb +1 -0
- data/lib/bio/maf/index.rb +158 -68
- data/lib/bio/maf/jobs.rb +168 -0
- data/lib/bio/maf/maf.rb +24 -1
- data/lib/bio/maf/parser.rb +90 -35
- data/lib/bio/maf/struct.rb +4 -0
- data/lib/bio/maf/tiler.rb +30 -3
- data/lib/bio/ucsc/ucsc_bin.rb +14 -1
- data/man/maf_bgzip.1 +27 -0
- data/man/maf_bgzip.1.ronn +32 -0
- data/spec/bio/maf/index_spec.rb +3 -1
- data/spec/bio/maf/parser_spec.rb +6 -2
- data/spec/bio/ucsc/ucsc_bin_spec.rb +18 -0
- data/test/data/empty.maf +2 -0
- data/test/data/ext-bin.maf +22 -0
- data/test/data/gap-1.kct +0 -0
- data/test/data/mm8_chr7_tiny.kct +0 -0
- data/test/data/mm8_chrM_tiny.kct +0 -0
- metadata +380 -184
data/bin/maf_bgzip
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'optparse'
|
3
4
|
require 'ostruct'
|
4
5
|
|
5
6
|
require 'bio-maf'
|
@@ -8,6 +9,9 @@ require 'bio-bgzf'
|
|
8
9
|
$options = OpenStruct.new
|
9
10
|
$options.dir = '.'
|
10
11
|
$options.ref_only = true
|
12
|
+
$options.n_jobs = 1
|
13
|
+
$options.force = false
|
14
|
+
$options.level = 2
|
11
15
|
|
12
16
|
op = OptionParser.new do |opts|
|
13
17
|
opts.banner = "Usage: maf_bgzip [options] [<maf> ...]"
|
@@ -26,31 +30,155 @@ op = OptionParser.new do |opts|
|
|
26
30
|
"(has no effect without --index)") do
|
27
31
|
$options.ref_only = false
|
28
32
|
end
|
33
|
+
opts.on("-l", "--level LEVEL", Integer,
|
34
|
+
"gzip compression level for BGZF (1-9)") do |level|
|
35
|
+
unless 1 <= level && level <= 9
|
36
|
+
$stderr.puts "Invalid compression level: #{level}"
|
37
|
+
$stderr.puts opts
|
38
|
+
exit 2
|
39
|
+
end
|
40
|
+
$options.level = level
|
41
|
+
end
|
42
|
+
opts.on("-f", "--force",
|
43
|
+
"Replace output files if they already exist") do
|
44
|
+
$options.force = true
|
45
|
+
end
|
46
|
+
opts.on("-j", "--jobs N", Integer,
|
47
|
+
"Run N concurrent jobs (default 1)") do |n|
|
48
|
+
$options.n_jobs = n
|
49
|
+
end
|
50
|
+
Bio::MAF::handle_logging_options(opts)
|
29
51
|
end
|
30
52
|
|
31
53
|
op.parse!(ARGV)
|
54
|
+
Bio::Log::CLI.configure('bio-maf')
|
32
55
|
|
33
|
-
|
34
|
-
|
35
|
-
|
56
|
+
INTERVAL = 10
|
57
|
+
LOG = Bio::MAF::LOG
|
58
|
+
|
59
|
+
def make_processing_task(maf)
|
60
|
+
maf_base = File.basename(maf)
|
36
61
|
base = maf_base.gsub(/\.maf.*/, '')
|
37
62
|
bgz_path = "#{$options.dir}/#{base}.maf.bgz"
|
63
|
+
if File.exist?(bgz_path) && ! $options.force
|
64
|
+
LOG.error "#{bgz_path} already exists, refusing to overwrite " \
|
65
|
+
"without --force!"
|
66
|
+
exit 1
|
67
|
+
end
|
68
|
+
idx_path = nil
|
69
|
+
if $options.index
|
70
|
+
idx_path = "#{$options.dir}/#{base}.kct"
|
71
|
+
if File.exist?(idx_path) && ! $options.force
|
72
|
+
LOG.error "#{idx_path} already exists, refusing to overwrite " \
|
73
|
+
"without --force!"
|
74
|
+
exit 1
|
75
|
+
end
|
76
|
+
end
|
77
|
+
lambda { process_maf(maf, bgz_path, idx_path) }
|
78
|
+
end
|
79
|
+
|
80
|
+
def process_maf(maf_path, bgz_path, idx_path)
|
81
|
+
maf_base = File.basename(maf_path)
|
82
|
+
LOG.debug { "Processing #{maf_base}." }
|
38
83
|
p = Bio::MAF::Parser.new(maf_path,
|
39
|
-
:
|
40
|
-
|
41
|
-
|
42
|
-
|
84
|
+
:retain_text => true)
|
85
|
+
if idx_path
|
86
|
+
if File.exists?(idx_path)
|
87
|
+
File.unlink(idx_path)
|
88
|
+
end
|
89
|
+
idx = Bio::MAF::KyotoIndex.new(idx_path)
|
90
|
+
idx.prep(bgz_path, :bgzf, $options.ref_only)
|
91
|
+
exec = Bio::MAF::Executor.create
|
92
|
+
end
|
93
|
+
start_t = Time.now
|
94
|
+
last_t = start_t
|
95
|
+
last_pos = 0
|
96
|
+
n_blocks = 0
|
97
|
+
maf_size = File.size(maf_path)
|
98
|
+
File.open(bgz_path, 'wb') do |out_f|
|
99
|
+
Bio::BGZF::Writer.new(out_f, $options.level) do |bgz_w|
|
43
100
|
maf_w = Bio::MAF::Writer.new(bgz_w)
|
44
101
|
maf_w.write_header(p.header)
|
45
102
|
p.each_block do |block|
|
46
|
-
|
103
|
+
bgz_w.write(block.orig_text)
|
104
|
+
if idx
|
105
|
+
block.offset = bgz_w.last_write_pos
|
106
|
+
exec.submit do
|
107
|
+
idx.index_blocks([block])
|
108
|
+
end
|
109
|
+
end
|
110
|
+
n_blocks += 1
|
111
|
+
if n_blocks % 100 == 0
|
112
|
+
cur_t = Time.now
|
113
|
+
delta_t = cur_t - last_t
|
114
|
+
if delta_t > INTERVAL
|
115
|
+
cur_pos = p.phys_f.tell
|
116
|
+
LOG.debug {
|
117
|
+
pos_mb = cur_pos.to_f / 1048576
|
118
|
+
delta_bytes = cur_pos - last_pos
|
119
|
+
rate = delta_bytes.to_f / delta_t
|
120
|
+
mb_rate = rate / 1048576
|
121
|
+
pct = cur_pos.to_f / maf_size * 100
|
122
|
+
elapsed = cur_t - start_t
|
123
|
+
sprintf("%s: processed %.1f MB (%.1f%%) in %ds, %.2f MB/s.",
|
124
|
+
maf_base,
|
125
|
+
pos_mb,
|
126
|
+
pct,
|
127
|
+
elapsed,
|
128
|
+
mb_rate)
|
129
|
+
}
|
130
|
+
last_t = cur_t
|
131
|
+
last_pos = cur_pos
|
132
|
+
end
|
133
|
+
end
|
47
134
|
end
|
48
135
|
end
|
49
136
|
end
|
137
|
+
unc = p.f.tell if p.f != p.phys_f
|
50
138
|
p.close
|
51
|
-
if
|
52
|
-
|
53
|
-
|
54
|
-
|
139
|
+
if idx
|
140
|
+
exec.shutdown
|
141
|
+
idx.db.synchronize(true)
|
142
|
+
end
|
143
|
+
elapsed = Time.now - start_t
|
144
|
+
mb = maf_size.to_f / 1048576
|
145
|
+
mb_rate = mb / elapsed
|
146
|
+
LOG.info { sprintf("Processed %s (%.1f MB) in %ds, %.2f MB/s",
|
147
|
+
maf_base,
|
148
|
+
mb,
|
149
|
+
elapsed,
|
150
|
+
mb_rate) }
|
151
|
+
if unc
|
152
|
+
LOG.info {
|
153
|
+
unc_mb = unc / 1048576
|
154
|
+
unc_rate = unc_mb / elapsed
|
155
|
+
sprintf(" Uncompressed: %.1f MB, %.2f MB/s",
|
156
|
+
unc_mb, unc_rate)
|
157
|
+
}
|
55
158
|
end
|
159
|
+
LOG.info {
|
160
|
+
raw_size = unc || maf_size
|
161
|
+
avg_block_kb = raw_size.to_f / n_blocks / 1024
|
162
|
+
sprintf(" %d alignment blocks, average size %.2f KB",
|
163
|
+
n_blocks, avg_block_kb)
|
164
|
+
}
|
165
|
+
LOG.info {
|
166
|
+
orig_size = unc ? unc : maf_size
|
167
|
+
bgzf_size = File.size(bgz_path).to_f
|
168
|
+
ratio = bgzf_size / orig_size
|
169
|
+
sprintf(" Compressed with BGZF (level=%d) to %.1f MB (%.1fx)",
|
170
|
+
$options.level,
|
171
|
+
bgzf_size / 1048576,
|
172
|
+
ratio)
|
173
|
+
}
|
174
|
+
end
|
175
|
+
|
176
|
+
runner = Bio::MAF::JobRunner.create($options.n_jobs)
|
177
|
+
LOG.debug "Created #{runner.class} set for #{$options.n_jobs} concurrent jobs."
|
178
|
+
ARGV.each do |maf|
|
179
|
+
task = make_processing_task(maf)
|
180
|
+
runner.add(&task)
|
56
181
|
end
|
182
|
+
LOG.debug "Running jobs."
|
183
|
+
runner.run
|
184
|
+
LOG.debug "Finished processing."
|
data/bin/maf_extract
CHANGED
@@ -6,12 +6,13 @@ require 'ostruct'
|
|
6
6
|
|
7
7
|
include Bio::MAF
|
8
8
|
|
9
|
-
options = OpenStruct.new
|
10
|
-
options.mode = :intersect
|
11
|
-
options.format = :maf
|
12
|
-
options.
|
13
|
-
options.
|
14
|
-
options.
|
9
|
+
$options = OpenStruct.new
|
10
|
+
$options.mode = :intersect
|
11
|
+
$options.format = :maf
|
12
|
+
$options.one_based = false
|
13
|
+
$options.seq_filter = {}
|
14
|
+
$options.block_filter = {}
|
15
|
+
$options.parse_options = {}
|
15
16
|
|
16
17
|
def handle_list_spec(spec)
|
17
18
|
if spec =~ /^@(.+)/
|
@@ -23,7 +24,11 @@ end
|
|
23
24
|
|
24
25
|
def handle_interval_spec(int)
|
25
26
|
if int =~ /(.+):(\d+)-(\d+)/
|
26
|
-
|
27
|
+
if $options.one_based
|
28
|
+
Bio::GenomicInterval.new($1, $2.to_i, $3.to_i)
|
29
|
+
else
|
30
|
+
Bio::GenomicInterval.zero_based($1, $2.to_i, $3.to_i)
|
31
|
+
end
|
27
32
|
else
|
28
33
|
raise "Invalid interval specification: #{int}"
|
29
34
|
end
|
@@ -34,13 +39,13 @@ $op = OptionParser.new do |opts|
|
|
34
39
|
opts.separator ""
|
35
40
|
opts.separator "MAF source options (either --maf or --maf-dir must be given):"
|
36
41
|
opts.on("-m", "--maf MAF", "MAF file") do |maf|
|
37
|
-
options.maf = maf
|
42
|
+
$options.maf = maf
|
38
43
|
end
|
39
44
|
opts.on("-i", "--index INDEX", "MAF index") do |idx|
|
40
|
-
options.idx = idx
|
45
|
+
$options.idx = idx
|
41
46
|
end
|
42
47
|
opts.on("-d", "--maf-dir DIR", "MAF directory") do |dir|
|
43
|
-
options.maf_dir = dir
|
48
|
+
$options.maf_dir = dir
|
44
49
|
end
|
45
50
|
opts.separator ""
|
46
51
|
opts.separator "Extraction options:"
|
@@ -49,21 +54,26 @@ $op = OptionParser.new do |opts|
|
|
49
54
|
"blocks intersecting the given region,",
|
50
55
|
"or 'slice' to extract subsets covering ",
|
51
56
|
"given regions") do |mode|
|
52
|
-
options.mode = mode
|
57
|
+
$options.mode = mode
|
53
58
|
end
|
54
59
|
opts.on("--bed BED", "Use intervals from the given BED file") do |bed|
|
55
|
-
options.bed = bed
|
60
|
+
$options.bed = bed
|
56
61
|
end
|
57
62
|
opts.on("--interval SEQ:START:END", "Zero-based genomic interval to match") do |int|
|
58
|
-
options.interval = handle_interval_spec(int)
|
63
|
+
$options.interval = handle_interval_spec(int)
|
64
|
+
end
|
65
|
+
opts.on("--one-based",
|
66
|
+
"Treat all intervals as one-based",
|
67
|
+
"(even from BED files, contrary to the standard)") do
|
68
|
+
$options.one_based = true
|
59
69
|
end
|
60
70
|
opts.separator ""
|
61
71
|
opts.separator "Output options:"
|
62
72
|
opts.on("-f", "--format FMT", [:maf, :fasta], "Output format") do |fmt|
|
63
|
-
options.format = fmt
|
73
|
+
$options.format = fmt
|
64
74
|
end
|
65
75
|
opts.on("-o", "--output OUT", "Write output to file OUT") do |out|
|
66
|
-
options.out_path = out
|
76
|
+
$options.out_path = out
|
67
77
|
end
|
68
78
|
opts.separator ""
|
69
79
|
opts.separator "Filtering options:"
|
@@ -71,41 +81,41 @@ $op = OptionParser.new do |opts|
|
|
71
81
|
"Filter out all but the species in the",
|
72
82
|
"given comma-separated list",
|
73
83
|
"(or @FILE to read from a file)") do |spec|
|
74
|
-
options.seq_filter[:only_species] = handle_list_spec(spec)
|
84
|
+
$options.seq_filter[:only_species] = handle_list_spec(spec)
|
75
85
|
end
|
76
86
|
opts.on("--with-all-species SPECIES",
|
77
87
|
"Only match blocks with all the given",
|
78
88
|
"species, comma-separated",
|
79
89
|
"(or @FILE to read from a file)") do |spec|
|
80
|
-
options.block_filter[:with_all_species] = handle_list_spec(spec)
|
90
|
+
$options.block_filter[:with_all_species] = handle_list_spec(spec)
|
81
91
|
end
|
82
92
|
opts.on("--min-sequences N", Integer,
|
83
93
|
"Match only blocks with at least N sequences") do |n|
|
84
|
-
options.block_filter[:at_least_n_sequences] = n
|
94
|
+
$options.block_filter[:at_least_n_sequences] = n
|
85
95
|
end
|
86
96
|
opts.on("--min-text-size N", Integer,
|
87
97
|
"Match only blocks with minimum text size N") do |n|
|
88
|
-
options.block_filter[:min_size] = n
|
98
|
+
$options.block_filter[:min_size] = n
|
89
99
|
end
|
90
100
|
opts.on("--max-text-size N", Integer,
|
91
101
|
"Match only blocks with maximum text size N") do |n|
|
92
|
-
options.block_filter[:max_size] = n
|
102
|
+
$options.block_filter[:max_size] = n
|
93
103
|
end
|
94
104
|
opts.separator ""
|
95
105
|
opts.separator "Block processing options:"
|
96
106
|
opts.on("--join-blocks",
|
97
107
|
"Join blocks if appropriate after filtering",
|
98
108
|
"out sequences") do
|
99
|
-
options.parse_options[:join_blocks] = true
|
109
|
+
$options.parse_options[:join_blocks] = true
|
100
110
|
end
|
101
111
|
opts.on("--remove-gaps", "Remove gaps after filtering out sequences") do
|
102
|
-
options.parse_options[:remove_gaps] = true
|
112
|
+
$options.parse_options[:remove_gaps] = true
|
103
113
|
end
|
104
114
|
opts.on("--parse-extended", "Parse 'extended' MAF data (i, q lines)") do
|
105
|
-
options.parse_options[:parse_extended] = true
|
115
|
+
$options.parse_options[:parse_extended] = true
|
106
116
|
end
|
107
117
|
opts.on("--parse-empty", "Parse empty (e) lines of MAF data") do
|
108
|
-
options.parse_options[:parse_empty] = true
|
118
|
+
$options.parse_options[:parse_empty] = true
|
109
119
|
end
|
110
120
|
opts.separator ""
|
111
121
|
opts.separator "Logging options:"
|
@@ -120,24 +130,24 @@ def usage(msg)
|
|
120
130
|
exit 2
|
121
131
|
end
|
122
132
|
|
123
|
-
if options.maf
|
124
|
-
access = Access.file(options.maf, options.idx, options.parse_options)
|
125
|
-
elsif options.maf_dir
|
126
|
-
access = Access.maf_dir(options.maf_dir, options.parse_options)
|
133
|
+
if $options.maf
|
134
|
+
access = Access.file($options.maf, $options.idx, $options.parse_options)
|
135
|
+
elsif $options.maf_dir
|
136
|
+
access = Access.maf_dir($options.maf_dir, $options.parse_options)
|
127
137
|
else
|
128
138
|
usage "Must supply --maf or --maf-dir!"
|
129
139
|
end
|
130
140
|
|
131
141
|
begin
|
132
|
-
access.sequence_filter = options.seq_filter unless options.seq_filter.empty?
|
133
|
-
access.block_filter = options.block_filter unless options.block_filter.empty?
|
134
|
-
if options.out_path
|
135
|
-
outf = File.open(options.out_path, 'w')
|
142
|
+
access.sequence_filter = $options.seq_filter unless $options.seq_filter.empty?
|
143
|
+
access.block_filter = $options.block_filter unless $options.block_filter.empty?
|
144
|
+
if $options.out_path
|
145
|
+
outf = File.open($options.out_path, 'w')
|
136
146
|
else
|
137
147
|
outf = $stdout
|
138
148
|
end
|
139
149
|
|
140
|
-
case options.format
|
150
|
+
case $options.format
|
141
151
|
when :maf
|
142
152
|
writer = Writer.new(outf)
|
143
153
|
when :fasta
|
@@ -146,20 +156,20 @@ begin
|
|
146
156
|
raise "unsupported output format #{format}!"
|
147
157
|
end
|
148
158
|
|
149
|
-
if options.bed
|
150
|
-
intervals = read_bed_intervals(options.bed)
|
151
|
-
elsif options.interval
|
152
|
-
intervals = [options.interval]
|
159
|
+
if $options.bed
|
160
|
+
intervals = read_bed_intervals($options.bed)
|
161
|
+
elsif $options.interval
|
162
|
+
intervals = [$options.interval]
|
153
163
|
else
|
154
164
|
usage "Must supply --interval or --bed!"
|
155
165
|
end
|
156
166
|
|
157
167
|
# TODO: provide access to original MAF header?
|
158
|
-
if options.format == :maf
|
168
|
+
if $options.format == :maf
|
159
169
|
writer.write_header(Header.default)
|
160
170
|
end
|
161
171
|
|
162
|
-
case options.mode
|
172
|
+
case $options.mode
|
163
173
|
when :intersect
|
164
174
|
access.find(intervals) do |block|
|
165
175
|
writer.write_block(block)
|
@@ -172,7 +182,7 @@ begin
|
|
172
182
|
end
|
173
183
|
end
|
174
184
|
else
|
175
|
-
raise "Unsupported mode #{options.mode}!"
|
185
|
+
raise "Unsupported mode #{$options.mode}!"
|
176
186
|
end
|
177
187
|
|
178
188
|
ensure
|
data/bin/maf_index
CHANGED
@@ -14,10 +14,10 @@ PRINTERS = {
|
|
14
14
|
$options = OpenStruct.new
|
15
15
|
$options.mode = :build
|
16
16
|
$options.ref_only = true
|
17
|
+
$options.parser_opts = { :parse_extended => false }
|
17
18
|
|
18
19
|
def build_index(maf, index)
|
19
|
-
parser = Bio::MAF::Parser.new(maf,
|
20
|
-
:parse_extended => false)
|
20
|
+
parser = Bio::MAF::Parser.new(maf, $options.parser_opts)
|
21
21
|
idx = Bio::MAF::KyotoIndex.build(parser, index, $options.ref_only)
|
22
22
|
idx.close
|
23
23
|
end
|
@@ -36,6 +36,15 @@ op = OptionParser.new do |opts|
|
|
36
36
|
opts.on("-d", "--dump", "Dump contents of given INDEX") do
|
37
37
|
$options.mode = :dump
|
38
38
|
end
|
39
|
+
opts.on("-O", "--parser-option OPT") do |opt|
|
40
|
+
if opt =~ /(-?)(.+)/
|
41
|
+
val = ! ($1 == "-")
|
42
|
+
option = $2.to_sym
|
43
|
+
$options.parser_opts[option] = val
|
44
|
+
else
|
45
|
+
raise "malformed parser option #{opt}!"
|
46
|
+
end
|
47
|
+
end
|
39
48
|
opts.on("--ruby-prof PATH", "Profile with ruby-prof") do |pspec|
|
40
49
|
require 'ruby-prof'
|
41
50
|
if pspec =~ /(\w+):(.+)/
|
data/bin/maf_tile
CHANGED
@@ -10,7 +10,16 @@ def parse_interval(line)
|
|
10
10
|
src, r_start_s, r_end_s, _ = line.split(nil, 4)
|
11
11
|
r_start = r_start_s.to_i
|
12
12
|
r_end = r_end_s.to_i
|
13
|
-
|
13
|
+
i_src = if $options.bed_species
|
14
|
+
"#{$options.bed_species}.#{src}"
|
15
|
+
else
|
16
|
+
src
|
17
|
+
end
|
18
|
+
if $options.one_based
|
19
|
+
Bio::GenomicInterval.new(i_src, r_start, r_end)
|
20
|
+
else
|
21
|
+
Bio::GenomicInterval.zero_based(i_src, r_start, r_end)
|
22
|
+
end
|
14
23
|
end
|
15
24
|
|
16
25
|
def target_for(base, interval, &blk)
|
@@ -18,52 +27,96 @@ def target_for(base, interval, &blk)
|
|
18
27
|
File.open(path, 'w', &blk)
|
19
28
|
end
|
20
29
|
|
21
|
-
def apply_options(
|
22
|
-
tiler.reference = options.ref if options.ref
|
23
|
-
tiler.species = options.species
|
24
|
-
tiler.species_map = options.species_map
|
30
|
+
def apply_options(tiler)
|
31
|
+
tiler.reference = $options.ref if $options.ref
|
32
|
+
tiler.species = $options.species
|
33
|
+
tiler.species_map = $options.species_map
|
34
|
+
tiler.fill_char = $options.fill_char if $options.fill_char
|
25
35
|
end
|
26
36
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
37
|
+
def each_tiler(access, intervals)
|
38
|
+
intervals.each do |int|
|
39
|
+
access.tile(int) do |tiler|
|
40
|
+
apply_options(tiler)
|
41
|
+
yield tiler
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
$options = OpenStruct.new
|
47
|
+
$options.p = { :threads => 1 }
|
48
|
+
$options.species = []
|
49
|
+
$options.species_map = {}
|
50
|
+
$options.usage = false
|
32
51
|
|
33
52
|
o_parser = OptionParser.new do |opts|
|
34
53
|
opts.banner = "Usage: maf_tile [options] <maf> [index]"
|
35
54
|
opts.separator ""
|
36
55
|
opts.separator "Options:"
|
37
56
|
opts.on("-r", "--reference SEQ", "FASTA reference sequence") do |ref|
|
38
|
-
options.ref = ref
|
39
|
-
end
|
40
|
-
opts.on("-i", "--interval [CHR:]BEGIN
|
41
|
-
if int =~ /(.+):(\d+)
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
options.interval = ($1.to_i)...($2.to_i)
|
57
|
+
$options.ref = ref
|
58
|
+
end
|
59
|
+
opts.on("-i", "--interval [CHR:]BEGIN-END", "Genomic interval, zero-based") do |int|
|
60
|
+
if int =~ /(.+):(\d+)-(\d+)/
|
61
|
+
$options.genomic_interval_spec = [$1, $2.to_i, $3.to_i]
|
62
|
+
elsif int =~ /(\d+)-(\d+)/
|
63
|
+
$options.interval = ($1.to_i)...($2.to_i)
|
46
64
|
else
|
47
65
|
$stderr.puts "Invalid interval specification #{int}!"
|
48
|
-
options.usage = true
|
66
|
+
$options.usage = true
|
49
67
|
end
|
50
68
|
end
|
51
|
-
opts.on("-
|
69
|
+
opts.on("--one-based",
|
70
|
+
"Treat all intervals as one-based",
|
71
|
+
"(even from BED files, contrary to the standard)") do
|
72
|
+
$options.one_based = true
|
73
|
+
end
|
74
|
+
opts.on("-s", "--species SPECIES[:NAME]",
|
75
|
+
"Species to use (mapped name optional)",
|
76
|
+
"(can be a comma-separated list)") do |sp|
|
52
77
|
if sp =~ /:/
|
53
78
|
species, mapped = sp.split(/:/)
|
54
|
-
options.species << species
|
55
|
-
options.species_map[species] = mapped
|
79
|
+
$options.species << species
|
80
|
+
$options.species_map[species] = mapped
|
81
|
+
elsif sp =~ /,/
|
82
|
+
$options.species.concat(sp.split(/,/))
|
56
83
|
else
|
57
|
-
options.species << sp
|
84
|
+
$options.species << sp
|
85
|
+
end
|
86
|
+
end
|
87
|
+
opts.on("--species-file FILE", "File specifying species and optionally mapped names") do |file|
|
88
|
+
File.open(file) do |f|
|
89
|
+
f.each_line do |line|
|
90
|
+
next if line =~ /^#/
|
91
|
+
parts = line.split
|
92
|
+
next unless parts.size > 0
|
93
|
+
$options.species << parts[0]
|
94
|
+
$options.species_map[parts[0]] = parts[1] if parts[1]
|
95
|
+
end
|
58
96
|
end
|
59
97
|
end
|
60
98
|
opts.on("-o", "--output-base BASE", "Base name for output files",
|
61
99
|
"Use stdout for a single interval if not given") do |base|
|
62
|
-
options.output_base = base
|
100
|
+
$options.output_base = base
|
63
101
|
end
|
64
102
|
opts.on("--bed BED", "BED file specifying intervals",
|
65
103
|
"(requires --output-base)") do |bed|
|
66
|
-
options.bed = bed
|
104
|
+
$options.bed = bed
|
105
|
+
end
|
106
|
+
opts.on("--bed-species SPECIES",
|
107
|
+
"Species to prepend to BED chromosome specs") do |species|
|
108
|
+
$options.bed_species = species
|
109
|
+
end
|
110
|
+
opts.on("--fill-char C",
|
111
|
+
"Fill gaps with character C",
|
112
|
+
"(default is *)") do |char|
|
113
|
+
$options.fill_char = char
|
114
|
+
end
|
115
|
+
opts.on("--upcase", "Fold all sequence data to upper case") do
|
116
|
+
$options.p[:upcase] = true
|
117
|
+
end
|
118
|
+
opts.on("--concat", "Concatenate result blocks") do
|
119
|
+
$options.concat = true
|
67
120
|
end
|
68
121
|
Bio::MAF::handle_logging_options(opts)
|
69
122
|
end
|
@@ -74,52 +127,96 @@ Bio::Log::CLI.configure('bio-maf')
|
|
74
127
|
maf_p = ARGV.shift
|
75
128
|
index_p = ARGV.shift
|
76
129
|
|
77
|
-
unless
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
130
|
+
unless maf_p
|
131
|
+
$stderr.puts "Must specify MAF file to process!"
|
132
|
+
$options.usage = true
|
133
|
+
end
|
134
|
+
|
135
|
+
if $options.species.empty?
|
136
|
+
$stderr.puts "Must specify species to tile with --species!"
|
137
|
+
$options.usage = true
|
138
|
+
end
|
139
|
+
|
140
|
+
unless $options.bed || $options.interval || $options.genomic_interval_spec
|
141
|
+
$stderr.puts "Must specify --bed or --interval!"
|
142
|
+
$options.usage = true
|
143
|
+
end
|
144
|
+
|
145
|
+
if $options.bed && ! ($options.output_base || $options.concat)
|
146
|
+
$stderr.puts "Must specify --output-base or --concat when specifying --bed!"
|
147
|
+
$options.usage = true
|
148
|
+
end
|
149
|
+
|
150
|
+
if (! $options.output_base) && ! ($options.interval || $options.genomic_interval_spec || ($options.bed && $options.concat))
|
151
|
+
$stderr.puts "Must specify --interval or --bed with --concat if --output-base is not given!"
|
152
|
+
$options.usage = true
|
153
|
+
end
|
154
|
+
|
155
|
+
if $options.usage
|
82
156
|
$stderr.puts o_parser
|
83
157
|
exit 2
|
84
158
|
end
|
85
159
|
|
86
160
|
access = if File.directory? maf_p
|
87
|
-
Bio::MAF::Access.maf_dir(maf_p, options.p)
|
161
|
+
Bio::MAF::Access.maf_dir(maf_p, $options.p)
|
88
162
|
else
|
89
|
-
Bio::MAF::Access.file(maf_p, index_p, options.p)
|
163
|
+
Bio::MAF::Access.file(maf_p, index_p, $options.p)
|
90
164
|
end
|
91
165
|
|
92
|
-
if options.bed
|
166
|
+
if $options.bed
|
93
167
|
intervals = []
|
94
|
-
File.open(options.bed) do |bed_f|
|
168
|
+
File.open($options.bed) do |bed_f|
|
95
169
|
bed_f.each_line { |line| intervals << parse_interval(line) }
|
96
170
|
end
|
97
|
-
intervals.sort_by! { |int| int.zero_start }
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
171
|
+
#intervals.sort_by! { |int| int.zero_start }
|
172
|
+
if $options.concat
|
173
|
+
# concatenate, as with exons
|
174
|
+
concat = Array.new($options.species.size)
|
175
|
+
concat.fill { '' }
|
176
|
+
non_fill = nil
|
177
|
+
each_tiler(access, intervals) do |tiler|
|
178
|
+
non_fill = tiler.non_fill_re if ! non_fill
|
179
|
+
concat.zip(tiler.build_bio_alignment) do |buf, seq|
|
180
|
+
buf << seq.to_s
|
181
|
+
end
|
182
|
+
end
|
183
|
+
fh = $options.output_base ? File.open($options.output_base, 'wb') : $stdout
|
184
|
+
$options.species.zip(concat) do |species, seq|
|
185
|
+
if non_fill.match(seq)
|
186
|
+
sp_out_name = $options.species_map[species] || species
|
187
|
+
fh.puts ">#{sp_out_name}", seq.scan(/.{1,70}/)
|
188
|
+
end
|
189
|
+
end
|
190
|
+
else
|
191
|
+
# output each interval separately
|
192
|
+
each_tiler(access, intervals) do |tiler|
|
193
|
+
target_for($options.output_base, tiler.interval) do |target|
|
102
194
|
tiler.write_fasta(target)
|
103
195
|
end
|
104
196
|
end
|
105
197
|
end
|
106
198
|
else
|
107
199
|
# single interval
|
108
|
-
if options.
|
109
|
-
|
200
|
+
if $options.genomic_interval_spec
|
201
|
+
spec = $options.genomic_interval_spec
|
202
|
+
if $options.one_based
|
203
|
+
interval = Bio::GenomicInterval.new(*spec)
|
204
|
+
else
|
205
|
+
interval = Bio::GenomicInterval.zero_based(*spec)
|
206
|
+
end
|
110
207
|
else
|
111
208
|
if access.indices.size != 1
|
112
209
|
raise "Must explicitly specify sequence in --interval argument with multiple candidate MAF files!"
|
113
210
|
end
|
114
211
|
ref_seq = access.indices.keys.first
|
115
212
|
interval = Bio::GenomicInterval.zero_based(ref_seq,
|
116
|
-
options.interval.begin,
|
117
|
-
options.interval.end)
|
213
|
+
$options.interval.begin,
|
214
|
+
$options.interval.end)
|
118
215
|
end
|
119
216
|
access.tile(interval) do |tiler|
|
120
|
-
apply_options(
|
121
|
-
if options.output_base
|
122
|
-
target = target_for(options.output_base, tiler.interval)
|
217
|
+
apply_options(tiler)
|
218
|
+
if $options.output_base
|
219
|
+
target = target_for($options.output_base, tiler.interval)
|
123
220
|
else
|
124
221
|
target = $stdout
|
125
222
|
end
|