miga-base 0.3.6.3 → 0.3.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/actions/init.rb +1 -1
- data/actions/tax_dist.rb +32 -26
- data/bin/miga +38 -38
- data/lib/miga/daemon.rb +11 -5
- data/lib/miga/version.rb +1 -1
- data/scripts/ogs.bash +4 -2
- data/utils/cleanup-databases.rb +6 -5
- data/utils/distance/commands.rb +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
- data/utils/plot-taxdist.R +42 -33
- data/utils/requirements.txt +1 -0
- metadata +179 -179
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
- data/utils/enveomics/Scripts/lib/enveomics.R +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 38acfea63f14c8cca837d84fbca92fa70d134e76
|
4
|
+
data.tar.gz: 6c6f955e4ba5c3a90ead50a3ac38896df4febcf1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d4303872589a4d02f75d392957dfbcc06f5d3e1f78107a521833f98c3d11b155dbb16bb386e1ab50723d2cb12bcd2bba77ab87033c6a7916f49ce68ce59b4941
|
7
|
+
data.tar.gz: 05134a38b3082e0a5982beb5f23d2a882ce4bba23c34cda1d81bd697288ac841861b6f2a07eeeafe691fed53816125f3de3fd75bf9a7fbea3d49a5975558cb3f
|
data/actions/init.rb
CHANGED
@@ -154,7 +154,7 @@ end
|
|
154
154
|
|
155
155
|
# Check for R packages
|
156
156
|
$stderr.puts "Looking for R packages:"
|
157
|
-
%w(enveomics.R ape
|
157
|
+
%w(enveomics.R ape cluster vegan).each do |pkg|
|
158
158
|
$stderr.print "Testing #{pkg}... "
|
159
159
|
`echo "library('#{pkg}')" | #{paths["R"].shellescape} --vanilla -q 2>&1`
|
160
160
|
if $?.success?
|
data/actions/tax_dist.rb
CHANGED
@@ -3,34 +3,34 @@
|
|
3
3
|
# @package MiGA
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
6
|
+
require 'miga/tax_index'
|
7
|
+
require 'zlib'
|
8
|
+
require 'tmpdir'
|
9
9
|
|
10
|
-
o = {q:true, format: :json}
|
10
|
+
o = {q: true, format: :json}
|
11
11
|
OptionParser.new do |opt|
|
12
12
|
opt_banner(opt)
|
13
13
|
opt_object(opt, o, [:project])
|
14
14
|
opt_filter_datasets(opt, o)
|
15
|
-
opt.on(
|
16
|
-
|
17
|
-
|
18
|
-
){ |v| o[:index]=v }
|
15
|
+
opt.on('-i', '--index FILE',
|
16
|
+
'Pre-calculated tax-index (in tabular format) to be used.',
|
17
|
+
'If passed, dataset filtering arguments are ignored.'
|
18
|
+
){ |v| o[:index] = v }
|
19
19
|
opt_common(opt, o)
|
20
20
|
end.parse!
|
21
21
|
|
22
22
|
##=> Functions <=
|
23
23
|
# Returns the _canonical_ ID between strings +a+ and +b+.
|
24
|
-
def cannid(a, b) ; [a, b].
|
24
|
+
def cannid(a, b) ; (a > b ? [b, a] : [a, b]).join('-') ; end
|
25
25
|
|
26
26
|
##=> Main <=
|
27
|
-
opt_require(o, project:
|
27
|
+
opt_require(o, project: '-P')
|
28
28
|
|
29
|
-
$stderr.puts
|
29
|
+
$stderr.puts 'Loading project.' unless o[:q]
|
30
30
|
p = MiGA::Project.load(o[:project])
|
31
31
|
raise "Impossible to load project: #{o[:project]}" if p.nil?
|
32
32
|
|
33
|
-
metric = p.is_clade? ?
|
33
|
+
metric = p.is_clade? ? 'ani' : 'aai'
|
34
34
|
res_n = "#{metric}_distances"
|
35
35
|
$stderr.puts "Reading distances (1-#{metric.upcase})." unless o[:q]
|
36
36
|
res = p.result res_n
|
@@ -38,31 +38,33 @@ raise "#{res_n} not yet calculated." if res.nil?
|
|
38
38
|
matrix = res.file_path(:matrix)
|
39
39
|
raise "#{res_n} has no matrix." if matrix.nil?
|
40
40
|
dist = {}
|
41
|
-
mfh = matrix
|
41
|
+
mfh = matrix =~ /\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix, 'r')
|
42
42
|
mfh.each_line do |ln|
|
43
43
|
next if mfh.lineno==1
|
44
|
-
row = ln.chomp.split(
|
45
|
-
dist[cannid(row[1], row[2])] = [row[3], 0, [
|
44
|
+
row = ln.chomp.split("\t")
|
45
|
+
dist[cannid(row[1], row[2])] = [row[3], row[5], row[6], 0, ['root:biota']]
|
46
|
+
$stderr.print(" Ln:#{mfh.lineno} \r") if !o[:q] and (mfh.lineno % 1_000) == 0
|
46
47
|
end
|
48
|
+
$stderr.puts " Lines: #{mfh.lineno}" unless o[:q]
|
47
49
|
mfh.close
|
48
50
|
|
49
51
|
Dir.mktmpdir do |dir|
|
50
52
|
if o[:index].nil?
|
51
|
-
$stderr.puts
|
53
|
+
$stderr.puts 'Loading datasets.' unless o[:q]
|
52
54
|
ds = p.datasets
|
53
55
|
ds.select!{ |d| not d.metadata[:tax].nil? }
|
54
56
|
ds = filter_datasets!(ds, o)
|
55
57
|
|
56
|
-
$stderr.puts
|
58
|
+
$stderr.puts 'Indexing taxonomy.' unless o[:q]
|
57
59
|
tax_index = MiGA::TaxIndex.new
|
58
60
|
ds.each { |d| tax_index << d }
|
59
|
-
tab = File.expand_path(
|
60
|
-
File.open(tab,
|
61
|
+
tab = File.expand_path('index.tab', dir)
|
62
|
+
File.open(tab, 'w') { |fh| fh.print tax_index.to_tab }
|
61
63
|
else
|
62
64
|
tab = o[:index]
|
63
65
|
end
|
64
|
-
|
65
|
-
$stderr.puts
|
66
|
+
|
67
|
+
$stderr.puts 'Traversing taxonomy.' unless o[:q]
|
66
68
|
rank_i = 0
|
67
69
|
MiGA::Taxonomy.KNOWN_RANKS.each do |rank|
|
68
70
|
$stderr.print "o #{rank}: " unless o[:q]
|
@@ -70,13 +72,13 @@ Dir.mktmpdir do |dir|
|
|
70
72
|
rank_i += 1
|
71
73
|
in_rank = nil
|
72
74
|
ds_name = []
|
73
|
-
File.open(tab,
|
75
|
+
File.open(tab, 'r') do |fh|
|
74
76
|
fh.each_line do |ln|
|
75
77
|
if ln =~ /^ {#{(rank_i-1)*2}}\S+:\S+:/
|
76
78
|
in_rank = nil
|
77
79
|
ds_name = []
|
78
80
|
elsif ln =~ /^ {#{rank_i*2}}(#{rank}:(\S+)):/
|
79
|
-
in_rank = $2==
|
81
|
+
in_rank = $2 == '?' ? nil : $1
|
80
82
|
ds_name = []
|
81
83
|
elsif ln =~ /^ *# (\S+)/ and not in_rank.nil?
|
82
84
|
ds_i = $1
|
@@ -85,8 +87,8 @@ Dir.mktmpdir do |dir|
|
|
85
87
|
k = cannid(ds_i, ds_j)
|
86
88
|
next if dist[k].nil?
|
87
89
|
rank_n += 1
|
88
|
-
dist[k][
|
89
|
-
dist[k][
|
90
|
+
dist[k][3] = rank_i
|
91
|
+
dist[k][4].unshift in_rank
|
90
92
|
end
|
91
93
|
end
|
92
94
|
end
|
@@ -95,6 +97,10 @@ Dir.mktmpdir do |dir|
|
|
95
97
|
end
|
96
98
|
end
|
97
99
|
|
100
|
+
$stderr.puts 'Generating report.' unless o[:q]
|
98
101
|
dist.keys.each do |k|
|
99
|
-
|
102
|
+
dist[k][5] = dist[k][4].reverse.join(' ')
|
103
|
+
dist[k][4] = dist[k][4].first
|
104
|
+
puts (k.split('-') + dist[k]).join("\t")
|
100
105
|
end
|
106
|
+
|
data/bin/miga
CHANGED
@@ -3,43 +3,43 @@
|
|
3
3
|
# @package MiGA
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
|
-
$:.push File.expand_path(
|
6
|
+
$:.push File.expand_path('../../lib', __FILE__)
|
7
7
|
|
8
|
-
require
|
9
|
-
require
|
8
|
+
require 'optparse'
|
9
|
+
require 'miga'
|
10
10
|
|
11
11
|
##=> Global variables <=
|
12
12
|
|
13
13
|
$task_desc = {
|
14
14
|
# Projects
|
15
|
-
new:
|
16
|
-
about:
|
17
|
-
plugins:
|
18
|
-
doctor:
|
15
|
+
new: 'Creates an empty MiGA project',
|
16
|
+
about: 'Displays information about a MiGA project',
|
17
|
+
plugins: 'Lists or (un)installs plugins in a MiGA project',
|
18
|
+
doctor: 'Performs consistency checks on a MiGA project',
|
19
19
|
# Datasets
|
20
|
-
add:
|
21
|
-
get:
|
22
|
-
ncbi_get:
|
23
|
-
rm:
|
24
|
-
find:
|
25
|
-
ln:
|
26
|
-
ls:
|
20
|
+
add: 'Creates an empty dataset in a pre-existing MiGA project',
|
21
|
+
get: 'Downloads a dataset from public databases into a MiGA project',
|
22
|
+
ncbi_get: 'Downloads all genomes in a taxon from NCBI into a MiGA project',
|
23
|
+
rm: 'Removes a dataset from an MiGA project',
|
24
|
+
find: 'Finds unregistered datasets based on result files',
|
25
|
+
ln: 'Link datasets (including results) from one project to another',
|
26
|
+
ls: 'Lists all registered datasets in an MiGA project',
|
27
27
|
# Results
|
28
|
-
add_result:
|
29
|
-
stats:
|
30
|
-
files:
|
31
|
-
run:
|
32
|
-
summary:
|
28
|
+
add_result: 'Registers a result',
|
29
|
+
stats: 'Extracts statistics for the given result',
|
30
|
+
files: 'Lists registered files from the results of a dataset or project',
|
31
|
+
run: 'Executes locally one step analysis producing the given result',
|
32
|
+
summary: 'Generates a summary table for the statistics of all datasets',
|
33
33
|
# System
|
34
|
-
init:
|
35
|
-
daemon:
|
36
|
-
date:
|
37
|
-
console:
|
34
|
+
init: 'Initialize MiGA to process new projects',
|
35
|
+
daemon: 'Controls the daemon of a MiGA project',
|
36
|
+
date: 'Returns the current date in standard MiGA format',
|
37
|
+
console: 'Opens an IRB console with MiGA',
|
38
38
|
# Taxonomy
|
39
|
-
tax_set:
|
40
|
-
tax_test:
|
41
|
-
tax_index:
|
42
|
-
tax_dist:
|
39
|
+
tax_set: 'Registers taxonomic information for datasets',
|
40
|
+
tax_test: 'Returns test of taxonomic distributions for query datasets',
|
41
|
+
tax_index: 'Creates a taxonomy-indexed list of the datasets',
|
42
|
+
tax_dist: 'Estimates distributions of distance by taxonomy',
|
43
43
|
}
|
44
44
|
|
45
45
|
$task_alias = {
|
@@ -178,14 +178,14 @@ def filter_datasets!(ds, o)
|
|
178
178
|
end
|
179
179
|
|
180
180
|
def add_metadata(o, obj)
|
181
|
-
o[:metadata].split(
|
182
|
-
(k,v) = pair.split(
|
181
|
+
o[:metadata].split(',').each do |pair|
|
182
|
+
(k,v) = pair.split('=')
|
183
183
|
case v
|
184
184
|
when 'true'; v = true
|
185
185
|
when 'false'; v = false
|
186
186
|
when 'nil'; v = nil
|
187
187
|
end
|
188
|
-
if k=='_step'
|
188
|
+
if k == '_step'
|
189
189
|
obj.metadata["_try_#{v}"] ||= 0
|
190
190
|
obj.metadata["_try_#{v}"] += 1
|
191
191
|
end
|
@@ -205,20 +205,20 @@ ARGV[0] = $task_alias[ARGV[0].to_sym] unless
|
|
205
205
|
ARGV[0].nil? or $task_alias[ARGV[0].to_sym].nil?
|
206
206
|
|
207
207
|
case ARGV[0].to_s
|
208
|
-
when
|
208
|
+
when '-v', '--version'
|
209
209
|
puts MiGA::MiGA.VERSION
|
210
|
-
when
|
210
|
+
when '-V', '--long-version'
|
211
211
|
puts MiGA::MiGA.LONG_VERSION
|
212
|
-
when
|
212
|
+
when '-C', '--citation'
|
213
213
|
puts MiGA::MiGA.CITATION
|
214
|
-
when
|
215
|
-
require
|
216
|
-
require
|
214
|
+
when 'console'
|
215
|
+
require 'irb'
|
216
|
+
require 'irb/completion'
|
217
217
|
ARGV.shift
|
218
218
|
IRB.start
|
219
219
|
when *execs
|
220
220
|
$task = ARGV.shift.to_sym
|
221
|
-
ARGV <<
|
221
|
+
ARGV << '-h' if ARGV.empty? and not [:date, :init].include? $task
|
222
222
|
begin
|
223
223
|
load File.expand_path("../actions/#{$task}.rb", File.dirname(__FILE__))
|
224
224
|
rescue => err
|
@@ -233,7 +233,7 @@ Microbial Genomes Atlas.
|
|
233
233
|
|
234
234
|
Usage: #{$0} {action} [options]
|
235
235
|
|
236
|
-
#{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n")}
|
236
|
+
#{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n") }
|
237
237
|
|
238
238
|
generic options:
|
239
239
|
-h, --help Display this screen.
|
data/lib/miga/daemon.rb
CHANGED
@@ -99,11 +99,17 @@ class MiGA::Daemon < MiGA::MiGA
|
|
99
99
|
status = JSON.parse(File.read(f_path), symbolize_names: true)
|
100
100
|
status.keys.each do |i|
|
101
101
|
status[i].map! do |j|
|
102
|
-
j.tap
|
102
|
+
j.tap do |k|
|
103
|
+
unless k[:ds].nil? or k[:ds_name] == 'miga-project'
|
104
|
+
k[:ds] = project.dataset(k[:ds_name])
|
105
|
+
end
|
106
|
+
k[:job] = k[:job].to_sym unless k[:job].nil?
|
107
|
+
end
|
103
108
|
end
|
104
109
|
end
|
105
110
|
@jobs_running = status[:jobs_running]
|
106
111
|
@jobs_to_run = status[:jobs_to_run]
|
112
|
+
say "- jobs left running: #{@jobs_running.size}"
|
107
113
|
purge!
|
108
114
|
say "- jobs running: #{@jobs_running.size}"
|
109
115
|
say "- jobs to run: #{@jobs_to_run.size}"
|
@@ -171,12 +177,12 @@ class MiGA::Daemon < MiGA::MiGA
|
|
171
177
|
##
|
172
178
|
# Get the task with key symbol +job+ in dataset +ds+. For project-wide tasks
|
173
179
|
# let +ds+ be nil.
|
174
|
-
def get_job(job, ds=nil)
|
180
|
+
def get_job(job, ds = nil)
|
175
181
|
(jobs_to_run + jobs_running).find do |j|
|
176
|
-
if ds
|
177
|
-
j[:ds].nil? and j[:job]==job
|
182
|
+
if ds.nil?
|
183
|
+
j[:ds].nil? and j[:job] == job
|
178
184
|
else
|
179
|
-
(! j[:ds].nil?) and j[:ds].name==ds.name and j[:job]==job
|
185
|
+
(! j[:ds].nil?) and j[:ds].name == ds.name and j[:job] == job
|
180
186
|
end
|
181
187
|
end
|
182
188
|
end
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3,
|
13
|
+
VERSION = [0.3, 7, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
data/scripts/ogs.bash
CHANGED
@@ -11,7 +11,9 @@ cd "$PROJECT/data/10.clades/03.ogs"
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "miga-project.start"
|
13
13
|
|
14
|
-
DS=$(miga
|
14
|
+
DS=$(miga ls -P "$PROJECT" --ref --no-multi)
|
15
|
+
MIN_ID=$(miga about -P "$PROJECT" -m ogs_identity)
|
16
|
+
[[ $MIN_ID == "?" ]] && MIN_ID=80
|
15
17
|
if [[ ! -s miga-project.ogs ]] ; then
|
16
18
|
# Extract RBMs
|
17
19
|
if [[ ! -s miga-project.abc ]] ; then
|
@@ -19,7 +21,7 @@ if [[ ! -s miga-project.ogs ]] ; then
|
|
19
21
|
for i in $DS ; do
|
20
22
|
file="miga-project.tmp/$i.abc"
|
21
23
|
[[ -s "$file" ]] && continue
|
22
|
-
echo "SELECT seq1,id1,seq2,id2,bitscore from rbm;" \
|
24
|
+
echo "SELECT seq1,id1,seq2,id2,bitscore from rbm where id >= $MIN_ID;" \
|
23
25
|
| sqlite3 "../../09.distances/02.aai/$i.db" | tr "\\|" " " \
|
24
26
|
| awk '{ print $1">"$2"'"\\t"'"$3">"$4"'"\\t"'"$5 }' \
|
25
27
|
> "$file.tmp"
|
data/utils/cleanup-databases.rb
CHANGED
@@ -6,18 +6,19 @@ require 'miga'
|
|
6
6
|
ARGV[1] or abort "Usage: #{$0} path/to/project threads"
|
7
7
|
|
8
8
|
$stderr.puts "Cleaning databases..."
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
p = MiGA::Project.load(ARGV[0])
|
10
|
+
ds_names = p.dataset_names
|
12
11
|
thr = ARGV[1].to_i
|
13
12
|
|
14
13
|
(0 .. thr-1).each do |t|
|
15
14
|
fork do
|
16
15
|
k = -1
|
17
|
-
|
16
|
+
ds_names.each do |i|
|
18
17
|
k = (k+1) % thr
|
19
18
|
next unless k == t
|
20
|
-
i
|
19
|
+
d = p.dataset(i)
|
20
|
+
next unless d.is_ref? and d.is_active?
|
21
|
+
d.cleanup_distances!
|
21
22
|
end
|
22
23
|
end
|
23
24
|
end
|
data/utils/distance/commands.rb
CHANGED
@@ -21,6 +21,7 @@ module MiGA::DistanceRunner::Commands
|
|
21
21
|
##
|
22
22
|
# Estimates AAI against +target+ using hAAI
|
23
23
|
def haai(target)
|
24
|
+
return nil if opts[:haai_p] == 'no'
|
24
25
|
haai = aai_cmd(tmp_file('ess_genes.fa'),
|
25
26
|
target.result(:essential_genes).file_path(:ess_genes),
|
26
27
|
dataset.name, target.name, tmp_dbs[:haai],
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @update: Oct 07 2015
|
5
|
+
# @license: artistic license 2.0
|
6
|
+
#
|
7
|
+
use strict;
|
8
|
+
use warnings;
|
9
|
+
use List::Util qw/sum min max/;
|
10
|
+
|
11
|
+
my ($seqs, $minlen, $n__) = @ARGV;
|
12
|
+
$seqs or die "
|
13
|
+
Description:
|
14
|
+
Calculates the N50 value of a set of sequences. Alternatively, it
|
15
|
+
can calculate other N** values. It also calculates the total number
|
16
|
+
of sequences and the total added length.
|
17
|
+
|
18
|
+
Usage:
|
19
|
+
$0 seqs.fa[ minlen[ **]]
|
20
|
+
|
21
|
+
seqs.fa A FastA file containing the sequences.
|
22
|
+
minlen (optional) The minimum length to take into consideration.
|
23
|
+
By default: 0.
|
24
|
+
** Value N** to calculate. By default: 50 (N50).
|
25
|
+
";
|
26
|
+
$minlen ||= 0;
|
27
|
+
$n__ ||= 50;
|
28
|
+
|
29
|
+
my @len = ();
|
30
|
+
open SEQ, "<", $seqs or die "Cannot open file: $seqs: $!\n";
|
31
|
+
while(<SEQ>){
|
32
|
+
if(/^>/){
|
33
|
+
push @len, 0;
|
34
|
+
}else{
|
35
|
+
next if /^;/;
|
36
|
+
chomp;
|
37
|
+
s/\W//g;
|
38
|
+
$len[-1]+=length $_;
|
39
|
+
}
|
40
|
+
}
|
41
|
+
close SEQ;
|
42
|
+
@len = sort { $a <=> $b } map { $_>=$minlen?$_:() } @len;
|
43
|
+
my $tot = (sum(@len) || 0);
|
44
|
+
|
45
|
+
my $thr = $n__*$tot/100;
|
46
|
+
my $pos = 0;
|
47
|
+
for(@len){
|
48
|
+
$pos+= $_;
|
49
|
+
if($pos>=$thr){
|
50
|
+
print "N$n__: $_\n";
|
51
|
+
last;
|
52
|
+
}
|
53
|
+
}
|
54
|
+
print "Sequences: ".scalar(@len)."\n";
|
55
|
+
print "Total length: $tot\n";
|
56
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @update Oct-07-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
|
11
|
+
my($file, $content, $stretch) = @ARGV;
|
12
|
+
$file or die <<HELP
|
13
|
+
|
14
|
+
Description:
|
15
|
+
Filter sequences by N-content and presence of long homopolymers.
|
16
|
+
Usage:
|
17
|
+
$0 sequences.fa [content [stretch]] > filtered.fa
|
18
|
+
Where:
|
19
|
+
sequences.fa Input file in FastA format
|
20
|
+
content A number between 0 and 1 indicating the maximum proportion of Ns
|
21
|
+
(1 to turn off, 0.5 by default)
|
22
|
+
stretch A number indicating the maximum number of consecutive identical
|
23
|
+
nucleotides allowed (0 to turn off, 100 by default)
|
24
|
+
filtered.fa Filtered set of sequences.
|
25
|
+
|
26
|
+
HELP
|
27
|
+
;
|
28
|
+
($content ||= 0.5)+=0;
|
29
|
+
($stretch ||= 100)+=0;
|
30
|
+
|
31
|
+
my $good = 0;
|
32
|
+
my $N = 0;
|
33
|
+
|
34
|
+
FASTA: {
|
35
|
+
local $/ = "\n>";
|
36
|
+
open FILE, "<", $file or die "I can not open the file: $file: $!\n";
|
37
|
+
SEQ: while(<FILE>){
|
38
|
+
$N++;
|
39
|
+
s/^;.*//gm;
|
40
|
+
s/>//g;
|
41
|
+
my($n,$s) = split /\n/, $_, 2;
|
42
|
+
(my $clean = $s) =~ s/[^ACTGN]//g;
|
43
|
+
if($content < 1){
|
44
|
+
(my $Ns = $clean) =~ s/[^N]//g;
|
45
|
+
next SEQ if length($Ns)>length($clean)*$content;
|
46
|
+
}
|
47
|
+
if($stretch > 0){
|
48
|
+
for my $nuc (qw(A C T G N)){
|
49
|
+
next SEQ if $clean =~ m/[$nuc]{$stretch}/;
|
50
|
+
}
|
51
|
+
}
|
52
|
+
print ">$n\n$s\n";
|
53
|
+
$good++;
|
54
|
+
}
|
55
|
+
close FILE;
|
56
|
+
print STDERR "Total sequences: $N\nAfter filtering: $good\n";
|
57
|
+
}
|
58
|
+
|
59
|
+
|
60
|
+
|