miga-base 0.3.6.3 → 0.3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/actions/init.rb +1 -1
- data/actions/tax_dist.rb +32 -26
- data/bin/miga +38 -38
- data/lib/miga/daemon.rb +11 -5
- data/lib/miga/version.rb +1 -1
- data/scripts/ogs.bash +4 -2
- data/utils/cleanup-databases.rb +6 -5
- data/utils/distance/commands.rb +1 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +56 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +60 -0
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +38 -0
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +55 -0
- data/utils/plot-taxdist.R +42 -33
- data/utils/requirements.txt +1 -0
- metadata +179 -179
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
- data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
- data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
- data/utils/enveomics/Scripts/lib/enveomics.R +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 38acfea63f14c8cca837d84fbca92fa70d134e76
|
4
|
+
data.tar.gz: 6c6f955e4ba5c3a90ead50a3ac38896df4febcf1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d4303872589a4d02f75d392957dfbcc06f5d3e1f78107a521833f98c3d11b155dbb16bb386e1ab50723d2cb12bcd2bba77ab87033c6a7916f49ce68ce59b4941
|
7
|
+
data.tar.gz: 05134a38b3082e0a5982beb5f23d2a882ce4bba23c34cda1d81bd697288ac841861b6f2a07eeeafe691fed53816125f3de3fd75bf9a7fbea3d49a5975558cb3f
|
data/actions/init.rb
CHANGED
@@ -154,7 +154,7 @@ end
|
|
154
154
|
|
155
155
|
# Check for R packages
|
156
156
|
$stderr.puts "Looking for R packages:"
|
157
|
-
%w(enveomics.R ape
|
157
|
+
%w(enveomics.R ape cluster vegan).each do |pkg|
|
158
158
|
$stderr.print "Testing #{pkg}... "
|
159
159
|
`echo "library('#{pkg}')" | #{paths["R"].shellescape} --vanilla -q 2>&1`
|
160
160
|
if $?.success?
|
data/actions/tax_dist.rb
CHANGED
@@ -3,34 +3,34 @@
|
|
3
3
|
# @package MiGA
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
6
|
+
require 'miga/tax_index'
|
7
|
+
require 'zlib'
|
8
|
+
require 'tmpdir'
|
9
9
|
|
10
|
-
o = {q:true, format: :json}
|
10
|
+
o = {q: true, format: :json}
|
11
11
|
OptionParser.new do |opt|
|
12
12
|
opt_banner(opt)
|
13
13
|
opt_object(opt, o, [:project])
|
14
14
|
opt_filter_datasets(opt, o)
|
15
|
-
opt.on(
|
16
|
-
|
17
|
-
|
18
|
-
){ |v| o[:index]=v }
|
15
|
+
opt.on('-i', '--index FILE',
|
16
|
+
'Pre-calculated tax-index (in tabular format) to be used.',
|
17
|
+
'If passed, dataset filtering arguments are ignored.'
|
18
|
+
){ |v| o[:index] = v }
|
19
19
|
opt_common(opt, o)
|
20
20
|
end.parse!
|
21
21
|
|
22
22
|
##=> Functions <=
|
23
23
|
# Returns the _canonical_ ID between strings +a+ and +b+.
|
24
|
-
def cannid(a, b) ; [a, b].
|
24
|
+
def cannid(a, b) ; (a > b ? [b, a] : [a, b]).join('-') ; end
|
25
25
|
|
26
26
|
##=> Main <=
|
27
|
-
opt_require(o, project:
|
27
|
+
opt_require(o, project: '-P')
|
28
28
|
|
29
|
-
$stderr.puts
|
29
|
+
$stderr.puts 'Loading project.' unless o[:q]
|
30
30
|
p = MiGA::Project.load(o[:project])
|
31
31
|
raise "Impossible to load project: #{o[:project]}" if p.nil?
|
32
32
|
|
33
|
-
metric = p.is_clade? ?
|
33
|
+
metric = p.is_clade? ? 'ani' : 'aai'
|
34
34
|
res_n = "#{metric}_distances"
|
35
35
|
$stderr.puts "Reading distances (1-#{metric.upcase})." unless o[:q]
|
36
36
|
res = p.result res_n
|
@@ -38,31 +38,33 @@ raise "#{res_n} not yet calculated." if res.nil?
|
|
38
38
|
matrix = res.file_path(:matrix)
|
39
39
|
raise "#{res_n} has no matrix." if matrix.nil?
|
40
40
|
dist = {}
|
41
|
-
mfh = matrix
|
41
|
+
mfh = matrix =~ /\.gz$/ ? Zlib::GzipReader.open(matrix) : File.open(matrix, 'r')
|
42
42
|
mfh.each_line do |ln|
|
43
43
|
next if mfh.lineno==1
|
44
|
-
row = ln.chomp.split(
|
45
|
-
dist[cannid(row[1], row[2])] = [row[3], 0, [
|
44
|
+
row = ln.chomp.split("\t")
|
45
|
+
dist[cannid(row[1], row[2])] = [row[3], row[5], row[6], 0, ['root:biota']]
|
46
|
+
$stderr.print(" Ln:#{mfh.lineno} \r") if !o[:q] and (mfh.lineno % 1_000) == 0
|
46
47
|
end
|
48
|
+
$stderr.puts " Lines: #{mfh.lineno}" unless o[:q]
|
47
49
|
mfh.close
|
48
50
|
|
49
51
|
Dir.mktmpdir do |dir|
|
50
52
|
if o[:index].nil?
|
51
|
-
$stderr.puts
|
53
|
+
$stderr.puts 'Loading datasets.' unless o[:q]
|
52
54
|
ds = p.datasets
|
53
55
|
ds.select!{ |d| not d.metadata[:tax].nil? }
|
54
56
|
ds = filter_datasets!(ds, o)
|
55
57
|
|
56
|
-
$stderr.puts
|
58
|
+
$stderr.puts 'Indexing taxonomy.' unless o[:q]
|
57
59
|
tax_index = MiGA::TaxIndex.new
|
58
60
|
ds.each { |d| tax_index << d }
|
59
|
-
tab = File.expand_path(
|
60
|
-
File.open(tab,
|
61
|
+
tab = File.expand_path('index.tab', dir)
|
62
|
+
File.open(tab, 'w') { |fh| fh.print tax_index.to_tab }
|
61
63
|
else
|
62
64
|
tab = o[:index]
|
63
65
|
end
|
64
|
-
|
65
|
-
$stderr.puts
|
66
|
+
|
67
|
+
$stderr.puts 'Traversing taxonomy.' unless o[:q]
|
66
68
|
rank_i = 0
|
67
69
|
MiGA::Taxonomy.KNOWN_RANKS.each do |rank|
|
68
70
|
$stderr.print "o #{rank}: " unless o[:q]
|
@@ -70,13 +72,13 @@ Dir.mktmpdir do |dir|
|
|
70
72
|
rank_i += 1
|
71
73
|
in_rank = nil
|
72
74
|
ds_name = []
|
73
|
-
File.open(tab,
|
75
|
+
File.open(tab, 'r') do |fh|
|
74
76
|
fh.each_line do |ln|
|
75
77
|
if ln =~ /^ {#{(rank_i-1)*2}}\S+:\S+:/
|
76
78
|
in_rank = nil
|
77
79
|
ds_name = []
|
78
80
|
elsif ln =~ /^ {#{rank_i*2}}(#{rank}:(\S+)):/
|
79
|
-
in_rank = $2==
|
81
|
+
in_rank = $2 == '?' ? nil : $1
|
80
82
|
ds_name = []
|
81
83
|
elsif ln =~ /^ *# (\S+)/ and not in_rank.nil?
|
82
84
|
ds_i = $1
|
@@ -85,8 +87,8 @@ Dir.mktmpdir do |dir|
|
|
85
87
|
k = cannid(ds_i, ds_j)
|
86
88
|
next if dist[k].nil?
|
87
89
|
rank_n += 1
|
88
|
-
dist[k][
|
89
|
-
dist[k][
|
90
|
+
dist[k][3] = rank_i
|
91
|
+
dist[k][4].unshift in_rank
|
90
92
|
end
|
91
93
|
end
|
92
94
|
end
|
@@ -95,6 +97,10 @@ Dir.mktmpdir do |dir|
|
|
95
97
|
end
|
96
98
|
end
|
97
99
|
|
100
|
+
$stderr.puts 'Generating report.' unless o[:q]
|
98
101
|
dist.keys.each do |k|
|
99
|
-
|
102
|
+
dist[k][5] = dist[k][4].reverse.join(' ')
|
103
|
+
dist[k][4] = dist[k][4].first
|
104
|
+
puts (k.split('-') + dist[k]).join("\t")
|
100
105
|
end
|
106
|
+
|
data/bin/miga
CHANGED
@@ -3,43 +3,43 @@
|
|
3
3
|
# @package MiGA
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
|
-
$:.push File.expand_path(
|
6
|
+
$:.push File.expand_path('../../lib', __FILE__)
|
7
7
|
|
8
|
-
require
|
9
|
-
require
|
8
|
+
require 'optparse'
|
9
|
+
require 'miga'
|
10
10
|
|
11
11
|
##=> Global variables <=
|
12
12
|
|
13
13
|
$task_desc = {
|
14
14
|
# Projects
|
15
|
-
new:
|
16
|
-
about:
|
17
|
-
plugins:
|
18
|
-
doctor:
|
15
|
+
new: 'Creates an empty MiGA project',
|
16
|
+
about: 'Displays information about a MiGA project',
|
17
|
+
plugins: 'Lists or (un)installs plugins in a MiGA project',
|
18
|
+
doctor: 'Performs consistency checks on a MiGA project',
|
19
19
|
# Datasets
|
20
|
-
add:
|
21
|
-
get:
|
22
|
-
ncbi_get:
|
23
|
-
rm:
|
24
|
-
find:
|
25
|
-
ln:
|
26
|
-
ls:
|
20
|
+
add: 'Creates an empty dataset in a pre-existing MiGA project',
|
21
|
+
get: 'Downloads a dataset from public databases into a MiGA project',
|
22
|
+
ncbi_get: 'Downloads all genomes in a taxon from NCBI into a MiGA project',
|
23
|
+
rm: 'Removes a dataset from an MiGA project',
|
24
|
+
find: 'Finds unregistered datasets based on result files',
|
25
|
+
ln: 'Link datasets (including results) from one project to another',
|
26
|
+
ls: 'Lists all registered datasets in an MiGA project',
|
27
27
|
# Results
|
28
|
-
add_result:
|
29
|
-
stats:
|
30
|
-
files:
|
31
|
-
run:
|
32
|
-
summary:
|
28
|
+
add_result: 'Registers a result',
|
29
|
+
stats: 'Extracts statistics for the given result',
|
30
|
+
files: 'Lists registered files from the results of a dataset or project',
|
31
|
+
run: 'Executes locally one step analysis producing the given result',
|
32
|
+
summary: 'Generates a summary table for the statistics of all datasets',
|
33
33
|
# System
|
34
|
-
init:
|
35
|
-
daemon:
|
36
|
-
date:
|
37
|
-
console:
|
34
|
+
init: 'Initialize MiGA to process new projects',
|
35
|
+
daemon: 'Controls the daemon of a MiGA project',
|
36
|
+
date: 'Returns the current date in standard MiGA format',
|
37
|
+
console: 'Opens an IRB console with MiGA',
|
38
38
|
# Taxonomy
|
39
|
-
tax_set:
|
40
|
-
tax_test:
|
41
|
-
tax_index:
|
42
|
-
tax_dist:
|
39
|
+
tax_set: 'Registers taxonomic information for datasets',
|
40
|
+
tax_test: 'Returns test of taxonomic distributions for query datasets',
|
41
|
+
tax_index: 'Creates a taxonomy-indexed list of the datasets',
|
42
|
+
tax_dist: 'Estimates distributions of distance by taxonomy',
|
43
43
|
}
|
44
44
|
|
45
45
|
$task_alias = {
|
@@ -178,14 +178,14 @@ def filter_datasets!(ds, o)
|
|
178
178
|
end
|
179
179
|
|
180
180
|
def add_metadata(o, obj)
|
181
|
-
o[:metadata].split(
|
182
|
-
(k,v) = pair.split(
|
181
|
+
o[:metadata].split(',').each do |pair|
|
182
|
+
(k,v) = pair.split('=')
|
183
183
|
case v
|
184
184
|
when 'true'; v = true
|
185
185
|
when 'false'; v = false
|
186
186
|
when 'nil'; v = nil
|
187
187
|
end
|
188
|
-
if k=='_step'
|
188
|
+
if k == '_step'
|
189
189
|
obj.metadata["_try_#{v}"] ||= 0
|
190
190
|
obj.metadata["_try_#{v}"] += 1
|
191
191
|
end
|
@@ -205,20 +205,20 @@ ARGV[0] = $task_alias[ARGV[0].to_sym] unless
|
|
205
205
|
ARGV[0].nil? or $task_alias[ARGV[0].to_sym].nil?
|
206
206
|
|
207
207
|
case ARGV[0].to_s
|
208
|
-
when
|
208
|
+
when '-v', '--version'
|
209
209
|
puts MiGA::MiGA.VERSION
|
210
|
-
when
|
210
|
+
when '-V', '--long-version'
|
211
211
|
puts MiGA::MiGA.LONG_VERSION
|
212
|
-
when
|
212
|
+
when '-C', '--citation'
|
213
213
|
puts MiGA::MiGA.CITATION
|
214
|
-
when
|
215
|
-
require
|
216
|
-
require
|
214
|
+
when 'console'
|
215
|
+
require 'irb'
|
216
|
+
require 'irb/completion'
|
217
217
|
ARGV.shift
|
218
218
|
IRB.start
|
219
219
|
when *execs
|
220
220
|
$task = ARGV.shift.to_sym
|
221
|
-
ARGV <<
|
221
|
+
ARGV << '-h' if ARGV.empty? and not [:date, :init].include? $task
|
222
222
|
begin
|
223
223
|
load File.expand_path("../actions/#{$task}.rb", File.dirname(__FILE__))
|
224
224
|
rescue => err
|
@@ -233,7 +233,7 @@ Microbial Genomes Atlas.
|
|
233
233
|
|
234
234
|
Usage: #{$0} {action} [options]
|
235
235
|
|
236
|
-
#{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n")}
|
236
|
+
#{ MiGA::MiGA.tabulate([:action, :description], $task_desc.to_a).join("\n") }
|
237
237
|
|
238
238
|
generic options:
|
239
239
|
-h, --help Display this screen.
|
data/lib/miga/daemon.rb
CHANGED
@@ -99,11 +99,17 @@ class MiGA::Daemon < MiGA::MiGA
|
|
99
99
|
status = JSON.parse(File.read(f_path), symbolize_names: true)
|
100
100
|
status.keys.each do |i|
|
101
101
|
status[i].map! do |j|
|
102
|
-
j.tap
|
102
|
+
j.tap do |k|
|
103
|
+
unless k[:ds].nil? or k[:ds_name] == 'miga-project'
|
104
|
+
k[:ds] = project.dataset(k[:ds_name])
|
105
|
+
end
|
106
|
+
k[:job] = k[:job].to_sym unless k[:job].nil?
|
107
|
+
end
|
103
108
|
end
|
104
109
|
end
|
105
110
|
@jobs_running = status[:jobs_running]
|
106
111
|
@jobs_to_run = status[:jobs_to_run]
|
112
|
+
say "- jobs left running: #{@jobs_running.size}"
|
107
113
|
purge!
|
108
114
|
say "- jobs running: #{@jobs_running.size}"
|
109
115
|
say "- jobs to run: #{@jobs_to_run.size}"
|
@@ -171,12 +177,12 @@ class MiGA::Daemon < MiGA::MiGA
|
|
171
177
|
##
|
172
178
|
# Get the task with key symbol +job+ in dataset +ds+. For project-wide tasks
|
173
179
|
# let +ds+ be nil.
|
174
|
-
def get_job(job, ds=nil)
|
180
|
+
def get_job(job, ds = nil)
|
175
181
|
(jobs_to_run + jobs_running).find do |j|
|
176
|
-
if ds
|
177
|
-
j[:ds].nil? and j[:job]==job
|
182
|
+
if ds.nil?
|
183
|
+
j[:ds].nil? and j[:job] == job
|
178
184
|
else
|
179
|
-
(! j[:ds].nil?) and j[:ds].name==ds.name and j[:job]==job
|
185
|
+
(! j[:ds].nil?) and j[:ds].name == ds.name and j[:job] == job
|
180
186
|
end
|
181
187
|
end
|
182
188
|
end
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3,
|
13
|
+
VERSION = [0.3, 7, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
data/scripts/ogs.bash
CHANGED
@@ -11,7 +11,9 @@ cd "$PROJECT/data/10.clades/03.ogs"
|
|
11
11
|
# Initialize
|
12
12
|
miga date > "miga-project.start"
|
13
13
|
|
14
|
-
DS=$(miga
|
14
|
+
DS=$(miga ls -P "$PROJECT" --ref --no-multi)
|
15
|
+
MIN_ID=$(miga about -P "$PROJECT" -m ogs_identity)
|
16
|
+
[[ $MIN_ID == "?" ]] && MIN_ID=80
|
15
17
|
if [[ ! -s miga-project.ogs ]] ; then
|
16
18
|
# Extract RBMs
|
17
19
|
if [[ ! -s miga-project.abc ]] ; then
|
@@ -19,7 +21,7 @@ if [[ ! -s miga-project.ogs ]] ; then
|
|
19
21
|
for i in $DS ; do
|
20
22
|
file="miga-project.tmp/$i.abc"
|
21
23
|
[[ -s "$file" ]] && continue
|
22
|
-
echo "SELECT seq1,id1,seq2,id2,bitscore from rbm;" \
|
24
|
+
echo "SELECT seq1,id1,seq2,id2,bitscore from rbm where id >= $MIN_ID;" \
|
23
25
|
| sqlite3 "../../09.distances/02.aai/$i.db" | tr "\\|" " " \
|
24
26
|
| awk '{ print $1">"$2"'"\\t"'"$3">"$4"'"\\t"'"$5 }' \
|
25
27
|
> "$file.tmp"
|
data/utils/cleanup-databases.rb
CHANGED
@@ -6,18 +6,19 @@ require 'miga'
|
|
6
6
|
ARGV[1] or abort "Usage: #{$0} path/to/project threads"
|
7
7
|
|
8
8
|
$stderr.puts "Cleaning databases..."
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
p = MiGA::Project.load(ARGV[0])
|
10
|
+
ds_names = p.dataset_names
|
12
11
|
thr = ARGV[1].to_i
|
13
12
|
|
14
13
|
(0 .. thr-1).each do |t|
|
15
14
|
fork do
|
16
15
|
k = -1
|
17
|
-
|
16
|
+
ds_names.each do |i|
|
18
17
|
k = (k+1) % thr
|
19
18
|
next unless k == t
|
20
|
-
i
|
19
|
+
d = p.dataset(i)
|
20
|
+
next unless d.is_ref? and d.is_active?
|
21
|
+
d.cleanup_distances!
|
21
22
|
end
|
22
23
|
end
|
23
24
|
end
|
data/utils/distance/commands.rb
CHANGED
@@ -21,6 +21,7 @@ module MiGA::DistanceRunner::Commands
|
|
21
21
|
##
|
22
22
|
# Estimates AAI against +target+ using hAAI
|
23
23
|
def haai(target)
|
24
|
+
return nil if opts[:haai_p] == 'no'
|
24
25
|
haai = aai_cmd(tmp_file('ess_genes.fa'),
|
25
26
|
target.result(:essential_genes).file_path(:ess_genes),
|
26
27
|
dataset.name, target.name, tmp_dbs[:haai],
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author: Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
|
4
|
+
# @update: Oct 07 2015
|
5
|
+
# @license: artistic license 2.0
|
6
|
+
#
|
7
|
+
use strict;
|
8
|
+
use warnings;
|
9
|
+
use List::Util qw/sum min max/;
|
10
|
+
|
11
|
+
# Reports the N** statistic (N50 by default), the number of sequences and
# the total length of the sequences in a FastA file.
#
# Positional arguments (read from @ARGV):
#   seqs    Path to the FastA file (required; the usage text is shown if
#           it is missing).
#   minlen  Sequences shorter than this are ignored (default: 0).
#   n__     Percentage of the total length to reach (default: 50).
#
# Output (STDOUT): "N<n__>: <length>" (omitted when no sequence passes the
# length filter), "Sequences: <count>" and "Total length: <sum>".
my ($seqs, $minlen, $n__) = @ARGV;
$seqs or die "
Description:
Calculates the N50 value of a set of sequences. Alternatively, it
can calculate other N** values. It also calculates the total number
of sequences and the total added length.

Usage:
$0 seqs.fa[ minlen[ **]]

seqs.fa A FastA file containing the sequences.
minlen (optional) The minimum length to take into consideration.
By default: 0.
** Value N** to calculate. By default: 50 (N50).
";
$minlen ||= 0;
$n__    ||= 50;

# One entry per sequence: a header line opens a new entry and every
# following sequence line adds to it.
my @len = ();
open my $seq_fh, "<", $seqs or die "Cannot open file: $seqs: $!\n";
while(my $ln = <$seq_fh>){
  if($ln =~ /^>/){
    push @len, 0;
    next;
  }
  next if $ln =~ /^;/; # Comment line
  chomp $ln;
  $ln =~ s/\W//g;
  unless(@len){
    # No header seen yet: $len[-1] does not exist and cannot be assigned
    # to. Blank lines are harmless, anything else is a malformed file.
    next unless length $ln;
    die "Sequence data found before the first header: $seqs: line $.\n";
  }
  $len[-1] += length $ln;
}
close $seq_fh;

# Keep only the sequences passing the length filter, shortest first.
@len = sort { $a <=> $b } grep { $_ >= $minlen } @len;
my $tot = (sum(@len) || 0);

# Walk the sorted lengths until the accumulated length reaches the
# requested percentage of the total.
# NOTE(review): lengths are accumulated from the shortest sequence up; for
# values other than 50 this may not match the usual largest-first
# definition of N** -- confirm this is intended.
my $thr = $n__*$tot/100;
my $pos = 0;
for my $l (@len){
  $pos += $l;
  if($pos >= $thr){
    print "N$n__: $l\n";
    last;
  }
}
print "Sequences: ".scalar(@len)."\n";
print "Total length: $tot\n";
|
56
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
#
|
3
|
+
# @author Luis M. Rodriguez-R
|
4
|
+
# @update Oct-07-2015
|
5
|
+
# @license artistic license 2.0
|
6
|
+
#
|
7
|
+
|
8
|
+
use warnings;
|
9
|
+
use strict;
|
10
|
+
|
11
|
+
# Filters the sequences of a FastA file by their proportion of Ns and by
# the presence of long homopolymers.
#
# Positional arguments (read from @ARGV):
#   file     Path to the input FastA file (required; the usage text is
#            shown if it is missing).
#   content  Maximum proportion of Ns tolerated in a sequence (default:
#            0.5; any value of 1 or more disables this filter).
#   stretch  Sequences containing a run of this many (or more) identical
#            nucleotides are discarded (default: 100; 0 disables this
#            filter).
#
# Sequences passing both filters are printed to STDOUT; the number of
# sequences read and kept is reported on STDERR.
my($file, $content, $stretch) = @ARGV;
$file or die <<HELP

Description:
Filter sequences by N-content and presence of long homopolymers.
Usage:
$0 sequences.fa [content [stretch]] > filtered.fa
Where:
sequences.fa Input file in FastA format
content A number between 0 and 1 indicating the maximum proportion of Ns
(1 to turn off, 0.5 by default)
stretch A number indicating the maximum number of consecutive identical
nucleotides allowed (0 to turn off, 100 by default)
filtered.fa Filtered set of sequences.

HELP
;
# Fall back to the defaults only when the argument is absent or empty.
# A plain `||=` would also replace an explicit 0 (which is false in Perl),
# making it impossible to turn the stretch filter off or to reject every
# sequence containing Ns.
$content = 0.5 unless defined $content and length $content;
$stretch = 100 unless defined $stretch and length $stretch;
$content += 0;
$stretch += 0;

my $good = 0;
my $N = 0;

FASTA: {
  # Read one whole FastA record at a time.
  local $/ = "\n>";
  open my $fasta_fh, "<", $file
    or die "I can not open the file: $file: $!\n";
  SEQ: while(<$fasta_fh>){
    $N++;
    s/^;.*//gm; # Drop comment lines
    s/>//g;     # Drop the record delimiters
    my($n,$s) = split /\n/, $_, 2;
    # A record made only of a header line has no sequence part.
    $s = "" unless defined $s;
    (my $clean = $s) =~ s/[^ACTGN]//g;
    if($content < 1){
      (my $Ns = $clean) =~ s/[^N]//g;
      next SEQ if length($Ns)>length($clean)*$content;
    }
    if($stretch > 0){
      for my $nuc (qw(A C T G N)){
        next SEQ if $clean =~ m/[$nuc]{$stretch}/;
      }
    }
    print ">$n\n$s\n";
    $good++;
  }
  close $fasta_fh;
  print STDERR "Total sequences: $N\nAfter filtering: $good\n";
}
|
58
|
+
|
59
|
+
|
60
|
+
|