miga-base 0.7.26.2 → 1.0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/classify_wf.rb +2 -2
- data/lib/miga/cli/action/derep_wf.rb +1 -1
- data/lib/miga/cli/action/doctor.rb +57 -14
- data/lib/miga/cli/action/doctor/base.rb +47 -23
- data/lib/miga/cli/action/env.rb +26 -0
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/cli/base.rb +1 -0
- data/lib/miga/common.rb +1 -0
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +5 -4
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +36 -0
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +10 -2
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +12 -8
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/test/remote_dataset_test.rb +1 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/database.rb +0 -1
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- data/utils/subclade/pipeline.rb +2 -2
- metadata +33 -4
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$VERSION = 1.2
|
4
|
+
$:.push File.expand_path('../lib', __FILE__)
|
5
|
+
require 'enveomics_rb/enveomics'
|
6
|
+
|
7
|
+
o = { q: false, offset: 33, qual: 15, fasta: false }
|
8
|
+
OptionParser.new do |opts|
|
9
|
+
opts.version = $VERSION
|
10
|
+
Enveomics.opt_banner(
|
11
|
+
opts, 'Masks low-quality bases in a FastQ file',
|
12
|
+
"#{File.basename($0)} -i in.fastq -o out.fastq [options]"
|
13
|
+
)
|
14
|
+
|
15
|
+
opts.separator 'Mandatory'
|
16
|
+
opts.on(
|
17
|
+
'-i', '--input FILE',
|
18
|
+
'Path to the FastQ file containing the sequences',
|
19
|
+
'Supports compression with .gz extension, use - for STDIN'
|
20
|
+
) { |v| o[:in] = v }
|
21
|
+
opts.on(
|
22
|
+
'-o', '--out FILE',
|
23
|
+
'Path to the output FastQ file',
|
24
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
25
|
+
) { |v| o[:out] = v }
|
26
|
+
|
27
|
+
opts.separator ''
|
28
|
+
opts.separator 'Quality Options'
|
29
|
+
opts.on(
|
30
|
+
'-q', '--qual INT', Integer,
|
31
|
+
"Minimum quality score to allow a base, by default: #{o[:qual]}"
|
32
|
+
) { |v| o[:qual] = v }
|
33
|
+
opts.on(
|
34
|
+
'--offset INT', Integer,
|
35
|
+
"Q-score offset, by default: #{o[:offset]}"
|
36
|
+
) { |v| o[:offset] = v }
|
37
|
+
|
38
|
+
opts.separator ''
|
39
|
+
opts.separator 'Other Options'
|
40
|
+
opts.on(
|
41
|
+
'-a', '--fasta', 'Output sequences in FastA format'
|
42
|
+
) { |v| o[:fasta] = v }
|
43
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
44
|
+
opts.on('-h', '--help', 'Display this screen') do
|
45
|
+
puts opts
|
46
|
+
exit
|
47
|
+
end
|
48
|
+
opts.separator ''
|
49
|
+
end.parse!
|
50
|
+
|
51
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
52
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
53
|
+
$QUIET = o[:q]
|
54
|
+
|
55
|
+
# Open in/out files
|
56
|
+
say 'Reading FastQ file'
|
57
|
+
ifh = reader(o[:in])
|
58
|
+
ofh = writer(o[:out])
|
59
|
+
|
60
|
+
# Parse and mask
|
61
|
+
entry = []
|
62
|
+
lno = 0
|
63
|
+
ifh.each_line do |ln|
|
64
|
+
lno += 1 # <- Gzip doesn't support $.
|
65
|
+
case lno % 4
|
66
|
+
when 1
|
67
|
+
ln =~ /^@(\S+)/ or
|
68
|
+
raise Enveomics::ParseError.new("Unexpected defline format: #{ln}")
|
69
|
+
entry << ln
|
70
|
+
when 2, 3
|
71
|
+
entry << ln
|
72
|
+
when 0
|
73
|
+
entry << ln
|
74
|
+
q = entry[3].chomp.split('').map { |i| (i.ord - o[:offset]) }
|
75
|
+
q.map { |i| i < o[:qual] }.each_with_index { |i, k| entry[1][k] = 'N' if i }
|
76
|
+
ofh.puts(o[:fasta] ? [entry[0].gsub(/^@/, '>'), entry[1]] : entry)
|
77
|
+
entry = []
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
# Finalize
|
82
|
+
say " Lines: #{lno}"
|
83
|
+
unless entry.empty?
|
84
|
+
raise Enveomics::ParseError.new('Unexpected trailing lines in FastQ')
|
85
|
+
end
|
86
|
+
say " Sequences: #{lno / 4}"
|
87
|
+
ifh.close
|
88
|
+
ofh.close
|
89
|
+
|
@@ -1,63 +1,70 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
#
|
4
|
-
# @author: Luis M. Rodriguez-R
|
5
|
-
# @update: Feb-06-2015
|
6
|
-
# @license: artistic license 2.0
|
7
|
-
#
|
3
|
+
# frozen_string_literal: true
|
8
4
|
|
9
|
-
|
5
|
+
$:.push File.expand_path('../lib', __FILE__)
|
6
|
+
require 'enveomics_rb/enveomics'
|
7
|
+
$VERSION = 1.1
|
10
8
|
|
11
|
-
o = {:
|
12
|
-
ARGV << '-h' if ARGV.size==0
|
9
|
+
o = { q: false, p: '', s: '' }
|
13
10
|
OptionParser.new do |opts|
|
14
|
-
|
15
|
-
|
11
|
+
opts.version = $VERSION
|
12
|
+
Enveomics.opt_banner(
|
13
|
+
opts, 'Generates easy-to-parse tagged reads from FastQ files',
|
14
|
+
"#{File.basename($0)} -i in.fastq -o out.fastq [options]"
|
15
|
+
)
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
17
|
+
opts.separator 'Mandatory'
|
18
|
+
opts.on(
|
19
|
+
'-i', '--in FILE',
|
20
|
+
'Path to the FastQ file containing the sequences',
|
21
|
+
'Supports compression with .gz extension, use - for STDIN'
|
22
|
+
) { |v| o[:in] = v }
|
23
|
+
opts.on(
|
24
|
+
'-o', '--out FILE', 'Path to the FastQ to create',
|
25
|
+
'Supports compression with .gz extension, use - for STDOUT'
|
26
|
+
) { |v| o[:out] = v }
|
27
|
+
opts.separator ''
|
28
|
+
opts.separator 'ID options'
|
29
|
+
opts.on('-p', '--prefix STR', 'Prefix to use in all IDs') { |v| o[:p] = v }
|
30
|
+
opts.on('-s', '--suffix STR', 'Suffix to use in all IDs') { |v| o[:s] = v }
|
31
|
+
opts.separator ''
|
32
|
+
opts.separator 'Other Options'
|
33
|
+
opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
|
34
|
+
opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
|
35
|
+
opts.separator ''
|
34
36
|
end.parse!
|
35
|
-
|
36
|
-
|
37
|
-
|
37
|
+
|
38
|
+
raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
|
39
|
+
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
|
40
|
+
|
38
41
|
begin
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
42
|
+
ifh = reader(o[:in])
|
43
|
+
ofh = writer(o[:out])
|
44
|
+
i = 0
|
45
|
+
lno = 0
|
46
|
+
ifh.each do |ln|
|
47
|
+
ln.chomp!
|
48
|
+
lno += 1
|
49
|
+
case lno % 4
|
50
|
+
when 1
|
51
|
+
ln =~ /^@/ or
|
52
|
+
raise Enveomics::ParseError.new("Cannot parse line #{lno}: #{ln}") # lno, not $.: gzip readers do not update $.
|
53
|
+
i += 1
|
54
|
+
ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
|
55
|
+
when 3
|
56
|
+
ln =~ /^\+/ or
|
57
|
+
raise Enveomics::ParseError.new("Cannot parse line #{lno}: #{ln}") # lno, not $.: gzip readers do not update $.
|
58
|
+
ofh.puts '+'
|
59
|
+
else
|
60
|
+
ofh.puts ln
|
61
|
+
end
|
62
|
+
end
|
63
|
+
ifh.close
|
64
|
+
ofh.close
|
57
65
|
rescue => err
|
58
|
-
|
59
|
-
|
60
|
-
|
66
|
+
$stderr.puts "Exception: #{err}\n\n"
|
67
|
+
err.backtrace.each { |l| $stderr.puts l + "\n" }
|
68
|
+
err
|
61
69
|
end
|
62
70
|
|
63
|
-
|
@@ -1,21 +1,19 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
|
3
|
-
|
4
|
-
# @author Luis M. Rodriguez-R
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
|
3
|
+
DATA_LINK="https://www.ebi.ac.uk/ena/portal/api/filereport"
|
9
4
|
DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
|
10
5
|
SRX=$1
|
11
6
|
DIR=${2:-$SRX}
|
7
|
+
VERSION=1.0
|
12
8
|
|
13
9
|
if [[ "$SRX" == "" ]] ; then
|
14
10
|
echo "
|
11
|
+
[Enveomics Collection: $(basename "$0" .bash) $VERSION]
|
12
|
+
|
15
13
|
Downloads the set of runs from a project, sample, or experiment in SRA.
|
16
14
|
|
17
15
|
Usage:
|
18
|
-
$0 <SRA-ID>[ <dir>]
|
16
|
+
$(basename "$0") <SRA-ID>[ <dir>]
|
19
17
|
|
20
18
|
<SRA-ID> ID of the SRA Project, Sample, or Experiment.
|
21
19
|
<dir> Directory where the files are to be downloaded. By default,
|
@@ -34,7 +32,7 @@ function md5value {
|
|
34
32
|
echo "$o"
|
35
33
|
}
|
36
34
|
|
37
|
-
curl -
|
35
|
+
curl -Ls "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
|
38
36
|
tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
|
39
37
|
srr=$(echo "$ln"|cut -f 1)
|
40
38
|
ftp=$(echo "$ln"|cut -f 2)
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env Rscript
|
2
|
+
|
3
|
+
#= Load stuff
|
4
|
+
args <- commandArgs(trailingOnly = FALSE)
|
5
|
+
enveomics_R <- file.path(
|
6
|
+
dirname(sub('^--file=', '', args[grep('^--file=', args)])),
|
7
|
+
'lib',
|
8
|
+
'enveomics.R'
|
9
|
+
)
|
10
|
+
for(file in c('cliopts.R','utils.R','prefscore.R'))
|
11
|
+
source(file.path(enveomics_R, 'R', file))
|
12
|
+
|
13
|
+
#= Generate interface
|
14
|
+
opt <- enve.cliopts(
|
15
|
+
enve.prefscore,
|
16
|
+
file.path(enveomics_R, 'man', 'enve.prefscore.Rd'),
|
17
|
+
positional_arguments = c(1, 4),
|
18
|
+
usage = 'usage: %prog [options] output.tsv [output.pdf [width height]]',
|
19
|
+
mandatory = c('x', 'set'),
|
20
|
+
number = c('signif.thr'),
|
21
|
+
ignore = c('plot'),
|
22
|
+
o_desc = list(
|
23
|
+
x = 'A tab-delimited table of presence/absence (1/0) with species as rows and samples as columns.',
|
24
|
+
set = 'A list of sample names that constitute the test set, one per line',
|
25
|
+
ignore = 'A list of species to exclude from the analysis, one per line'
|
26
|
+
)
|
27
|
+
)
|
28
|
+
|
29
|
+
#= Set output files
|
30
|
+
opt$options[['x']] <- read.table(
|
31
|
+
opt$options[['x']],
|
32
|
+
header = TRUE,
|
33
|
+
row.names = 1,
|
34
|
+
sep = '\t'
|
35
|
+
)
|
36
|
+
opt$options[['set']] <- read.table(
|
37
|
+
opt$options[['set']],
|
38
|
+
header = FALSE,
|
39
|
+
sep = '\t',
|
40
|
+
as.is = TRUE
|
41
|
+
)[,1]
|
42
|
+
if(!is.null(opt$options[['ignore']]))
|
43
|
+
opt$options[['ignore']] <- read.table(
|
44
|
+
opt$options[['ignore']],
|
45
|
+
header = FALSE,
|
46
|
+
sep = '\t',
|
47
|
+
as.is = TRUE
|
48
|
+
)[,1]
|
49
|
+
if(length(opt$args) > 1) {
|
50
|
+
args <- as.list(opt$args[-1])
|
51
|
+
for(i in 2:3) if(length(args) >= i) args[[i]] <- as.numeric(args[[i]])
|
52
|
+
do.call('pdf', args)
|
53
|
+
} else {
|
54
|
+
opt$options[['plot']] <- FALSE
|
55
|
+
}
|
56
|
+
|
57
|
+
#= Run it!
|
58
|
+
y <- do.call('enve.prefscore', opt$options)
|
59
|
+
write.table(y, opt$args[1], quote = FALSE, sep = '\t', col.names = FALSE)
|
60
|
+
if(length(opt$args)>1) ttt <- dev.off()
|
@@ -236,8 +236,9 @@ Dir.mktmpdir do |dir|
|
|
236
236
|
end
|
237
237
|
end
|
238
238
|
response = RestClient.post(
|
239
|
-
|
240
|
-
db:
|
239
|
+
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
|
240
|
+
db: 'protein', rettype: 'fasta', id: protIds.join(','), idtype: 'acc'
|
241
|
+
)
|
241
242
|
abort "Unable to reach NCBI EUtils, error code " +
|
242
243
|
response.code.to_s + "." unless response.code == 200
|
243
244
|
fo.puts response.to_str
|
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
$:.push File.expand_path('../lib', __FILE__)
|
6
|
+
require 'enveomics_rb/enveomics'
|
7
|
+
require 'enveomics_rb/anir'
|
8
|
+
$VERSION = 1.0
|
9
|
+
|
10
|
+
o = {
|
11
|
+
q: false, threads: 2,
|
12
|
+
r_format: :fastq, g_format: :fasta, m_format: :sam, r_type: :single,
|
13
|
+
identity: 95.0, algorithm: :auto, bimodality: 0.5, bin_size: 1.0,
|
14
|
+
coefficient: :sarle
|
15
|
+
}
|
16
|
+
|
17
|
+
OptionParser.new do |opt|
|
18
|
+
cmd = File.basename($0)
|
19
|
+
opt.banner = <<~BANNER
|
20
|
+
|
21
|
+
[Enveomics Collection: #{cmd} v#{$VERSION}]
|
22
|
+
|
23
|
+
Estimates ANIr: the Average Nucleotide Identity of reads against a genome
|
24
|
+
|
25
|
+
Usage
|
26
|
+
# [ Input/output modes ]
|
27
|
+
# Run mapping and (optionally) save it as SAM
|
28
|
+
# Requires bowtie2
|
29
|
+
#{cmd} -r reads.fastq -g genome.fasta -m out_map.sam [options]
|
30
|
+
|
31
|
+
# Read mapping from BAM file
|
32
|
+
# Requires samtools
|
33
|
+
#{cmd} -m map.bam --m-format bam [options]
|
34
|
+
|
35
|
+
# Read mapping from other formats: SAM or Tabular BLAST
|
36
|
+
#{cmd} -m map.blast --m-format tab [options]
|
37
|
+
|
38
|
+
# Read a list of identities as percentage (contig filtering off)
|
39
|
+
#{cmd} -m identities.txt --m-format list [options]
|
40
|
+
|
41
|
+
# [ Identity threshold modes ]
|
42
|
+
#{cmd} -i 95 -a fix [options] # Set fixed identity threshold
|
43
|
+
#{cmd} -a gmm [options] # Find valley by EM of GMM
|
44
|
+
#{cmd} -a auto [options] # Pick method by bimodality (default)
|
45
|
+
|
46
|
+
BANNER
|
47
|
+
|
48
|
+
opt.separator 'Input/Output'
|
49
|
+
opt.on('-r', '--reads PATH', 'Metagenomic reads') { |v| o[:r] = v }
|
50
|
+
opt.on('-g', '--genome PATH', 'Genome assembly') { |v| o[:g] = v }
|
51
|
+
opt.on('-m', '--mapping PATH', 'Mapping file') { |v| o[:m] = v }
|
52
|
+
opt.on('-L', '--list PATH', 'Output file with identities') { |v| o[:L] = v }
|
53
|
+
opt.on('-H', '--hist PATH', 'Output file with histogram') { |v| o[:H] = v }
|
54
|
+
opt.on(
|
55
|
+
'-T', '--tab PATH', 'Output file with results in tabular format'
|
56
|
+
) { |v| o[:T] = v }
|
57
|
+
opt.separator ''
|
58
|
+
|
59
|
+
opt.separator 'Formats'
|
60
|
+
opt.on(
|
61
|
+
'--r-format STRING',
|
62
|
+
'Metagenomic reads format: fastq (default) or fasta',
|
63
|
+
'Both options support compression with .gz file extension'
|
64
|
+
) { |v| o[:r_format] = v.downcase.to_sym }
|
65
|
+
opt.on(
|
66
|
+
'--r-type STRING', 'Type of metagenomic reads:',
|
67
|
+
'~ single (default): Single reads',
|
68
|
+
'~ coupled: Coupled reads in separate files (-r must be comma-delimited)',
|
69
|
+
'~ interleaved: Coupled reads in a single interposed file'
|
70
|
+
) { |v| o[:r_type] = v.downcase.to_sym }
|
71
|
+
opt.on(
|
72
|
+
'--g-format STRING',
|
73
|
+
'Genome assembly format: fasta (default) or list',
|
74
|
+
'Both options support compression with .gz file extension',
|
75
|
+
'If passed in mapping-read mode, filters only matches to these contigs'
|
76
|
+
) { |v| o[:g_format] = v.downcase.to_sym }
|
77
|
+
opt.on(
|
78
|
+
'--m-format STRING',
|
79
|
+
'Mapping file format: sam (default), bam, tab, or list',
|
80
|
+
'sam, tab, and list options support compression with .gz file extension'
|
81
|
+
) { |v| o[:m_format] = v.downcase.to_sym }
|
82
|
+
opt.separator ''
|
83
|
+
|
84
|
+
opt.separator 'Identity threshold'
|
85
|
+
opt.on(
|
86
|
+
'-i', '--identity FLOAT', Float,
|
87
|
+
"Set a fixed threshold of percent identity (default: #{o[:identity]})"
|
88
|
+
) { |v| o[:identity] = v }
|
89
|
+
opt.on(
|
90
|
+
'-a', '--algorithm STRING',
|
91
|
+
'Set an algorithm to automatically detect identity threshold:',
|
92
|
+
'~ gmm: Valley detection by E-M of Gaussian Mixture Model',
|
93
|
+
'~ fix: Fixed threshold, see -i',
|
94
|
+
'~ auto (default): Pick gmm or fix depending on bimodality, see -b'
|
95
|
+
) { |v| o[:algorithm] = v.downcase.to_sym }
|
96
|
+
opt.on(
|
97
|
+
'-b', '--bimodality FLOAT', Float,
|
98
|
+
'Threshold of bimodality below which the algorithm is set to fix',
|
99
|
+
'The coefficient used is the de Michele & Accatino (2014) B index',
|
100
|
+
"By default: #{o[:bimodality]}"
|
101
|
+
) { |v| o[:bimodality] = v }
|
102
|
+
opt.on(
|
103
|
+
'--coefficient STRING',
|
104
|
+
'Coefficient of bimodality for -a auto:',
|
105
|
+
'~ sarle (default): Sarle\'s bimodality coefficient b',
|
106
|
+
'~ dma: de Michele and Accatino (2014 PLoS ONE) B index, use with -b 0.1'
|
107
|
+
) { |v| o[:coefficient] = v.downcase.to_sym }
|
108
|
+
opt.on(
|
109
|
+
'--bin-size FLOAT', Float,
|
110
|
+
"Width of histogram bins (in percent identity). By default: #{o[:bin_size]}"
|
111
|
+
) { |v| o[:bin_size] = v }
|
112
|
+
opt.separator ''
|
113
|
+
|
114
|
+
opt.separator 'General'
|
115
|
+
opt.on(
|
116
|
+
'-t', '--threads INT', Integer, 'Threads to use'
|
117
|
+
) { |v| o[:threads] = v }
|
118
|
+
opt.on('-l', '--log PATH', 'Log file to save output') { |v| o[:log] = v }
|
119
|
+
opt.on('-q', '--quiet', 'Run quietly') { |v| o[:q] = v }
|
120
|
+
opt.on('-h', '--help', 'Display this screen') do
|
121
|
+
puts opt
|
122
|
+
exit
|
123
|
+
end
|
124
|
+
opt.separator ''
|
125
|
+
end.parse!
|
126
|
+
|
127
|
+
anir = Enveomics::ANIr.new(o)
|
128
|
+
anir.go!
|
129
|
+
if o[:T]
|
130
|
+
File.open(o[:T], 'w') do |fh|
|
131
|
+
fh.puts "anir\tsd\treads\tid_threshold"
|
132
|
+
fh.puts [
|
133
|
+
anir.sample.mean, anir.sample.sd, anir.sample.n, anir.opts[:identity]
|
134
|
+
].join("\t")
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|