miga-base 0.7.26.3 → 1.0.0.sr1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
- data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
- data/lib/miga/cli/action/doctor.rb +50 -19
- data/lib/miga/cli/action/doctor/base.rb +20 -18
- data/lib/miga/cli/action/init.rb +11 -7
- data/lib/miga/cli/action/init/files_helper.rb +1 -0
- data/lib/miga/cli/action/ncbi_get.rb +3 -3
- data/lib/miga/cli/action/tax_dist.rb +2 -2
- data/lib/miga/cli/action/wf.rb +5 -4
- data/lib/miga/daemon.rb +11 -4
- data/lib/miga/dataset/result.rb +10 -6
- data/lib/miga/json.rb +1 -2
- data/lib/miga/metadata.rb +5 -1
- data/lib/miga/parallel.rb +11 -6
- data/lib/miga/project.rb +8 -8
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -2
- data/lib/miga/sqlite.rb +7 -0
- data/lib/miga/version.rb +23 -9
- data/scripts/aai_distances.bash +16 -18
- data/scripts/ani_distances.bash +16 -17
- data/scripts/assembly.bash +31 -16
- data/scripts/haai_distances.bash +3 -27
- data/scripts/miga.bash +6 -4
- data/scripts/p.bash +1 -1
- data/scripts/read_quality.bash +9 -18
- data/scripts/trimmed_fasta.bash +14 -30
- data/scripts/trimmed_reads.bash +36 -36
- data/test/parallel_test.rb +31 -0
- data/test/project_test.rb +2 -1
- data/utils/distance/commands.rb +1 -0
- data/utils/distance/runner.rb +2 -4
- data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
- data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
- data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
- data/utils/enveomics/Manifest/Tasks/other.json +77 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
- data/utils/enveomics/Manifest/categories.json +13 -4
- data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
- data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
- data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
- data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
- data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
- data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
- data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
- data/utils/enveomics/Scripts/SRA.download.bash +6 -8
- data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
- data/utils/enveomics/Scripts/aai.rb +3 -2
- data/utils/enveomics/Scripts/anir.rb +137 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
- data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
- data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
- data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
- data/utils/enveomics/Scripts/rbm.rb +87 -133
- data/utils/enveomics/Scripts/sam.filter.rb +148 -0
- data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
- data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
- data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
- data/utils/enveomics/enveomics.R/R/utils.R +30 -0
- data/utils/enveomics/enveomics.R/README.md +1 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
- data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
- data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
- data/utils/multitrim/Multitrim How-To.pdf +0 -0
- data/utils/multitrim/README.md +67 -0
- data/utils/multitrim/multitrim.py +1555 -0
- data/utils/multitrim/multitrim.yml +13 -0
- data/utils/requirements.txt +4 -3
- metadata +33 -6
- data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -1,63 +1,70 @@
|
|
1
1
|
#!/usr/bin/env ruby

# frozen_string_literal: true

# Rewrites the identifiers of a FastQ file as sequential, easy-to-parse tags
# (+@<prefix><n><suffix>+) and reduces every separator line to a bare '+'.
# Sequence and quality lines are copied unchanged.

$:.push File.expand_path('../lib', __FILE__)
require 'enveomics_rb/enveomics'
$VERSION = 1.1

o = { q: false, p: '', s: '' }
OptionParser.new do |opts|
  opts.version = $VERSION
  Enveomics.opt_banner(
    opts, 'Generates easy-to-parse tagged reads from FastQ files',
    "#{File.basename($0)} -i in.fastq -o out.fastq [options]"
  )

  opts.separator 'Mandatory'
  opts.on(
    '-i', '--in FILE',
    'Path to the FastQ file containing the sequences',
    'Supports compression with .gz extension, use - for STDIN'
  ) { |v| o[:in] = v }
  opts.on(
    '-o', '--out FILE', 'Path to the FastQ to create',
    'Supports compression with .gz extension, use - for STDOUT'
  ) { |v| o[:out] = v }
  opts.separator ''
  opts.separator 'ID options'
  opts.on('-p', '--prefix STR', 'Prefix to use in all IDs') { |v| o[:p] = v }
  opts.on('-s', '--suffix STR', 'Suffix to use in all IDs') { |v| o[:s] = v }
  opts.separator ''
  opts.separator 'Other Options'
  opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
  opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
  opts.separator ''
end.parse!

raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?

begin
  ifh = reader(o[:in])
  ofh = writer(o[:out])
  i = 0   # Number of records seen so far, used as the new identifier
  lno = 0 # Number of input lines read so far
  ifh.each do |ln|
    ln.chomp!
    lno += 1
    # A FastQ record is four lines: @id, sequence, +, quality
    case lno % 4
    when 1
      # Report our own counter: `$.` is not maintained by every reader
      # (e.g., gzip-compressed input)
      ln =~ /^@/ or
        raise Enveomics::ParseError.new("Cannot parse line #{lno}: #{ln}")
      i += 1
      ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
    when 3
      ln =~ /^\+/ or
        raise Enveomics::ParseError.new("Cannot parse line #{lno}: #{ln}")
      ofh.puts '+'
    else
      ofh.puts ln
    end
  end
  ifh.close
  ofh.close
rescue => err
  $stderr.puts "Exception: #{err}\n\n"
  err.backtrace.each { |l| $stderr.puts l + "\n" }
  # Signal the failure to the caller, as the option errors above already do
  exit 1
end
|
62
70
|
|
63
|
-
|
@@ -1,21 +1,19 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
|
3
|
-
|
4
|
-
# @author Luis M. Rodriguez-R
|
5
|
-
# @license artistic license 2.0
|
6
|
-
#
|
7
|
-
|
8
|
-
DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
|
3
|
+
DATA_LINK="https://www.ebi.ac.uk/ena/portal/api/filereport"
|
9
4
|
DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
|
10
5
|
SRX=$1
|
11
6
|
DIR=${2:-$SRX}
|
7
|
+
VERSION=1.0
|
12
8
|
|
13
9
|
if [[ "$SRX" == "" ]] ; then
|
14
10
|
echo "
|
11
|
+
[Enveomics Collection: $(basename "$0" .bash) $VERSION]
|
12
|
+
|
15
13
|
Downloads the set of runs from a project, sample, or experiment in SRA.
|
16
14
|
|
17
15
|
Usage:
|
18
|
-
$0 <SRA-ID>[ <dir>]
|
16
|
+
$(basename "$0") <SRA-ID>[ <dir>]
|
19
17
|
|
20
18
|
<SRA-ID> ID of the SRA Project, Sample, or Experiment.
|
21
19
|
<dir> Directory where the files are to be downladed. By default,
|
@@ -34,7 +32,7 @@ function md5value {
|
|
34
32
|
echo "$o"
|
35
33
|
}
|
36
34
|
|
37
|
-
curl -
|
35
|
+
curl -Ls "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
|
38
36
|
tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
|
39
37
|
srr=$(echo "$ln"|cut -f 1)
|
40
38
|
ftp=$(echo "$ln"|cut -f 2)
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env Rscript

#= Load stuff
# The enveomics.R sources are looked up relative to this script's own path,
# which Rscript exposes through the --file= argument
args <- commandArgs(trailingOnly = FALSE)
script_path <- sub('^--file=', '', args[grep('^--file=', args)])
enveomics_R <- file.path(dirname(script_path), 'lib', 'enveomics.R')
for(file in c('cliopts.R','utils.R','prefscore.R'))
  source(file.path(enveomics_R, 'R', file))

#= Generate interface
opt <- enve.cliopts(
  enve.prefscore,
  file.path(enveomics_R, 'man', 'enve.prefscore.Rd'),
  positional_arguments = c(1, 4),
  usage = 'usage: %prog [options] output.tsv [output.pdf [width height]]',
  mandatory = c('x', 'set'),
  number = c('signif.thr'),
  ignore = c('plot'),
  o_desc = list(
    x = 'A tab-delimited table of presence/absence (1/0) with species as rows and samples as columns.',
    set = 'A list of sample names that constitute the test set, one per line',
    ignore = 'A list of species to exclude from the analysis, one per line'
  )
)

#= Set output files
# Reads the first column of a headerless tab-delimited file as plain values
read_first_column <- function(path) {
  read.table(path, header = FALSE, sep = '\t', as.is = TRUE)[,1]
}

# Replace the input paths by the data they contain
opt$options[['x']] <- read.table(
  opt$options[['x']], header = TRUE, row.names = 1, sep = '\t'
)
opt$options[['set']] <- read_first_column(opt$options[['set']])
if(!is.null(opt$options[['ignore']])) {
  opt$options[['ignore']] <- read_first_column(opt$options[['ignore']])
}

# Any positional argument after the first is passed on to pdf():
# the file, then the optional width and height as numbers
with_pdf <- length(opt$args) > 1
if(with_pdf) {
  pdf_args <- as.list(opt$args[-1])
  numeric_idx <- intersect(2:3, seq_along(pdf_args))
  pdf_args[numeric_idx] <- lapply(pdf_args[numeric_idx], as.numeric)
  do.call('pdf', pdf_args)
} else {
  opt$options[['plot']] <- FALSE
}

#= Run it!
y <- do.call('enve.prefscore', opt$options)
write.table(y, opt$args[1], quote = FALSE, sep = '\t', col.names = FALSE)
if(with_pdf) invisible(dev.off())
|
@@ -236,8 +236,9 @@ Dir.mktmpdir do |dir|
|
|
236
236
|
end
|
237
237
|
end
|
238
238
|
response = RestClient.post(
|
239
|
-
|
240
|
-
db:
|
239
|
+
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
|
240
|
+
db: 'protein', rettype: 'fasta', id: protIds.join(','), idtype: 'acc'
|
241
|
+
)
|
241
242
|
abort "Unable to reach NCBI EUtils, error code " +
|
242
243
|
response.code.to_s + "." unless response.code == 200
|
243
244
|
fo.puts response.to_str
|
@@ -0,0 +1,137 @@
|
|
1
|
+
#!/usr/bin/env ruby

# frozen_string_literal: true

# Command-line interface for Enveomics::ANIr: parses the options below into a
# hash, runs the analysis, and optionally saves a one-row tabular summary.

$:.push File.expand_path('../lib', __FILE__)
require 'enveomics_rb/enveomics'
require 'enveomics_rb/anir'
$VERSION = 1.0

# Defaults; every key can be overridden from the command line
o = {
  q: false, threads: 2,
  r_format: :fastq, g_format: :fasta, m_format: :sam, r_type: :single,
  identity: 95.0, algorithm: :auto, bimodality: 0.5, bin_size: 1.0,
  coefficient: :sarle
}

OptionParser.new do |opt|
  cmd = File.basename($0)
  opt.banner = <<~BANNER

    [Enveomics Collection: #{cmd} v#{$VERSION}]

    Estimates ANIr: the Average Nucleotide Identity of reads against a genome

    Usage
      # [ Input/output modes ]
      # Run mapping and (optionally) save it as SAM
      # Requires bowtie2
      #{cmd} -r reads.fastq -g genome.fasta -m out_map.sam [options]

      # Read mapping from BAM file
      # Requires samtools
      #{cmd} -m map.bam --m-format bam [options]

      # Read mapping from other formats: SAM or Tabular BLAST
      #{cmd} -m map.blast --m-format tab [options]

      # Read a list of identities as percentage (contig filtering off)
      #{cmd} -m identities.txt --m-format list [options]

      # [ Identity threshold modes ]
      #{cmd} -i 95 -a fix [options] # Set fixed identity threshold
      #{cmd} -a gmm [options] # Find valley by EM of GMM
      #{cmd} -a auto [options] # Pick method by bimodality (default)

  BANNER

  opt.separator 'Input/Output'
  opt.on('-r', '--reads PATH', 'Metagenomic reads') { |v| o[:r] = v }
  opt.on('-g', '--genome PATH', 'Genome assembly') { |v| o[:g] = v }
  opt.on('-m', '--mapping PATH', 'Mapping file') { |v| o[:m] = v }
  opt.on('-L', '--list PATH', 'Output file with identities') { |v| o[:L] = v }
  opt.on('-H', '--hist PATH', 'Output file with histogram') { |v| o[:H] = v }
  opt.on(
    '-T', '--tab PATH', 'Output file with results in tabular format'
  ) { |v| o[:T] = v }
  opt.separator ''

  opt.separator 'Formats'
  opt.on(
    '--r-format STRING',
    'Metagenomic reads format: fastq (default) or fasta',
    'Both options support compression with .gz file extension'
  ) { |v| o[:r_format] = v.downcase.to_sym }
  opt.on(
    '--r-type STRING', 'Type of metagenomic reads:',
    '~ single (default): Single reads',
    '~ coupled: Coupled reads in separate files (-r must be comma-delimited)',
    '~ interleaved: Coupled reads in a single interposed file'
  ) { |v| o[:r_type] = v.downcase.to_sym }
  opt.on(
    '--g-format STRING',
    'Genome assembly format: fasta (default) or list',
    'Both options support compression with .gz file extension',
    'If passed in mapping-read mode, filters only matches to these contigs'
  ) { |v| o[:g_format] = v.downcase.to_sym }
  opt.on(
    '--m-format STRING',
    'Mapping file format: sam (default), bam, tab, or list',
    'sam, tab, and list options support compression with .gz file extension'
  ) { |v| o[:m_format] = v.downcase.to_sym }
  opt.separator ''

  opt.separator 'Identity threshold'
  opt.on(
    '-i', '--identity FLOAT', Float,
    "Set a fixed threshold of percent identity (default: #{o[:identity]})"
  ) { |v| o[:identity] = v }
  opt.on(
    '-a', '--algorithm STRING',
    'Set an algorithm to automatically detect identity threshold:',
    '~ gmm: Valley detection by E-M of Gaussian Mixture Model',
    '~ fix: Fixed threshold, see -i',
    '~ auto (default): Pick gmm or fix depending on bimodality, see -b'
  ) { |v| o[:algorithm] = v.downcase.to_sym }
  opt.on(
    '-b', '--bimodality FLOAT', Float,
    'Threshold of bimodality below which the algorithm is set to fix',
    'The coefficient used is set by --coefficient',
    "By default: #{o[:bimodality]}"
  ) { |v| o[:bimodality] = v }
  opt.on(
    '--coefficient STRING',
    'Coefficient of bimodality for -a auto:',
    '~ sarle (default): Sarle\'s bimodality coefficient b',
    '~ dma: de Michele and Accatino (2014 PLoS ONE) B index, use with -b 0.1'
  ) { |v| o[:coefficient] = v.downcase.to_sym }
  opt.on(
    '--bin-size FLOAT', Float,
    "Width of histogram bins (in percent identity). By default: #{o[:bin_size]}"
  ) { |v| o[:bin_size] = v }
  opt.separator ''

  opt.separator 'General'
  opt.on(
    '-t', '--threads INT', Integer, 'Threads to use'
  ) { |v| o[:threads] = v }
  opt.on('-l', '--log PATH', 'Log file to save output') { |v| o[:log] = v }
  opt.on('-q', '--quiet', 'Run quietly') { |v| o[:q] = v }
  opt.on('-h', '--help', 'Display this screen') do
    puts opt
    exit
  end
  opt.separator ''
end.parse!

anir = Enveomics::ANIr.new(o)
anir.go!

# One-row summary: the identity threshold is read back from the ANIr options
# rather than from the local hash
if o[:T]
  File.open(o[:T], 'w') do |fh|
    fh.puts "anir\tsd\treads\tid_threshold"
    fh.puts [
      anir.sample.mean, anir.sample.sd, anir.sample.n, anir.opts[:identity]
    ].join("\t")
  end
end
|
137
|
+
|
@@ -0,0 +1,293 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'enveomics_rb/stats'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'shellwords'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'zlib'
|
8
|
+
|
9
|
+
module Enveomics
  # Wrapper class for ANIr estimation: reads (or generates) a mapping of
  # reads against a genome, collects the percent identity of each mapped read,
  # and reports the mean of the identities at or above a threshold.
  #
  # Use as: +ANIr.new(opts).go!+
  class ANIr
    # Options hash
    attr_reader :opts

    # Identities list (unsorted)
    attr_reader :identities

    # Initialize with the options hash +opts+, which is kept by reference and
    # may be modified by the analysis (+:m+ and +:algorithm+)
    def initialize(opts)
      @opts = opts
      @identities = []
      @filter_contigs = false
      @contigs_to_filter = []
      @tmpdir = nil
      @sample = nil
      @bimodality = nil
    end

    # --------------------------------------------------[ High-level pipelines ]

    # Perform all the analyses
    def go!
      read_input
      detect_identity
      estimate_ani_r
    end

    # Identify input/output mode and read mapping
    #
    # Unless the input is a plain list of identities, a temporary directory is
    # created (and always removed on exit), the mapping is generated if the
    # mapping file doesn't exist yet, and the contigs to filter are loaded if
    # a genome was passed. Raises Enveomics::OptionError if the mapping format
    # is not supported
    def read_input
      if opts[:m_format] != :list
        @tmpdir = Dir.mktmpdir
        @filter_contigs = !opts[:g].nil?
        opts[:m] = File.join(@tmpdir, 'map.sam') if opts[:m].nil?
        run_mapping unless File.exist? opts[:m]
        load_contigs_to_filter if @filter_contigs
      end

      # Dispatch to the read_mapping_from_* method matching the format
      read_mapping = :"read_mapping_from_#{opts[:m_format]}"
      raise Enveomics::OptionError.new(
        "Unsupported mapping format: #{opts[:m_format]}"
      ) unless respond_to? read_mapping
      @identities = []
      send(read_mapping)
      say "- Unfiltered average identity: #{sample.mean}"
      say "- Reads mapped: #{sample.n}"
      save_identities
      save_histogram
    ensure
      @tmpdir ||= nil
      FileUtils.rm_rf @tmpdir if @tmpdir
    end

    # Identify the identity threshold
    #
    # With +opts[:algorithm]+ set to +:auto+, the algorithm is replaced by
    # +:gmm+ if the bimodality reaches +opts[:bimodality]+, or by +:fix+
    # otherwise
    def detect_identity
      say 'Detecting identity threshold'
      if opts[:algorithm] == :auto
        say "- Bimodality: #{bimodality}"
        opts[:algorithm] = bimodality >= opts[:bimodality] ? :gmm : :fix
      end
      say "- Algorithm: #{opts[:algorithm]}"
      detect_identity_by_gmm if opts[:algorithm] == :gmm
    end

    # Estimate ANIr
    #
    # Discards (in place) all identities below +opts[:identity]+
    def estimate_ani_r
      say 'Estimating ANIr'
      @sample = nil # Empty cached sample
      @identities.delete_if { |i| i < opts[:identity] }
      say "- ANIr: #{sample.mean}"
    end

    # -----------------------------------------------------------------[ Utils ]

    # Show progress unless +opts[:q]+, and append it to +opts[:log]+ if set
    def say(*msg)
      o = '[%s] %s' % [Time.now, msg.join]
      $stderr.puts(o) unless opts[:q]
      File.open(opts[:log], 'a') { |fh| fh.puts o } if opts[:log]
    end

    # Execute command in the shell
    #
    # +cmd+ is an Array with the executable and its arguments, executed
    # without an intermediate shell. The combined STDOUT and STDERR of the
    # command are appended to +opts[:log]+ (if set). Raises
    # Enveomics::CommandError if the command cannot be launched or fails
    def run(cmd)
      say " - Running: #{cmd.join(' ')}"
      begin
        # The exit status must be that of the command itself: piping the
        # output through another process would report that process instead
        output = IO.popen(cmd.map(&:to_s), err: %i[child out], &:read)
        status = $?
      rescue SystemCallError => e
        raise Enveomics::CommandError.new("#{cmd.first} failed: #{e.message}")
      end
      File.open(opts[:log], 'a') { |fh| fh.print output } if opts[:log]
      return if status.success?

      raise Enveomics::CommandError.new("#{cmd.first} failed: #{status}")
    end

    # Returns an open file handler for the file, supporting .gz
    def reader(file)
      file =~ /\.gz$/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
    end

    # Is the mapping in SAM format?
    def sam?
      opts[:m_format] == :sam
    end

    # ------------------------------------------------------------[ Map it out ]

    # Execute Bowtie2 and generate SAM file
    #
    # All options are validated before any command is launched. Raises
    # Enveomics::OptionError if the mapping format is not SAM, the genome
    # format is not FastA, the reads or the genome are missing, or the type
    # of reads is not supported. Turns off contig filtering
    def run_mapping
      say 'Running mapping using Bowtie2'
      raise Enveomics::OptionError.new(
        'Only SAM output is supported for mapping'
      ) unless sam?
      raise Enveomics::OptionError.new(
        'Only FastA genome input is supported for mapping'
      ) unless opts[:g_format] == :fasta
      raise Enveomics::OptionError.new(
        'Both reads (-r) and genome (-g) are required to run mapping'
      ) if opts[:r].nil? || opts[:g].nil?
      reads_args = bowtie2_reads_args

      @filter_contigs = false
      say '- Indexing input sequences'
      idx = File.join(@tmpdir, 'genome.idx')
      run(['bowtie2-build', opts[:g], idx])

      say '- Mapping metagenomic reads to genome assembly'
      cmd = [
        'bowtie2', '-x', idx, '-p', opts[:threads], '-S', opts[:m], '--no-mixed'
      ]
      cmd << '-f' if opts[:r_format] == :fasta
      run(cmd + reads_args)
    end

    # If +@filter_contigs+ is true, reads the genome assembly and saves contig
    # names to filter the mapping
    #
    # Raises Enveomics::OptionError if the genome format is not supported
    def load_contigs_to_filter
      return unless @filter_contigs

      say 'Loading contigs to filter'
      fh = reader(opts[:g])
      begin
        @contigs_to_filter =
          case opts[:g_format]
          when :fasta
            # Contig name: the defline up to the first whitespace
            fh.each.map { |ln| ln[/^>(\S+)/, 1] }.compact
          when :list
            fh.each.map(&:chomp)
          else
            raise Enveomics::OptionError.new(
              "Unsupported genome assembly format: #{opts[:g_format]}"
            )
          end
      ensure
        fh.close
      end
      say "- Got #{@contigs_to_filter.size} contigs"
    end

    # Reads the mapping file assuming SAM format
    def read_mapping_from_sam
      say 'Reading mapping from SAM file'
      fh = reader(opts[:m])
      begin
        fh.each { |ln| parse_sam_line(ln) }
      ensure
        fh.close
      end
    end

    # Reads the mapping file assuming BAM format
    #
    # Uses +samtools view+ to decode the file. Raises Enveomics::CommandError
    # if samtools cannot be launched or fails
    def read_mapping_from_bam
      say 'Reading mapping from BAM file'
      begin
        IO.popen(['samtools', 'view', opts[:m]]) do |fh|
          fh.each { |ln| parse_sam_line(ln) }
        end
      rescue SystemCallError => e
        raise Enveomics::CommandError.new("samtools failed: #{e.message}")
      end
      return if $?.success?

      raise Enveomics::CommandError.new("samtools failed: #{$?}")
    end

    # Reads the mapping file assuming a Tabular BLAST format
    #
    # The subject is expected in the second column and the percent identity
    # in the third
    def read_mapping_from_tab
      say 'Reading mapping from Tabular BLAST file'
      fh = reader(opts[:m])
      begin
        fh.each do |ln|
          next if ln =~ /^\s*(#.*)?$/ # Comment or empty line

          row = ln.chomp.split("\t")
          next if @filter_contigs && !@contigs_to_filter.include?(row[1])

          @identities << row[2].to_f
        end
      ensure
        fh.close
      end
    end

    # Reads the identities from a raw-text list
    def read_mapping_from_list
      say 'Reading identities from raw text list'
      fh = reader(opts[:m])
      begin
        @identities = fh.each.map(&:to_f)
      ensure
        fh.close
      end
    end

    # Parses one line in SAM format
    #
    # Header lines, empty lines, unmapped reads, and reads mapped to contigs
    # excluded by the filter are ignored, as well as reads with a +YT+ flag
    # other than +CP+ or +UU+. The identity is estimated as the percentage of
    # the read length not accounted for by non-numeric characters in the +MD+
    # flag. Raises Enveomics::ParseError if the +MD+ flag is missing
    def parse_sam_line(ln)
      return if ln =~ /^@/ || ln =~ /^\s*$/

      row = ln.chomp.split("\t")
      return if row[2] == '*'
      return if @filter_contigs && !@contigs_to_filter.include?(row[2])

      length = row[9].size
      row.shift(11) # Discard non-flag columns
      # TAG:TYPE:VALUE => { TAG => VALUE }
      flags = Hash[row.map { |i| i.sub(/:.:/, ':').split(':', 2) }]
      return if flags['YT'] && !%w[CP UU].include?(flags['YT'])

      unless flags['MD']
        raise Enveomics::ParseError.new(
          "SAM line missing MD flag:\n#{ln}\nFlags: #{flags}"
        )
      end
      mismatches = flags['MD'].scan(/[^\d]/).count
      @identities << 100.0 * (length - mismatches) / length
    end

    # Save identities as raw text, if +opts[:L]+ is set
    def save_identities
      return unless opts[:L]

      say '- Saving identities'
      File.open(opts[:L], 'w') do |fh|
        identities.each { |i| fh.puts i }
      end
    end

    # Save identity histogram as raw text, if +opts[:H]+ is set
    def save_histogram
      return unless opts[:H]

      say '- Saving histogram'
      File.open(opts[:H], 'w') do |fh|
        fh.puts "from\tto\tcount"
        sample.histo_ranges.each_with_index do |r, k|
          fh.puts((r + [sample.histo_counts[k]]).join("\t"))
        end
      end
    end

    # -----------------------------------------------------------[ Peak finder ]

    # Detect identity threshold by gaussian mixture model EM
    def detect_identity_by_gmm
      model_identities_by_gmm_em
      detect_valley_by_gmm
    end

    # Model identities as a 2-gaussian mix by EM
    #
    # Currently always raises Enveomics::UnimplementedError
    def model_identities_by_gmm_em
      say 'Modeling identities by gaussian mixture model using EM'
      # TODO: Implement
      raise Enveomics::UnimplementedError.new('Unimplemented operation')
    end

    # Detect valley by gaussian mix
    #
    # Currently always raises Enveomics::UnimplementedError
    def detect_valley_by_gmm
      say 'Detecting valley by gaussian mixture model'
      # TODO: Implement
      raise Enveomics::UnimplementedError.new('Unimplemented operation')
    end

    # -----------------------------------------------------------[ Do the math ]

    # Identities as a Enveomics::Stats::Sample object
    #
    # The object is cached until the next call to +estimate_ani_r+
    def sample
      @sample ||= Enveomics::Stats::Sample.new(
        identities,
        effective_range: [nil, 100.0],
        histo_bin_size: opts[:bin_size]
      )
    end

    # Returns the bimodality coefficient indicated by +opts[:coefficient]+
    #
    # Raises Enveomics::OptionError if the coefficient is not supported
    def bimodality
      @bimodality ||=
        case opts[:coefficient]
        when :sarle
          sample.sarle_bimodality
        when :dma
          sample.dma_bimodality
        else
          raise Enveomics::OptionError.new(
            "Unsupported coefficient of bimodality: #{opts[:coefficient]}"
          )
        end
    end

    private

    # Bowtie2 arguments describing the input reads for +opts[:r_type]+
    def bowtie2_reads_args
      case opts[:r_type]
      when :single
        ['-U', opts[:r]]
      when :coupled
        pairs = opts[:r].split(',', 2)
        raise Enveomics::OptionError.new(
          'Coupled reads require two comma-delimited files'
        ) if pairs.size < 2
        ['-1', pairs[0], '-2', pairs[1], '--no-discordant']
      when :interleaved
        ['--interleaved', opts[:r], '--no-discordant']
      else
        raise Enveomics::OptionError.new(
          "Unsupported reads type: #{opts[:r_type]}"
        )
      end
    end
  end
end
|