miga-base 0.7.26.3 → 1.0.0.sr1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/doctor.rb +50 -19
  7. data/lib/miga/cli/action/doctor/base.rb +20 -18
  8. data/lib/miga/cli/action/init.rb +11 -7
  9. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  10. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  11. data/lib/miga/cli/action/tax_dist.rb +2 -2
  12. data/lib/miga/cli/action/wf.rb +5 -4
  13. data/lib/miga/daemon.rb +11 -4
  14. data/lib/miga/dataset/result.rb +10 -6
  15. data/lib/miga/json.rb +1 -2
  16. data/lib/miga/metadata.rb +5 -1
  17. data/lib/miga/parallel.rb +11 -6
  18. data/lib/miga/project.rb +8 -8
  19. data/lib/miga/project/base.rb +4 -4
  20. data/lib/miga/project/result.rb +2 -2
  21. data/lib/miga/sqlite.rb +7 -0
  22. data/lib/miga/version.rb +23 -9
  23. data/scripts/aai_distances.bash +16 -18
  24. data/scripts/ani_distances.bash +16 -17
  25. data/scripts/assembly.bash +31 -16
  26. data/scripts/haai_distances.bash +3 -27
  27. data/scripts/miga.bash +6 -4
  28. data/scripts/p.bash +1 -1
  29. data/scripts/read_quality.bash +9 -18
  30. data/scripts/trimmed_fasta.bash +14 -30
  31. data/scripts/trimmed_reads.bash +36 -36
  32. data/test/parallel_test.rb +31 -0
  33. data/test/project_test.rb +2 -1
  34. data/utils/distance/commands.rb +1 -0
  35. data/utils/distance/runner.rb +2 -4
  36. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  37. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  38. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  39. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  41. data/utils/enveomics/Manifest/categories.json +13 -4
  42. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  43. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  44. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  45. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  46. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  47. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  48. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  49. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  50. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  51. data/utils/enveomics/Scripts/aai.rb +3 -2
  52. data/utils/enveomics/Scripts/anir.rb +137 -0
  53. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  54. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  55. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  56. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  57. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  58. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  59. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  64. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  65. data/utils/enveomics/Scripts/rbm.rb +87 -133
  66. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  67. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  68. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  69. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  70. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  71. data/utils/enveomics/enveomics.R/README.md +1 -0
  72. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  73. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  74. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  75. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  76. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  77. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  78. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  79. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  80. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  81. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  82. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  83. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  84. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  93. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  94. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  95. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  96. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  97. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  98. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  99. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  100. data/utils/multitrim/README.md +67 -0
  101. data/utils/multitrim/multitrim.py +1555 -0
  102. data/utils/multitrim/multitrim.yml +13 -0
  103. data/utils/requirements.txt +4 -3
  104. metadata +33 -6
  105. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -1,63 +1,70 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
- # @author: Luis M. Rodriguez-R
5
- # @update: Feb-06-2015
6
- # @license: artistic license 2.0
7
- #
3
+ # frozen_string_literal: true
8
4
 
9
- require 'optparse'
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ $VERSION = 1.1
10
8
 
11
- o = {:q=>FALSE, :p=>"", :s=>""}
12
- ARGV << '-h' if ARGV.size==0
9
+ o = { q: false, p: '', s: '' }
13
10
  OptionParser.new do |opts|
14
- opts.banner = "
15
- Generates easy-to-parse tagged reads from FastQ files.
11
+ opts.version = $VERSION
12
+ Enveomics.opt_banner(
13
+ opts, 'Generates easy-to-parse tagged reads from FastQ files',
14
+ "#{File.basename($0)} -i in.fastq -o out.fastq [options]"
15
+ )
16
16
 
17
- Usage: #{$0} [options]"
18
- opts.separator ""
19
- opts.separator "Mandatory"
20
- opts.on("-i", "--in FILE", "Path to the FastQ file containing the sequences."){ |v| o[:in] = v }
21
- opts.on("-o", "--out FILE", "Path to the FastQ to create."){ |v| o[:out] = v }
22
- opts.separator ""
23
- opts.separator "ID options"
24
- opts.on("-p", "--prefix STR", "Prefix to use in all IDs."){ |v| o[:p] = v }
25
- opts.on("-s", "--suffix STR", "Suffix to use in all IDs."){ |v| o[:s] = v }
26
- opts.separator ""
27
- opts.separator "Other Options"
28
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = TRUE }
29
- opts.on("-h", "--help", "Display this screen") do
30
- puts opts
31
- exit
32
- end
33
- opts.separator ""
17
+ opts.separator 'Mandatory'
18
+ opts.on(
19
+ '-i', '--in FILE',
20
+ 'Path to the FastQ file containing the sequences',
21
+ 'Supports compression with .gz extension, use - for STDIN'
22
+ ) { |v| o[:in] = v }
23
+ opts.on(
24
+ '-o', '--out FILE', 'Path to the FastQ to create',
25
+ 'Supports compression with .gz extension, use - for STDOUT'
26
+ ) { |v| o[:out] = v }
27
+ opts.separator ''
28
+ opts.separator 'ID options'
29
+ opts.on('-p', '--prefix STR', 'Prefix to use in all IDs') { |v| o[:p] = v }
30
+ opts.on('-s', '--suffix STR', 'Suffix to use in all IDs') { |v| o[:s] = v }
31
+ opts.separator ''
32
+ opts.separator 'Other Options'
33
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
34
+ opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
35
+ opts.separator ''
34
36
  end.parse!
35
- abort "-i is mandatory" if o[:in].nil?
36
- abort "-o is mandatory" if o[:out].nil?
37
-
37
+
38
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
39
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
40
+
38
41
  begin
39
- ifh = File.open(o[:in], 'r');
40
- ofh = File.open(o[:out], 'w');
41
- i=0
42
- while ln=ifh.gets
43
- ln.chomp!
44
- if $.%4==1 and not /^@/.match(ln).nil?
45
- i+=1
46
- ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
47
- elsif $.%4==2 or $.%4==0
48
- ofh.puts ln
49
- elsif $.%4==3 and not /^\+/.match(ln).nil?
50
- ofh.puts "+"
51
- else
52
- abort "Impossible to parse line #{$.}: #{ln}.\n"
53
- end
54
- end
55
- ifh.close
56
- ofh.close
42
+ ifh = reader(o[:in])
43
+ ofh = writer(o[:out])
44
+ i = 0
45
+ lno = 0
46
+ ifh.each do |ln|
47
+ ln.chomp!
48
+ lno += 1
49
+ case lno % 4
50
+ when 1
51
+ ln =~ /^@/ or
52
+ raise Enveomics::ParseError.new("Cannot parse line #{$.}: #{ln}")
53
+ i += 1
54
+ ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
55
+ when 3
56
+ ln =~ /^\+/ or
57
+ raise Enveomics::ParseError.new("Cannot parse line #{$.}: #{ln}")
58
+ ofh.puts '+'
59
+ else
60
+ ofh.puts ln
61
+ end
62
+ end
63
+ ifh.close
64
+ ofh.close
57
65
  rescue => err
58
- $stderr.puts "Exception: #{err}\n\n"
59
- err.backtrace.each { |l| $stderr.puts l + "\n" }
60
- err
66
+ $stderr.puts "Exception: #{err}\n\n"
67
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
68
+ err
61
69
  end
62
70
 
63
-
@@ -1,21 +1,19 @@
1
1
  #!/bin/bash
2
2
 
3
- #
4
- # @author Luis M. Rodriguez-R
5
- # @license artistic license 2.0
6
- #
7
-
8
- DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
3
+ DATA_LINK="https://www.ebi.ac.uk/ena/portal/api/filereport"
9
4
  DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
10
5
  SRX=$1
11
6
  DIR=${2:-$SRX}
7
+ VERSION=1.0
12
8
 
13
9
  if [[ "$SRX" == "" ]] ; then
14
10
  echo "
11
+ [Enveomics Collection: $(basename "$0" .bash) $VERSION]
12
+
15
13
  Downloads the set of runs from a project, sample, or experiment in SRA.
16
14
 
17
15
  Usage:
18
- $0 <SRA-ID>[ <dir>]
16
+ $(basename "$0") <SRA-ID>[ <dir>]
19
17
 
20
18
  <SRA-ID> ID of the SRA Project, Sample, or Experiment.
21
19
  <dir> Directory where the files are to be downloaded. By default,
@@ -34,7 +32,7 @@ function md5value {
34
32
  echo "$o"
35
33
  }
36
34
 
37
- curl -s "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
35
+ curl -Ls "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
38
36
  tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
39
37
  srr=$(echo "$ln"|cut -f 1)
40
38
  ftp=$(echo "$ln"|cut -f 2)
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env Rscript
2
+
3
+ #= Load stuff
4
+ args <- commandArgs(trailingOnly = FALSE)
5
+ enveomics_R <- file.path(
6
+ dirname(sub('^--file=', '', args[grep('^--file=', args)])),
7
+ 'lib',
8
+ 'enveomics.R'
9
+ )
10
+ for(file in c('cliopts.R','utils.R','prefscore.R'))
11
+ source(file.path(enveomics_R, 'R', file))
12
+
13
+ #= Generate interface
14
+ opt <- enve.cliopts(
15
+ enve.prefscore,
16
+ file.path(enveomics_R, 'man', 'enve.prefscore.Rd'),
17
+ positional_arguments = c(1, 4),
18
+ usage = 'usage: %prog [options] output.tsv [output.pdf [width height]]',
19
+ mandatory = c('x', 'set'),
20
+ number = c('signif.thr'),
21
+ ignore = c('plot'),
22
+ o_desc = list(
23
+ x = 'A tab-delimited table of presence/absence (1/0) with species as rows and samples as columns.',
24
+ set = 'A list of sample names that constitute the test set, one per line',
25
+ ignore = 'A list of species to exclude from the analysis, one per line'
26
+ )
27
+ )
28
+
29
+ #= Set output files
30
+ opt$options[['x']] <- read.table(
31
+ opt$options[['x']],
32
+ header = TRUE,
33
+ row.names = 1,
34
+ sep = '\t'
35
+ )
36
+ opt$options[['set']] <- read.table(
37
+ opt$options[['set']],
38
+ header = FALSE,
39
+ sep = '\t',
40
+ as.is = TRUE
41
+ )[,1]
42
+ if(!is.null(opt$options[['ignore']]))
43
+ opt$options[['ignore']] <- read.table(
44
+ opt$options[['ignore']],
45
+ header = FALSE,
46
+ sep = '\t',
47
+ as.is = TRUE
48
+ )[,1]
49
+ if(length(opt$args) > 1) {
50
+ args <- as.list(opt$args[-1])
51
+ for(i in 2:3) if(length(args) >= i) args[[i]] <- as.numeric(args[[i]])
52
+ do.call('pdf', args)
53
+ } else {
54
+ opt$options[['plot']] <- FALSE
55
+ }
56
+
57
+ #= Run it!
58
+ y <- do.call('enve.prefscore', opt$options)
59
+ write.table(y, opt$args[1], quote = FALSE, sep = '\t', col.names = FALSE)
60
+ if(length(opt$args)>1) ttt <- dev.off()
@@ -236,8 +236,9 @@ Dir.mktmpdir do |dir|
236
236
  end
237
237
  end
238
238
  response = RestClient.post(
239
- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
240
- db:"nuccore",rettype:"fasta",id:protIds.join(","),idtype:"acc")
239
+ 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
240
+ db: 'protein', rettype: 'fasta', id: protIds.join(','), idtype: 'acc'
241
+ )
241
242
  abort "Unable to reach NCBI EUtils, error code " +
242
243
  response.code.to_s + "." unless response.code == 200
243
244
  fo.puts response.to_str
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # frozen_string_literal: true
4
+
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ require 'enveomics_rb/anir'
8
+ $VERSION = 1.0
9
+
10
+ o = {
11
+ q: false, threads: 2,
12
+ r_format: :fastq, g_format: :fasta, m_format: :sam, r_type: :single,
13
+ identity: 95.0, algorithm: :auto, bimodality: 0.5, bin_size: 1.0,
14
+ coefficient: :sarle
15
+ }
16
+
17
+ OptionParser.new do |opt|
18
+ cmd = File.basename($0)
19
+ opt.banner = <<~BANNER
20
+
21
+ [Enveomics Collection: #{cmd} v#{$VERSION}]
22
+
23
+ Estimates ANIr: the Average Nucleotide Identity of reads against a genome
24
+
25
+ Usage
26
+ # [ Input/output modes ]
27
+ # Run mapping and (optionally) save it as SAM
28
+ # Requires bowtie2
29
+ #{cmd} -r reads.fastq -g genome.fasta -m out_map.sam [options]
30
+
31
+ # Read mapping from BAM file
32
+ # Requires samtools
33
+ #{cmd} -m map.bam --m-format bam [options]
34
+
35
+ # Read mapping from other formats: SAM or Tabular BLAST
36
+ #{cmd} -m map.blast --m-format tab [options]
37
+
38
+ # Read a list of identities as percentage (contig filtering off)
39
+ #{cmd} -m identities.txt --m-format list [options]
40
+
41
+ # [ Identity threshold modes ]
42
+ #{cmd} -i 95 -a fix [options] # Set fixed identity threshold
43
+ #{cmd} -a gmm [options] # Find valley by EM of GMM
44
+ #{cmd} -a auto [options] # Pick method by bimodality (default)
45
+
46
+ BANNER
47
+
48
+ opt.separator 'Input/Output'
49
+ opt.on('-r', '--reads PATH', 'Metagenomic reads') { |v| o[:r] = v }
50
+ opt.on('-g', '--genome PATH', 'Genome assembly') { |v| o[:g] = v }
51
+ opt.on('-m', '--mapping PATH', 'Mapping file') { |v| o[:m] = v }
52
+ opt.on('-L', '--list PATH', 'Output file with identities') { |v| o[:L] = v }
53
+ opt.on('-H', '--hist PATH', 'Output file with histogram') { |v| o[:H] = v }
54
+ opt.on(
55
+ '-T', '--tab PATH', 'Output file with results in tabular format'
56
+ ) { |v| o[:T] = v }
57
+ opt.separator ''
58
+
59
+ opt.separator 'Formats'
60
+ opt.on(
61
+ '--r-format STRING',
62
+ 'Metagenomic reads format: fastq (default) or fasta',
63
+ 'Both options support compression with .gz file extension'
64
+ ) { |v| o[:r_format] = v.downcase.to_sym }
65
+ opt.on(
66
+ '--r-type STRING', 'Type of metagenomic reads:',
67
+ '~ single (default): Single reads',
68
+ '~ coupled: Coupled reads in separate files (-r must be comma-delimited)',
69
+ '~ interleaved: Coupled reads in a single interposed file'
70
+ ) { |v| o[:r_type] = v.downcase.to_sym }
71
+ opt.on(
72
+ '--g-format STRING',
73
+ 'Genome assembly format: fasta (default) or list',
74
+ 'Both options support compression with .gz file extension',
75
+ 'If passed in mapping-read mode, filters only matches to these contigs'
76
+ ) { |v| o[:g_format] = v.downcase.to_sym }
77
+ opt.on(
78
+ '--m-format STRING',
79
+ 'Mapping file format: sam (default), bam, tab, or list',
80
+ 'sam, tab, and list options support compression with .gz file extension'
81
+ ) { |v| o[:m_format] = v.downcase.to_sym }
82
+ opt.separator ''
83
+
84
+ opt.separator 'Identity threshold'
85
+ opt.on(
86
+ '-i', '--identity FLOAT', Float,
87
+ "Set a fixed threshold of percent identity (default: #{o[:identity]})"
88
+ ) { |v| o[:identity] = v }
89
+ opt.on(
90
+ '-a', '--algorithm STRING',
91
+ 'Set an algorithm to automatically detect identity threshold:',
92
+ '~ gmm: Valley detection by E-M of Gaussian Mixture Model',
93
+ '~ fix: Fixed threshold, see -i',
94
+ '~ auto (default): Pick gmm or fix depending on bimodality, see -b'
95
+ ) { |v| o[:algorithm] = v.downcase.to_sym }
96
+ opt.on(
97
+ '-b', '--bimodality FLOAT', Float,
98
+ 'Threshold of bimodality below which the algorithm is set to fix',
99
+ 'The coefficient used is the de Michele & Accatino (2014) B index',
100
+ "By default: #{o[:bimodality]}"
101
+ ) { |v| o[:bimodality] = v }
102
+ opt.on(
103
+ '--coefficient STRING',
104
+ 'Coefficient of bimodality for -a auto:',
105
+ '~ sarle (default): Sarle\'s bimodality coefficient b',
106
+ '~ dma: de Michele and Accatino (2014 PLoS ONE) B index, use with -b 0.1'
107
+ ) { |v| o[:coefficient] = v.downcase.to_sym }
108
+ opt.on(
109
+ '--bin-size FLOAT', Float,
110
+ "Width of histogram bins (in percent identity). By default: #{o[:bin_size]}"
111
+ ) { |v| o[:bin_size] = v }
112
+ opt.separator ''
113
+
114
+ opt.separator 'General'
115
+ opt.on(
116
+ '-t', '--threads INT', Integer, 'Threads to use'
117
+ ) { |v| o[:threads] = v }
118
+ opt.on('-l', '--log PATH', 'Log file to save output') { |v| o[:log] = v }
119
+ opt.on('-q', '--quiet', 'Run quietly') { |v| o[:q] = v }
120
+ opt.on('-h', '--help', 'Display this screen') do
121
+ puts opt
122
+ exit
123
+ end
124
+ opt.separator ''
125
+ end.parse!
126
+
127
+ anir = Enveomics::ANIr.new(o)
128
+ anir.go!
129
+ if o[:T]
130
+ File.open(o[:T], 'w') do |fh|
131
+ fh.puts "anir\tsd\treads\tid_threshold"
132
+ fh.puts [
133
+ anir.sample.mean, anir.sample.sd, anir.sample.n, anir.opts[:identity]
134
+ ].join("\t")
135
+ end
136
+ end
137
+
@@ -0,0 +1,293 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'enveomics_rb/stats'
4
+ require 'fileutils'
5
+ require 'shellwords'
6
+ require 'tmpdir'
7
+ require 'zlib'
8
+
9
+ module Enveomics
10
+ # Wrapper class for ANIr estimation
11
+ #
12
+ # Use as: +ANIr.new(opts).go!+
13
+ class ANIr
14
+ # Options hash
15
+ attr :opts
16
+
17
+ # Identities list (unsorted)
18
+ attr :identities
19
+
20
+ def initialize(opts)
21
+ @opts = opts
22
+ @identities = []
23
+ end
24
+
25
+ # --------------------------------------------------[ High-level pipelines ]
26
+
27
+ # Perform all the analyses
28
+ def go!
29
+ read_input
30
+ detect_identity
31
+ estimate_ani_r
32
+ end
33
+
34
+ # Identify input/output mode and read mapping
35
+ def read_input
36
+ if opts[:m_format] != :list
37
+ @tmpdir = Dir.mktmpdir
38
+ @filter_contigs = !opts[:g].nil?
39
+ opts[:m] = File.join(@tmpdir, 'map.sam') if opts[:m].nil?
40
+ run_mapping unless File.exist? opts[:m]
41
+ load_contigs_to_filter if @filter_contigs
42
+ end
43
+ read_mapping = :"read_mapping_from_#{opts[:m_format]}"
44
+ raise Enveomics::OptionError.new(
45
+ "Unsupported mapping format: #{opts[:m_format]}"
46
+ ) unless respond_to? read_mapping
47
+ @identities = []
48
+ send(read_mapping)
49
+ say "- Unfiltered average identity: #{sample.mean}"
50
+ say "- Reads mapped: #{sample.n}"
51
+ save_identities
52
+ save_histogram
53
+ ensure
54
+ @tmpdir ||= nil
55
+ FileUtils.rm_rf @tmpdir if @tmpdir
56
+ end
57
+
58
+ # Identify the identity threshold
59
+ def detect_identity
60
+ say 'Detecting identity threshold'
61
+ if opts[:algorithm] == :auto
62
+ say "- Bimodality: #{bimodality}"
63
+ opts[:algorithm] = bimodality >= opts[:bimodality] ? :gmm : :fix
64
+ end
65
+ say "- Algorithm: #{opts[:algorithm]}"
66
+ if opts[:algorithm] == :gmm
67
+ detect_identity_by_gmm
68
+ end
69
+ end
70
+
71
+ # Estimate ANIr
72
+ def estimate_ani_r
73
+ say 'Estimating ANIr'
74
+ @sample = nil # Empty cached sample
75
+ @identities.delete_if { |i| i < opts[:identity] }
76
+ say "- ANIr: #{sample.mean}"
77
+ end
78
+
79
+ # -----------------------------------------------------------------[ Utils ]
80
+
81
+ # Show progress unless +opts[:q]+
82
+ def say(*msg)
83
+ o = '[%s] %s' % [Time.now, msg.join('')]
84
+ $stderr.puts(o) unless opts[:q]
85
+ File.open(opts[:log], 'a') { |fh| fh.puts o } if opts[:log]
86
+ end
87
+
88
+ # Execute command in the shell
89
+ def run(cmd)
90
+ say " - Running: #{cmd.join(' ')}"
91
+ `#{cmd.shelljoin} 2>&1 | tee >> #{opts[:log] || '/dev/null'}`
92
+ unless $?.success?
93
+ raise Enveomics::CommandError.new("#{cmd.first} failed: #{$?}")
94
+ end
95
+ end
96
+
97
+ # Returns an open file handler for the file, supporting .gz
98
+ def reader(file)
99
+ file =~ /\.gz$/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
100
+ end
101
+
102
+ # Is the mapping in SAM format?
103
+ def sam?
104
+ opts[:m_format] == :sam
105
+ end
106
+
107
+ # ------------------------------------------------------------[ Map it out ]
108
+
109
+ # Execute Bowtie2 and generate SAM file
110
+ def run_mapping
111
+ say 'Running mapping using Bowtie2'
112
+ raise Enveomics::OptionError.new(
113
+ 'Only SAM output is supported for mapping'
114
+ ) unless sam?
115
+
116
+ @filter_contigs = false
117
+ say '- Indexing input sequences'
118
+ raise Enveomics::OptionError.new(
119
+ 'Only FastA genome input is supported for mapping'
120
+ ) unless opts[:g_format] == :fasta
121
+
122
+ idx = File.join(@tmpdir, 'genome.idx')
123
+ run(['bowtie2-build', opts[:g], idx])
124
+
125
+ say '- Mapping metagenomic reads to genome assembly'
126
+ cmd = [
127
+ 'bowtie2', '-x', idx, '-p', opts[:threads], '-S', opts[:m], '--no-mixed'
128
+ ]
129
+ cmd << '-f' if opts[:r_format] == :fasta
130
+ cmd +=
131
+ case opts[:r_type]
132
+ when :single
133
+ ['-U', opts[:r]]
134
+ when :coupled
135
+ pairs = opts[:r].split(',', 2)
136
+ ['-1', pairs[0], '-2', pairs[1], '--no-discordant']
137
+ when :interleaved
138
+ ['--interleaved', opts[:r], '--no-discordant']
139
+ else
140
+ raise Enveomics::OptionError.new(
141
+ "Unsupported reads type: #{opts[:r_type]}"
142
+ )
143
+ end
144
+ run(cmd)
145
+ end
146
+
147
+ # If +@filter_contigs+ is true, reads the genome assembly and saves contig
148
+ # names to filter the mapping
149
+ def load_contigs_to_filter
150
+ return unless @filter_contigs
151
+ say 'Loading contigs to filter'
152
+ reader = reader(opts[:g])
153
+ @contigs_to_filter =
154
+ case opts[:g_format]
155
+ when :fasta
156
+ reader.each.map { |ln| $1 if ln =~ /^>(\S+)/ }.compact
157
+ when :list
158
+ reader.each.map(&:chomp)
159
+ else
160
+ raise Enveomics::OptionError.new(
161
+ "Unsupported genome assembly format: #{opts[:g_format]}"
162
+ )
163
+ end
164
+ reader.close
165
+ say "- Got #{@contigs_to_filter.size} contigs"
166
+ end
167
+
168
+ # Reads the mapping file assuming SAM format
169
+ def read_mapping_from_sam
170
+ say 'Reading mapping from SAM file'
171
+ reader = reader(opts[:m])
172
+ reader.each { |ln| parse_sam_line(ln) }
173
+ reader.close
174
+ end
175
+
176
+ # Reads the mapping file assuming BAM format
177
+ def read_mapping_from_bam
178
+ say 'Reading mapping from BAM file'
179
+ IO.popen(['samtools', 'view', opts[:m]].shelljoin) do |fh|
180
+ fh.each { |ln| parse_sam_line(ln) }
181
+ end
182
+ end
183
+
184
+ # Reads the mapping file assuming a Tabular BLAST format
185
+ def read_mapping_from_tab
186
+ say 'Reading mapping from Tabular BLAST file'
187
+ reader = reader(opts[:m])
188
+ reader.each do |ln|
189
+ next if ln =~ /^\s*(#.*)?$/ # Comment or empty line
190
+ row = ln.chomp.split("\t")
191
+ next if @filter_contigs && !@contigs_to_filter.include?(row[1])
192
+ @identities << row[2].to_f
193
+ end
194
+ reader.close
195
+ end
196
+
197
+ # Reads the identities from a raw-text list
198
+ def read_mapping_from_list
199
+ say 'Reading identities from raw text list'
200
+ reader = reader(opts[:m])
201
+ @identities = reader.each.map(&:to_f)
202
+ reader.close
203
+ end
204
+
205
+ # Parses one line in SAM format
206
+ def parse_sam_line(ln)
207
+ return if ln =~ /^@/ || ln =~ /^\s*$/
208
+ row = ln.chomp.split("\t")
209
+ return if row[2] == '*'
210
+ return if @filter_contigs && !@contigs_to_filter.include?(row[2])
211
+ length = row[9].size
212
+ row.shift(11) # Discard non-flag columns
213
+ flags = Hash[row.map { |i| i.sub(/:.:/, ':').split(':', 2) }]
214
+ return if flags['YT'] && !%w[CP UU].include?(flags['YT'])
215
+ unless flags['MD']
216
+ raise Enveomics::ParseError.new(
217
+ "SAM line missing MD flag:\n#{ln}\nFlags: #{flags}"
218
+ )
219
+ end
220
+ mismatches = flags['MD'].scan(/[^\d]/).count
221
+ @identities << 100.0 * (length - mismatches) / length
222
+ end
223
+
224
+ # Save identities as raw text
225
+ def save_identities
226
+ return unless opts[:L]
227
+ say '- Saving identities'
228
+ File.open(opts[:L], 'w') do |fh|
229
+ identities.each { |i| fh.puts i }
230
+ end
231
+ end
232
+
233
+ # Save identity histogram as raw text
234
+ def save_histogram
235
+ return unless opts[:H]
236
+ say '- Saving histogram'
237
+ File.open(opts[:H], 'w') do |fh|
238
+ fh.puts "from\tto\tcount"
239
+ sample.histo_ranges.each_with_index do |r, k|
240
+ fh.puts (r + [sample.histo_counts[k]]).join("\t")
241
+ end
242
+ end
243
+ end
244
+
245
+ # -----------------------------------------------------------[ Peak finder ]
246
+
247
+ # Detect identity threshold by gaussian mixture model EM
248
+ def detect_identity_by_gmm
249
+ model_identities_by_gmm_em
250
+ detect_valley_by_gmm
251
+ end
252
+
253
+ # Model identities as a 2-gaussian mix by EM
254
+ def model_identities_by_gmm_em
255
+ say 'Modeling identities by gaussian mixture model using EM'
256
+ # TODO: Implement
257
+ raise Enveomics::UnimplementedError.new('Unimplemented operation')
258
+ end
259
+
260
+ # Detect valley by gaussian mix
261
+ def detect_valley_by_gmm
262
+ say 'Detecting valley by gaussian mixture model'
263
+ # TODO: Implement
264
+ raise Enveomics::UnimplementedError.new('Unimplemented operation')
265
+ end
266
+
267
+ # -----------------------------------------------------------[ Do the math ]
268
+
269
+ # Identities as a Enveomics::Stats::Sample object
270
+ def sample
271
+ @sample ||= Enveomics::Stats::Sample.new(
272
+ identities,
273
+ effective_range: [nil, 100.0],
274
+ histo_bin_size: opts[:bin_size]
275
+ )
276
+ end
277
+
278
+ # Returns the bimodality coefficient indicated by +opts[:coefficient]+
279
+ def bimodality
280
+ @bimodality ||=
281
+ case opts[:coefficient]
282
+ when :sarle
283
+ sample.sarle_bimodality
284
+ when :dma
285
+ sample.dma_bimodality
286
+ else
287
+ raise Enveomics::OptionError.new(
288
+ "Unsupported coefficient of bimodality: #{opts[:coefficient]}"
289
+ )
290
+ end
291
+ end
292
+ end
293
+ end