miga-base 0.7.26.2 → 1.0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/classify_wf.rb +2 -2
  7. data/lib/miga/cli/action/derep_wf.rb +1 -1
  8. data/lib/miga/cli/action/doctor.rb +57 -14
  9. data/lib/miga/cli/action/doctor/base.rb +47 -23
  10. data/lib/miga/cli/action/env.rb +26 -0
  11. data/lib/miga/cli/action/init.rb +11 -7
  12. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  13. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  14. data/lib/miga/cli/action/tax_dist.rb +2 -2
  15. data/lib/miga/cli/action/wf.rb +5 -4
  16. data/lib/miga/cli/base.rb +1 -0
  17. data/lib/miga/common.rb +1 -0
  18. data/lib/miga/daemon.rb +11 -4
  19. data/lib/miga/dataset/result.rb +10 -6
  20. data/lib/miga/json.rb +5 -4
  21. data/lib/miga/metadata.rb +5 -1
  22. data/lib/miga/parallel.rb +36 -0
  23. data/lib/miga/project.rb +8 -8
  24. data/lib/miga/project/base.rb +4 -4
  25. data/lib/miga/project/result.rb +2 -2
  26. data/lib/miga/sqlite.rb +10 -2
  27. data/lib/miga/version.rb +23 -9
  28. data/scripts/aai_distances.bash +16 -18
  29. data/scripts/ani_distances.bash +16 -17
  30. data/scripts/assembly.bash +31 -16
  31. data/scripts/haai_distances.bash +3 -27
  32. data/scripts/miga.bash +12 -8
  33. data/scripts/p.bash +1 -1
  34. data/scripts/read_quality.bash +9 -18
  35. data/scripts/trimmed_fasta.bash +14 -30
  36. data/scripts/trimmed_reads.bash +36 -36
  37. data/test/parallel_test.rb +31 -0
  38. data/test/project_test.rb +2 -1
  39. data/test/remote_dataset_test.rb +1 -1
  40. data/utils/distance/commands.rb +1 -0
  41. data/utils/distance/database.rb +0 -1
  42. data/utils/distance/runner.rb +2 -4
  43. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  44. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  45. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  46. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  47. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  48. data/utils/enveomics/Manifest/categories.json +13 -4
  49. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  50. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  51. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  52. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  53. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  54. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  55. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  56. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  57. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  58. data/utils/enveomics/Scripts/aai.rb +3 -2
  59. data/utils/enveomics/Scripts/anir.rb +137 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  64. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  65. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  66. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  67. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  68. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  69. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  70. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  71. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  72. data/utils/enveomics/Scripts/rbm.rb +87 -133
  73. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  74. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  75. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  76. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  77. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  78. data/utils/enveomics/enveomics.R/README.md +1 -0
  79. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  80. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  81. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  82. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  83. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  84. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  85. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  86. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  88. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  89. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  90. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  100. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  101. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  102. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  103. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  104. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  105. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  106. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  107. data/utils/multitrim/README.md +67 -0
  108. data/utils/multitrim/multitrim.py +1555 -0
  109. data/utils/multitrim/multitrim.yml +13 -0
  110. data/utils/requirements.txt +4 -3
  111. data/utils/subclade/pipeline.rb +2 -2
  112. metadata +33 -4
  113. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $VERSION = 1.2
4
+ $:.push File.expand_path('../lib', __FILE__)
5
+ require 'enveomics_rb/enveomics'
6
+
7
+ o = { q: false, offset: 33, qual: 15, fasta: false }
8
+ OptionParser.new do |opts|
9
+ opts.version = $VERSION
10
+ Enveomics.opt_banner(
11
+ opts, 'Masks low-quality bases in a FastQ file',
12
+ "#{File.basename($0)} -i in.fastq -o out.fastq [options]"
13
+ )
14
+
15
+ opts.separator 'Mandatory'
16
+ opts.on(
17
+ '-i', '--input FILE',
18
+ 'Path to the FastQ file containing the sequences',
19
+ 'Supports compression with .gz extension, use - for STDIN'
20
+ ) { |v| o[:in] = v }
21
+ opts.on(
22
+ '-o', '--out FILE',
23
+ 'Path to the output FastQ file',
24
+ 'Supports compression with .gz extension, use - for STDOUT'
25
+ ) { |v| o[:out] = v }
26
+
27
+ opts.separator ''
28
+ opts.separator 'Quality Options'
29
+ opts.on(
30
+ '-q', '--qual INT', Integer,
31
+ "Minimum quality score to allow a base, by default: #{o[:qual]}"
32
+ ) { |v| o[:qual] = v }
33
+ opts.on(
34
+ '--offset INT', Integer,
35
+ "Q-score offset, by default: #{o[:offset]}"
36
+ ) { |v| o[:offset] = v }
37
+
38
+ opts.separator ''
39
+ opts.separator 'Other Options'
40
+ opts.on(
41
+ '-a', '--fasta', 'Output sequences in FastA format'
42
+ ) { |v| o[:fasta] = v }
43
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
44
+ opts.on('-h', '--help', 'Display this screen') do
45
+ puts opts
46
+ exit
47
+ end
48
+ opts.separator ''
49
+ end.parse!
50
+
51
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
52
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
53
+ $QUIET = o[:q]
54
+
55
+ # Open in/out files
56
+ say 'Reading FastQ file'
57
+ ifh = reader(o[:in])
58
+ ofh = writer(o[:out])
59
+
60
+ # Parse and mask
61
+ entry = []
62
+ lno = 0
63
+ ifh.each_line do |ln|
64
+ lno += 1 # <- Gzip doesn't support $.
65
+ case lno % 4
66
+ when 1
67
+ ln =~ /^@(\S+)/ or
68
+ raise Enveomics::ParseError.new("Unexpected defline format: #{ln}")
69
+ entry << ln
70
+ when 2, 3
71
+ entry << ln
72
+ when 0
73
+ entry << ln
74
+ q = entry[3].chomp.split('').map { |i| (i.ord - o[:offset]) }
75
+ q.map { |i| i < o[:qual] }.each_with_index { |i, k| entry[1][k] = 'N' if i }
76
+ ofh.puts(o[:fasta] ? [entry[0].gsub(/^@/, '>'), entry[1]] : entry)
77
+ entry = []
78
+ end
79
+ end
80
+
81
+ # Finalize
82
+ say " Lines: #{lno}"
83
+ unless entry.empty?
84
+ raise Enveomics::ParseError.new('Unexpected trailing lines in FastQ')
85
+ end
86
+ say " Sequences: #{lno / 4}"
87
+ ifh.close
88
+ ofh.close
89
+
@@ -1,63 +1,70 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- #
4
- # @author: Luis M. Rodriguez-R
5
- # @update: Feb-06-2015
6
- # @license: artistic license 2.0
7
- #
3
+ # frozen_string_literal: true
8
4
 
9
- require 'optparse'
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ $VERSION = 1.1
10
8
 
11
- o = {:q=>FALSE, :p=>"", :s=>""}
12
- ARGV << '-h' if ARGV.size==0
9
+ o = { q: false, p: '', s: '' }
13
10
  OptionParser.new do |opts|
14
- opts.banner = "
15
- Generates easy-to-parse tagged reads from FastQ files.
11
+ opts.version = $VERSION
12
+ Enveomics.opt_banner(
13
+ opts, 'Generates easy-to-parse tagged reads from FastQ files',
14
+ "#{File.basename($0)} -i in.fasta -o out.fasta [options]"
15
+ )
16
16
 
17
- Usage: #{$0} [options]"
18
- opts.separator ""
19
- opts.separator "Mandatory"
20
- opts.on("-i", "--in FILE", "Path to the FastQ file containing the sequences."){ |v| o[:in] = v }
21
- opts.on("-o", "--out FILE", "Path to the FastQ to create."){ |v| o[:out] = v }
22
- opts.separator ""
23
- opts.separator "ID options"
24
- opts.on("-p", "--prefix STR", "Prefix to use in all IDs."){ |v| o[:p] = v }
25
- opts.on("-s", "--suffix STR", "Suffix to use in all IDs."){ |v| o[:s] = v }
26
- opts.separator ""
27
- opts.separator "Other Options"
28
- opts.on("-q", "--quiet", "Run quietly (no STDERR output)"){ o[:q] = TRUE }
29
- opts.on("-h", "--help", "Display this screen") do
30
- puts opts
31
- exit
32
- end
33
- opts.separator ""
17
+ opts.separator 'Mandatory'
18
+ opts.on(
19
+ '-i', '--in FILE',
20
+ 'Path to the FastQ file containing the sequences',
21
+ 'Supports compression with .gz extension, use - for STDIN'
22
+ ) { |v| o[:in] = v }
23
+ opts.on(
24
+ '-o', '--out FILE', 'Path to the FastQ to create',
25
+ 'Supports compression with .gz extension, use - for STDOUT'
26
+ ) { |v| o[:out] = v }
27
+ opts.separator ''
28
+ opts.separator 'ID options'
29
+ opts.on('-p', '--prefix STR', 'Prefix to use in all IDs') { |v| o[:p] = v }
30
+ opts.on('-s', '--suffix STR', 'Suffix to use in all IDs') { |v| o[:s] = v }
31
+ opts.separator ''
32
+ opts.separator 'Other Options'
33
+ opts.on('-q', '--quiet', 'Run quietly (no STDERR output)') { o[:q] = true }
34
+ opts.on('-h', '--help', 'Display this screen') { puts opts ; exit }
35
+ opts.separator ''
34
36
  end.parse!
35
- abort "-i is mandatory" if o[:in].nil?
36
- abort "-o is mandatory" if o[:out].nil?
37
-
37
+
38
+ raise Enveomics::OptionError.new('-i is mandatory') if o[:in].nil?
39
+ raise Enveomics::OptionError.new('-o is mandatory') if o[:out].nil?
40
+
38
41
  begin
39
- ifh = File.open(o[:in], 'r');
40
- ofh = File.open(o[:out], 'w');
41
- i=0
42
- while ln=ifh.gets
43
- ln.chomp!
44
- if $.%4==1 and not /^@/.match(ln).nil?
45
- i+=1
46
- ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
47
- elsif $.%4==2 or $.%4==0
48
- ofh.puts ln
49
- elsif $.%4==3 and not /^\+/.match(ln).nil?
50
- ofh.puts "+"
51
- else
52
- abort "Impossible to parse line #{$.}: #{ln}.\n"
53
- end
54
- end
55
- ifh.close
56
- ofh.close
42
+ ifh = reader(o[:in])
43
+ ofh = writer(o[:out])
44
+ i = 0
45
+ lno = 0
46
+ ifh.each do |ln|
47
+ ln.chomp!
48
+ lno += 1
49
+ case lno % 4
50
+ when 1
51
+ ln =~ /^@/ or
52
+ raise Enveomics::ParseError.new("Cannot parse line #{$.}: #{ln}")
53
+ i += 1
54
+ ofh.puts "@#{o[:p]}#{i}#{o[:s]}"
55
+ when 3
56
+ ln =~ /^\+/ or
57
+ raise Enveomics::ParseError.new("Cannot parse line #{$.}: #{ln}")
58
+ ofh.puts '+'
59
+ else
60
+ ofh.puts ln
61
+ end
62
+ end
63
+ ifh.close
64
+ ofh.close
57
65
  rescue => err
58
- $stderr.puts "Exception: #{err}\n\n"
59
- err.backtrace.each { |l| $stderr.puts l + "\n" }
60
- err
66
+ $stderr.puts "Exception: #{err}\n\n"
67
+ err.backtrace.each { |l| $stderr.puts l + "\n" }
68
+ err
61
69
  end
62
70
 
63
-
@@ -1,21 +1,19 @@
1
1
  #!/bin/bash
2
2
 
3
- #
4
- # @author Luis M. Rodriguez-R
5
- # @license artistic license 2.0
6
- #
7
-
8
- DATA_LINK="https://www.ebi.ac.uk/ena/data/warehouse/filereport"
3
+ DATA_LINK="https://www.ebi.ac.uk/ena/portal/api/filereport"
9
4
  DATA_OPS="result=read_run&fields=run_accession,fastq_ftp,fastq_md5"
10
5
  SRX=$1
11
6
  DIR=${2:-$SRX}
7
+ VERSION=1.0
12
8
 
13
9
  if [[ "$SRX" == "" ]] ; then
14
10
  echo "
11
+ [Enveomics Collection: $(basename "$0" .bash) $VERSION]
12
+
15
13
  Downloads the set of runs from a project, sample, or experiment in SRA.
16
14
 
17
15
  Usage:
18
- $0 <SRA-ID>[ <dir>]
16
+ $(basename "$0") <SRA-ID>[ <dir>]
19
17
 
20
18
  <SRA-ID> ID of the SRA Project, Sample, or Experiment.
21
19
  <dir> Directory where the files are to be downladed. By default,
@@ -34,7 +32,7 @@ function md5value {
34
32
  echo "$o"
35
33
  }
36
34
 
37
- curl -s "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
35
+ curl -Ls "$DATA_LINK?$DATA_OPS&accession=$SRX" -o "$DIR/srr_list.txt"
38
36
  tail -n +2 "$DIR/srr_list.txt" | while read ln ; do
39
37
  srr=$(echo "$ln"|cut -f 1)
40
38
  ftp=$(echo "$ln"|cut -f 2)
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env Rscript
2
+
3
+ #= Load stuff
4
+ args <- commandArgs(trailingOnly = FALSE)
5
+ enveomics_R <- file.path(
6
+ dirname(sub('^--file=', '', args[grep('^--file=', args)])),
7
+ 'lib',
8
+ 'enveomics.R'
9
+ )
10
+ for(file in c('cliopts.R','utils.R','prefscore.R'))
11
+ source(file.path(enveomics_R, 'R', file))
12
+
13
+ #= Generate interface
14
+ opt <- enve.cliopts(
15
+ enve.prefscore,
16
+ file.path(enveomics_R, 'man', 'enve.prefscore.Rd'),
17
+ positional_arguments = c(1, 4),
18
+ usage = 'usage: %prog [options] output.tsv [output.pdf [width height]]',
19
+ mandatory = c('x', 'set'),
20
+ number = c('signif.thr'),
21
+ ignore = c('plot'),
22
+ o_desc = list(
23
+ x = 'A tab-delimited table of presence/absence (1/0) with species as rows and samples as columns.',
24
+ set = 'A list of sample names that constitute the test set, one per line',
25
+ ignore = 'A list of species to exclude from the analysis, one per line'
26
+ )
27
+ )
28
+
29
+ #= Read input tables and set up the output device
30
+ opt$options[['x']] <- read.table(
31
+ opt$options[['x']],
32
+ header = TRUE,
33
+ row.names = 1,
34
+ sep = '\t'
35
+ )
36
+ opt$options[['set']] <- read.table(
37
+ opt$options[['set']],
38
+ header = FALSE,
39
+ sep = '\t',
40
+ as.is = TRUE
41
+ )[,1]
42
+ if(!is.null(opt$options[['ignore']]))
43
+ opt$options[['ignore']] <- read.table(
44
+ opt$options[['ignore']],
45
+ header = FALSE,
46
+ sep = '\t',
47
+ as.is = TRUE
48
+ )[,1]
49
+ if(length(opt$args) > 1) {
50
+ args <- as.list(opt$args[-1])
51
+ for(i in 2:3) if(length(args) >= i) args[[i]] <- as.numeric(args[[i]])
52
+ do.call('pdf', args)
53
+ } else {
54
+ opt$options[['plot']] <- FALSE
55
+ }
56
+
57
+ #= Run it!
58
+ y <- do.call('enve.prefscore', opt$options)
59
+ write.table(y, opt$args[1], quote = FALSE, sep = '\t', col.names = FALSE)
60
+ if(length(opt$args)>1) ttt <- dev.off()
@@ -236,8 +236,9 @@ Dir.mktmpdir do |dir|
236
236
  end
237
237
  end
238
238
  response = RestClient.post(
239
- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
240
- db:"nuccore",rettype:"fasta",id:protIds.join(","),idtype:"acc")
239
+ 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
240
+ db: 'protein', rettype: 'fasta', id: protIds.join(','), idtype: 'acc'
241
+ )
241
242
  abort "Unable to reach NCBI EUtils, error code " +
242
243
  response.code.to_s + "." unless response.code == 200
243
244
  fo.puts response.to_str
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # frozen_string_literal: true
4
+
5
+ $:.push File.expand_path('../lib', __FILE__)
6
+ require 'enveomics_rb/enveomics'
7
+ require 'enveomics_rb/anir'
8
+ $VERSION = 1.0
9
+
10
+ o = {
11
+ q: false, threads: 2,
12
+ r_format: :fastq, g_format: :fasta, m_format: :sam, r_type: :single,
13
+ identity: 95.0, algorithm: :auto, bimodality: 0.5, bin_size: 1.0,
14
+ coefficient: :sarle
15
+ }
16
+
17
+ OptionParser.new do |opt|
18
+ cmd = File.basename($0)
19
+ opt.banner = <<~BANNER
20
+
21
+ [Enveomics Collection: #{cmd} v#{$VERSION}]
22
+
23
+ Estimates ANIr: the Average Nucleotide Identity of reads against a genome
24
+
25
+ Usage
26
+ # [ Input/output modes ]
27
+ # Run mapping and (optionally) save it as SAM
28
+ # Requires bowtie2
29
+ #{cmd} -r reads.fastq -g genome.fasta -m out_map.sam [options]
30
+
31
+ # Read mapping from BAM file
32
+ # Requires samtools
33
+ #{cmd} -m map.bam --m-format bam [options]
34
+
35
+ # Read mapping from other formats: SAM or Tabular BLAST
36
+ #{cmd} -m map.blast --m-format tab [options]
37
+
38
+ # Read a list of identities as percentage (contig filtering off)
39
+ #{cmd} -m identities.txt --m-format list [options]
40
+
41
+ # [ Identity threshold modes ]
42
+ #{cmd} -i 95 -a fix [options] # Set fixed identity threshold
43
+ #{cmd} -a gmm [options] # Find valley by EM of GMM
44
+ #{cmd} -a auto [options] # Pick method by bimodality (default)
45
+
46
+ BANNER
47
+
48
+ opt.separator 'Input/Output'
49
+ opt.on('-r', '--reads PATH', 'Metagenomic reads') { |v| o[:r] = v }
50
+ opt.on('-g', '--genome PATH', 'Genome assembly') { |v| o[:g] = v }
51
+ opt.on('-m', '--mapping PATH', 'Mapping file') { |v| o[:m] = v }
52
+ opt.on('-L', '--list PATH', 'Output file with identities') { |v| o[:L] = v }
53
+ opt.on('-H', '--hist PATH', 'Output file with histogram') { |v| o[:H] = v }
54
+ opt.on(
55
+ '-T', '--tab PATH', 'Output file with results in tabular format'
56
+ ) { |v| o[:T] = v }
57
+ opt.separator ''
58
+
59
+ opt.separator 'Formats'
60
+ opt.on(
61
+ '--r-format STRING',
62
+ 'Metagenomic reads format: fastq (default) or fasta',
63
+ 'Both options support compression with .gz file extension'
64
+ ) { |v| o[:r_format] = v.downcase.to_sym }
65
+ opt.on(
66
+ '--r-type STRING', 'Type of metagenomic reads:',
67
+ '~ single (default): Single reads',
68
+ '~ coupled: Coupled reads in separate files (-m must be comma-delimited)',
69
+ '~ interleaved: Coupled reads in a single interposed file'
70
+ ) { |v| o[:r_type] = v.downcase.to_sym }
71
+ opt.on(
72
+ '--g-format STRING',
73
+ 'Genome assembly format: fasta (default) or list',
74
+ 'Both options support compression with .gz file extension',
75
+ 'If passed in mapping-read mode, filters only matches to these contigs'
76
+ ) { |v| o[:g_format] = v.downcase.to_sym }
77
+ opt.on(
78
+ '--m-format STRING',
79
+ 'Mapping file format: sam (default), bam, tab, or list',
80
+ 'sam, tab, and list options support compression with .gz file extension'
81
+ ) { |v| o[:m_format] = v.downcase.to_sym }
82
+ opt.separator ''
83
+
84
+ opt.separator 'Identity threshold'
85
+ opt.on(
86
+ '-i', '--identity FLOAT', Float,
87
+ "Set a fixed threshold of percent identity (default: #{o[:identity]})"
88
+ ) { |v| o[:identity] = v }
89
+ opt.on(
90
+ '-a', '--algorithm STRING',
91
+ 'Set an algorithm to automatically detect identity threshold:',
92
+ '~ gmm: Valley detection by E-M of Gaussian Mixture Model',
93
+ '~ fix: Fixed threshold, see -i',
94
+ '~ auto (default): Pick gmm or fix depending on bimodality, see -b'
95
+ ) { |v| o[:algorithm] = v.downcase.to_sym }
96
+ opt.on(
97
+ '-b', '--bimodality FLOAT', Float,
98
+ 'Threshold of bimodality below which the algorithm is set to fix',
99
+ 'The coefficient used is the de Michele & Accantino (2014) B index',
100
+ "By default: #{o[:bimodality]}"
101
+ ) { |v| o[:bimodality] = v }
102
+ opt.on(
103
+ '--coefficient STRING',
104
+ 'Coefficient of bimodality for -a auto:',
105
+ '~ sarle (default): Sarle\'s bimodality coefficient b',
106
+ '~ dma: de Michele and Accatino (2014 PLoS ONE) B index, use with -b 0.1'
107
+ ) { |v| o[:coefficient] = v.downcase.to_sym }
108
+ opt.on(
109
+ '--bin-size FLOAT', Float,
110
+ "Width of histogram bins (in percent identity). By default: #{o[:bin_size]}"
111
+ ) { |v| o[:bin_size] = v }
112
+ opt.separator ''
113
+
114
+ opt.separator 'General'
115
+ opt.on(
116
+ '-t', '--threads INT', Integer, 'Threads to use'
117
+ ) { |v| o[:threads] = v }
118
+ opt.on('-l', '--log PATH', 'Log file to save output') { |v| o[:log] = v }
119
+ opt.on('-q', '--quiet', 'Run quietly') { |v| o[:q] = v }
120
+ opt.on('-h', '--help', 'Display this screen') do
121
+ puts opt
122
+ exit
123
+ end
124
+ opt.separator ''
125
+ end.parse!
126
+
127
+ anir = Enveomics::ANIr.new(o)
128
+ anir.go!
129
+ if o[:T]
130
+ File.open(o[:T], 'w') do |fh|
131
+ fh.puts "anir\tsd\treads\tid_threshold"
132
+ fh.puts [
133
+ anir.sample.mean, anir.sample.sd, anir.sample.n, anir.opts[:identity]
134
+ ].join("\t")
135
+ end
136
+ end
137
+