miga-base 0.7.26.3 → 1.0.0.sr1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/doctor.rb +50 -19
  7. data/lib/miga/cli/action/doctor/base.rb +20 -18
  8. data/lib/miga/cli/action/init.rb +11 -7
  9. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  10. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  11. data/lib/miga/cli/action/tax_dist.rb +2 -2
  12. data/lib/miga/cli/action/wf.rb +5 -4
  13. data/lib/miga/daemon.rb +11 -4
  14. data/lib/miga/dataset/result.rb +10 -6
  15. data/lib/miga/json.rb +1 -2
  16. data/lib/miga/metadata.rb +5 -1
  17. data/lib/miga/parallel.rb +11 -6
  18. data/lib/miga/project.rb +8 -8
  19. data/lib/miga/project/base.rb +4 -4
  20. data/lib/miga/project/result.rb +2 -2
  21. data/lib/miga/sqlite.rb +7 -0
  22. data/lib/miga/version.rb +23 -9
  23. data/scripts/aai_distances.bash +16 -18
  24. data/scripts/ani_distances.bash +16 -17
  25. data/scripts/assembly.bash +31 -16
  26. data/scripts/haai_distances.bash +3 -27
  27. data/scripts/miga.bash +6 -4
  28. data/scripts/p.bash +1 -1
  29. data/scripts/read_quality.bash +9 -18
  30. data/scripts/trimmed_fasta.bash +14 -30
  31. data/scripts/trimmed_reads.bash +36 -36
  32. data/test/parallel_test.rb +31 -0
  33. data/test/project_test.rb +2 -1
  34. data/utils/distance/commands.rb +1 -0
  35. data/utils/distance/runner.rb +2 -4
  36. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  37. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  38. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  39. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  41. data/utils/enveomics/Manifest/categories.json +13 -4
  42. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  43. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  44. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  45. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  46. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  47. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  48. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  49. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  50. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  51. data/utils/enveomics/Scripts/aai.rb +3 -2
  52. data/utils/enveomics/Scripts/anir.rb +137 -0
  53. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  54. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  55. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  56. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  57. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  58. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  59. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  64. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  65. data/utils/enveomics/Scripts/rbm.rb +87 -133
  66. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  67. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  68. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  69. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  70. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  71. data/utils/enveomics/enveomics.R/README.md +1 -0
  72. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  73. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  74. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  75. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  76. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  77. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  78. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  79. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  80. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  81. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  82. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  83. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  84. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  93. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  94. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  95. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  96. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  97. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  98. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  99. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  100. data/utils/multitrim/README.md +67 -0
  101. data/utils/multitrim/multitrim.py +1555 -0
  102. data/utils/multitrim/multitrim.yml +13 -0
  103. data/utils/requirements.txt +4 -3
  104. metadata +33 -6
  105. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
@@ -55,12 +55,12 @@ module MiGA::Project::Result
55
55
  ##
56
56
  # Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
57
57
  def add_result_distances(base, _opts)
58
- return nil unless result_files_exist?(base, %w[.Rdata .log .txt])
58
+ return nil unless result_files_exist?(base, %w[.Rdata .txt])
59
59
 
60
60
  r = MiGA::Result.new("#{base}.json")
61
61
  r.add_file(:rdata, 'miga-project.Rdata')
62
62
  r.add_file(:matrix, 'miga-project.txt')
63
- r.add_file(:log, 'miga-project.log')
63
+ r.add_file(:log, 'miga-project.log') # Legacy file
64
64
  r.add_file(:hist, 'miga-project.hist')
65
65
  r
66
66
  end
data/lib/miga/sqlite.rb CHANGED
@@ -37,6 +37,7 @@ class MiGA::SQLite < MiGA::MiGA
37
37
  # Executes +cmd+ and returns the result
38
38
  def run(*cmd)
39
39
  busy_attempts ||= 0
40
+ io_attempts ||= 0
40
41
  y = nil
41
42
  SQLite3::Database.new(path) { |conn| y = conn.execute(*cmd) }
42
43
  y
@@ -44,6 +45,12 @@ class MiGA::SQLite < MiGA::MiGA
44
45
  busy_attempts += 1
45
46
  raise "Database busy #{path}: #{e.message}" if busy_attempts >= 3
46
47
 
48
+ sleep(1)
49
+ retry
50
+ rescue SQLite3::IOException => e
51
+ io_attempts += 1
52
+ raise "Database I/O error #{path}: #{e.message}" if io_attempts >= 3
53
+
47
54
  sleep(1)
48
55
  retry
49
56
  end
data/lib/miga/version.rb CHANGED
@@ -9,23 +9,33 @@ module MiGA
9
9
  # Current version of MiGA. An Array with three values:
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
- # - Integer representing minor changes that require new version number.
13
- VERSION = [0.7, 26, 3].freeze
12
+ # - String indicating release status:
13
+ # - rc* release candidate, not released as gem
14
+ # - sr* stable release, released as gem
15
+ VERSION = [1.0, 0, 'sr1'].freeze
14
16
 
15
17
  ##
16
18
  # Nickname for the current major.minor version.
17
- VERSION_NAME = 'lithograph'
19
+ VERSION_NAME = 'prima'
18
20
 
19
21
  ##
20
22
  # Date of the current gem release.
21
- VERSION_DATE = Date.new(2021, 3, 11)
23
+ VERSION_DATE = Date.new(2021, 4, 12)
22
24
 
23
25
  ##
24
- # Reference of MiGA.
25
- CITATION = 'Rodriguez-R et al (2018). ' \
26
- 'The Microbial Genomes Atlas (MiGA) webserver: taxonomic and gene ' \
27
- 'diversity analysis of Archaea and Bacteria at the whole genome level. ' \
28
- 'Nucleic Acids Research 46(W1):W282-W288. doi:10.1093/nar/gky467.'
26
+ # References of MiGA
27
+ CITATION = []
28
+ CITATION << <<~REF
29
+ Rodriguez-R et al (2018). The Microbial Genomes Atlas (MiGA) webserver:
30
+ taxonomic and gene diversity analysis of Archaea and Bacteria at the whole
31
+ genome level. Nucleic Acids Research 46(W1):W282-W288.
32
+ doi:10.1093/nar/gky467.
33
+ REF
34
+ CITATION << <<~REF
35
+ Rodriguez-R et al (2020). Classifying prokaryotic genomes using the
36
+ Microbial Genomes Atlas (MiGA) webserver. Bergey's Manual of Systematics
37
+ of Archaea and Bacteria.
38
+ REF
29
39
  end
30
40
 
31
41
  class MiGA::MiGA
@@ -58,6 +68,10 @@ class MiGA::MiGA
58
68
  ##
59
69
  # Reference of MiGA
60
70
  def self.CITATION
71
+ CITATION.map { |i| "- #{i}" }.join
72
+ end
73
+
74
+ def self.CITATION_ARRAY
61
75
  CITATION
62
76
  end
63
77
  end
@@ -9,34 +9,32 @@ DIR="$PROJECT/data/09.distances/02.aai"
9
9
  # Initialize
10
10
  miga_start_project_step "$DIR"
11
11
 
12
- echo -n "" > miga-project.log
13
- DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
14
-
15
12
  # Extract values
16
13
  rm -f miga-project.txt
14
+ SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
15
+ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
17
16
  (
18
- echo "metric a b value sd n omega" | tr " " "\\t"
17
+ echo "a b value sd n omega" | tr " " "\\t"
19
18
  for i in $DS ; do
20
- echo "SELECT CASE WHEN omega!=0 THEN 'AAI' ELSE 'hAAI_AAI' END," \
21
- " seq1, seq2, aai, sd, n, omega from aai;" \
22
- | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
23
- echo "$i" >> miga-project.log
19
+ echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
24
20
  done
25
21
  ) | gzip -9c > miga-project.txt.gz
26
22
 
27
23
  # R-ify
28
- echo "
29
- aai <- read.table(gzfile('miga-project.txt.gz'), sep='\\t', h=T, as.is=TRUE);
30
- save(aai, file='miga-project.Rdata');
31
- if(sum(aai[,'a'] != aai[,'b']) > 0){
32
- h <- hist(aai[aai[,'a'] != aai[,'b'], 'value'], breaks=100, plot=FALSE);
24
+ cat <<R | R --vanilla
25
+ file <- gzfile('miga-project.txt.gz')
26
+ aai <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
+ save(aai, file = 'miga-project.Rdata')
28
+ if(sum(aai[, 'a'] != aai[, 'b']) > 0) {
29
+ h <- hist(aai[aai[, 'a'] != aai[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
+ len <- length(h[['breaks']])
33
31
  write.table(
34
- cbind(h[['breaks']][-length(h[['breaks']])],
35
- h[['breaks']][-1], h[['counts']]),
36
- file='miga-project.hist', quote=FALSE, sep='\\t',
37
- col.names=FALSE, row.names=FALSE);
32
+ cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
+ file = 'miga-project.hist', quote = FALSE, sep = '\t',
34
+ col.names = FALSE, row.names = FALSE
35
+ )
38
36
  }
39
- " | R --vanilla
37
+ R
40
38
 
41
39
  # Finalize
42
40
  miga_end_project_step "$DIR"
@@ -9,33 +9,32 @@ DIR="$PROJECT/data/09.distances/03.ani"
9
9
  # Initialize
10
10
  miga_start_project_step "$DIR"
11
11
 
12
- echo -n "" > miga-project.log
13
- DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
14
-
15
12
  # Extract values
16
13
  rm -f miga-project.txt
14
+ SQL="SELECT seq1, seq2, ani, sd, n, omega from ani;"
15
+ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
17
16
  (
18
- echo "metric a b value sd n omega" | tr " " "\\t"
17
+ echo "a b value sd n omega" | tr " " "\\t"
19
18
  for i in $DS ; do
20
- echo "SELECT 'ANI', seq1, seq2, ani, sd, n, omega from ani ;" \
21
- | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
22
- echo "$i" >> miga-project.log
19
+ echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
23
20
  done
24
21
  ) | gzip -9c > miga-project.txt.gz
25
22
 
26
23
  # R-ify
27
- echo "
28
- ani <- read.table(gzfile('miga-project.txt.gz'), sep='\\t', h=T, as.is=TRUE);
29
- save(ani, file='miga-project.Rdata');
30
- if(sum(ani[,'a'] != ani[,'b']) > 0){
31
- h <- hist(ani[ani[,'a'] != ani[,'b'], 'value'], breaks=100, plot=FALSE);
24
+ cat <<R | R --vanilla
25
+ file <- gzfile('miga-project.txt.gz')
26
+ ani <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
+ save(ani, file = 'miga-project.Rdata')
28
+ if(sum(ani[, 'a'] != ani[, 'b']) > 0) {
29
+ h <- hist(ani[ani[, 'a'] != ani[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
+ len <- length(h[['breaks']])
32
31
  write.table(
33
- cbind(h[['breaks']][-length(h[['breaks']])],
34
- h[['breaks']][-1], h[['counts']]),
35
- file='miga-project.hist', quote=FALSE, sep='\\t',
36
- col.names=FALSE, row.names=FALSE);
32
+ cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
+ file = 'miga-project.hist', quote = FALSE, sep = '\t',
34
+ col.names = FALSE, row.names = FALSE
35
+ )
37
36
  }
38
- " | R --vanilla
37
+ R
39
38
 
40
39
  # Finalize
41
40
  miga_end_project_step "$DIR"
@@ -11,30 +11,44 @@ miga date > "$DATASET.start"
11
11
 
12
12
  # Interpose (if needed)
13
13
  TF="../04.trimmed_fasta"
14
- if [[ -s "$TF/$DATASET.1.fasta" \
15
- && -s "$TF/$DATASET.2.fasta" \
16
- && ! -s "$TF/$DATASET.CoupledReads.fa" ]] ; then
17
- FastA.interpose.pl "$TF/$DATASET.CoupledReads.fa" "$TF/$DATASET".[12].fasta
18
- gzip -9 -f "$TF/$DATASET.1.fasta"
19
- gzip -9 -f "$TF/$DATASET.2.fasta"
20
- miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
14
+ b=$DATASET
15
+ if [[ -s "$TF/${b}.2.fasta" || -s "$TF/${b}.2.fasta.gz" ]] ; then
16
+ cr="$TF/${b}.CoupledReads.fa"
17
+ if [[ ! -s "$cr" && ! -s "${cr}.gz" ]] ; then
18
+ for s in 1 2 ; do
19
+ if [[ -s "$TF/${b}.${s}.fasta" ]] ; then
20
+ ln -s "$TF/${b}.${s}.fasta" "${b}.${s}.tmp"
21
+ else
22
+ gzip -cd "$TF/${b}.${s}.fasta.gz" > "${b}.${s}.tmp"
23
+ fi
24
+ done
25
+ FastA.interpose.pl "$cr" "$b".[12].tmp
26
+ rm "$b".[12].tmp
27
+ miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
28
+ fi
21
29
  fi
22
30
 
31
+ # Gzip (if needed)
32
+ for i in SingleReads CoupledReads ; do
33
+ base="$TF/${DATASET}.${i}.fa"
34
+ if [[ -e "$base" && ! -s "${base}.gz" ]] ; then
35
+ gzip -9f "$base"
36
+ miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
37
+ fi
38
+ done
39
+
23
40
  # Assemble
24
- FA="$TF/$DATASET.CoupledReads.fa"
25
- [[ -e "$FA" ]] || FA="$FA.gz"
26
- [[ -e "$FA" ]] || FA="../04.trimmed_fasta/$DATASET.SingleReads.fa"
27
- [[ -e "$FA" ]] || FA="$FA.gz"
41
+ FA="$TF/${DATASET}.CoupledReads.fa.gz"
42
+ [[ -e "$FA" ]] || FA="$TF/${DATASET}.SingleReads.fa.gz"
28
43
  RD="r"
29
44
  [[ $FA == *.SingleReads.fa* ]] && RD="l"
30
- idba_ud --pre_correction -$RD "$FA" -o "$DATASET" --num_threads "$CORES" || true
45
+ gzip -cd "$FA" \
46
+ | idba_ud --pre_correction -$RD /dev/stdin \
47
+ -o "$DATASET" --num_threads "$CORES" || true
31
48
  [[ -s "$DATASET/contig.fa" ]] || exit 1
32
49
 
33
50
  # Clean
34
- (
35
- cd "$DATASET"
36
- rm kmer graph-*.fa align-* local-contig-*.fa contig-*.fa
37
- )
51
+ ( cd "$DATASET" && rm kmer graph-*.fa align-* local-contig-*.fa contig-*.fa )
38
52
 
39
53
  # Extract
40
54
  if [[ -s "$DATASET/scaffold.fa" ]] ; then
@@ -49,3 +63,4 @@ FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2>=1000{print $1}' \
49
63
  # Finalize
50
64
  miga date > "$DATASET.done"
51
65
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
66
+
@@ -12,34 +12,10 @@ miga_start_project_step "$DIR"
12
12
  # Cleanup databases
13
13
  ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
14
14
 
15
- # Run hAAI
15
+ # No real need for hAAI distributions at all
16
16
  echo -n "" > miga-project.log
17
- DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
18
-
19
- # Extract values
20
- rm -f miga-project.txt
21
- (
22
- echo "metric a b value sd n omega" | tr " " "\\t"
23
- for i in $DS ; do
24
- echo "SELECT 'hAAI', seq1, seq2, aai, sd, n, omega from aai ;" \
25
- | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
26
- echo "$i" >> miga-project.log
27
- done
28
- ) | gzip -9c > miga-project.txt.gz
29
-
30
- # R-ify
31
- echo "
32
- haai <- read.table(gzfile('miga-project.txt.gz'), sep='\\t', h=T, as.is=TRUE);
33
- save(haai, file='miga-project.Rdata');
34
- if(sum(haai[,'a'] != haai[,'b']) > 0){
35
- h <- hist(haai[haai[,'a'] != haai[,'b'], 'value'], breaks=100, plot=FALSE);
36
- write.table(
37
- cbind(h[['breaks']][-length(h[['breaks']])],
38
- h[['breaks']][-1], h[['counts']]),
39
- file='miga-project.hist', quote=FALSE, sep='\\t',
40
- col.names=FALSE, row.names=FALSE);
41
- }
42
- " | R --vanilla
17
+ echo -n "" > miga-project.txt
18
+ echo "aai <- NULL; save(aai, file = 'miga-project.Rdata')" | R --vanilla
43
19
 
44
20
  # Finalize
45
21
  miga_end_project_step "$DIR"
data/scripts/miga.bash CHANGED
@@ -7,9 +7,11 @@ SCRIPT=${SCRIPT:-$(basename "$0" .bash)}
7
7
  # shellcheck source=/dev/null
8
8
  . "$MIGA_HOME/.miga_rc"
9
9
 
10
- # Ensure submodules are first in PATH
11
- export PATH="$MIGA/bin:$MIGA/utils/enveomics/Scripts:$PATH"
12
- export PATH="$MIGA/utils/FastAAI/FastAAI:$PATH"
10
+ # Ensure MiGA & submodules are first in PATH
11
+ export PATH="$MIGA/bin:$PATH"
12
+ for util in enveomics/Scripts FastAAI/FastAAI multitrim ; do
13
+ export PATH="$MIGA/utils/$util:$PATH"
14
+ done
13
15
 
14
16
  # Ancillary functions
15
17
  function exists { [[ -e "$1" ]] ; }
@@ -38,7 +40,7 @@ if [[ "$SCRIPT" != "d" && "$SCRIPT" != "p" ]] ; then
38
40
  echo ""
39
41
  echo "######[ $SCRIPT ]######"
40
42
  echo "# Date: $(miga date)"
41
- echo "# Host: $(hostname)"
43
+ echo "# Host: $(hostname) [$CORES]"
42
44
  echo "# MiGA: $MIGA"
43
45
  echo "# Project: $PROJECT"
44
46
  if [[ -n $DATASET ]] ; then
data/scripts/p.bash CHANGED
@@ -12,7 +12,7 @@ while true ; do
12
12
  if [[ "$res" == "$last_res" ]] ; then
13
13
  let k=$k+1
14
14
  if [[ $k -gt 10 ]] ; then
15
- miga new --update -P "$PROJECT" \
15
+ miga edit -P "$PROJECT" \
16
16
  -m "run_$res=false,warn=Too many failed attempts to run $res"
17
17
  fi
18
18
  else
@@ -6,28 +6,19 @@ SCRIPT="read_quality"
6
6
  . "$MIGA/scripts/miga.bash" || exit 1
7
7
  cd "$PROJECT/data/03.read_quality"
8
8
 
9
- b=$DATASET
10
-
11
9
  # Initialize
12
10
  miga date > "$DATASET.start"
13
11
 
14
- # FastQC
15
- [[ -d "$b.fastqc" ]] || mkdir "$b.fastqc"
16
- fastqc "../02.trimmed_reads/$b".[12].clipped.fastq -o "$b.fastqc"
17
-
18
- # SolexaQA++
19
- [[ -d "$b.solexaqa" ]] || mkdir "$b.solexaqa"
20
- exists "../02.trimmed_reads/$b".[12].*.pdf \
21
- && mv "../02.trimmed_reads/$b".[12].*.pdf "$b.solexaqa/"
22
-
23
- # Clean 02.trimmed_reads
24
- rm -f "../02.trimmed_reads/$b".[12].fastq_trimmed.segments
25
- rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.paired
26
- rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.single
27
- rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed
28
- rm -f "../02.trimmed_reads/$b".[12].fastq
29
- miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
12
+ # Gzip (if necessary)
13
+ for s in 1 2 ; do
14
+ in="../02.trimmed_reads/${DATASET}.${s}.clipped.fastq"
15
+ if [[ -s "$in" ]] ; then
16
+ gzip -9f "$in"
17
+ miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
18
+ fi
19
+ done
30
20
 
31
21
  # Finalize
32
22
  miga date > "$DATASET.done"
33
23
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
24
+
@@ -11,43 +11,27 @@ b=$DATASET
11
11
  # Initialize
12
12
  miga date > "$DATASET.start"
13
13
 
14
- # Gunzip (if necessary)
15
- for sis in 1 2 ; do
16
- for ext in clipped clipped.single ; do
17
- [[ -e "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz" \
18
- && ! -e "../02.trimmed_reads/$b.$sis.${ext}.fastq" ]] \
19
- && gzip -d "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz"
20
- done
14
+ # FastQ -> FastA
15
+ for s in 1 2 ; do
16
+ in="../02.trimmed_reads/${b}.${s}.clipped.fastq.gz"
17
+ [[ -s "$in" ]] \
18
+ && FastQ.maskQual.rb -i "$in" -o "${b}.1.fasta" --fasta --qual 18
21
19
  done
22
- miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
23
20
 
24
- # FastQ -> FastA
25
- FQ2A="$MIGA/utils/enveomics/Scripts/FastQ.toFastA.awk"
26
- awk -f "$FQ2A" < "../02.trimmed_reads/$b.1.clipped.fastq" > "$b.1.fasta"
27
- if [[ -e "../02.trimmed_reads/$b.2.clipped.fastq" ]] ; then
28
- awk -f "$FQ2A" < "../02.trimmed_reads/$b.2.clipped.fastq" > "$b.2.fasta"
29
- FastA.interpose.pl "$b.CoupledReads.fa" "$b".[12].fasta
30
- gzip -9 -f "$b.2.fasta"
31
- gzip -9 -f "$b.1.fasta"
32
- awk -f "$FQ2A" < "../02.trimmed_reads/$b".[12].clipped.single.fastq \
33
- > "$b.SingleReads.fa"
34
- gzip -9 -f "$b.SingleReads.fa"
21
+ # Interpose
22
+ if [[ -e "${b}.2.fasta" ]] ; then
23
+ FastA.interpose.pl "${b}.CoupledReads.fa" "$b".[12].fasta
35
24
  else
36
- mv "$b.1.fasta" "$b.SingleReads.fa"
25
+ mv "${b}.1.fasta" "${b}.SingleReads.fa"
37
26
  fi
38
27
 
39
- # Compress input at 01.raw_reads and 02.trimmed_reads
40
- for sis in 1 2 ; do
41
- [[ -e "../01.raw_reads/$b.$sis.fastq" ]] \
42
- && gzip -9 -f "../01.raw_reads/$b.$sis.fastq"
43
- [[ -e "../02.trimmed_reads/$b.$sis.clipped.fastq" ]] \
44
- && gzip -9 -f "../02.trimmed_reads/$b.$sis.clipped.fastq"
45
- [[ -e "../02.trimmed_reads/$b.$sis.clipped.single.fastq" ]] \
46
- && gzip -9 -f "../02.trimmed_reads/$b.$sis.clipped.single.fastq"
28
+ # Gzip
29
+ for x in 1.fasta 2.fasta SingleReads.fa CoupledReads.fa ; do
30
+ in="${b}.${x}"
31
+ [[ -e "$in" ]] && gzip -9f "$in"
47
32
  done
48
- miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
49
- miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
50
33
 
51
34
  # Finalize
52
35
  miga date > "$DATASET.done"
53
36
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
37
+
@@ -11,49 +11,49 @@ b=$DATASET
11
11
  # Initialize
12
12
  miga date > "$DATASET.start"
13
13
 
14
- # Unzip (if necessary)
15
- [[ -e "../01.raw_reads/$b.1.fastq.gz" && ! -e "../01.raw_reads/$b.1.fastq" ]] \
16
- && gunzip "../01.raw_reads/$b.1.fastq.gz"
17
- [[ -e "../01.raw_reads/$b.2.fastq.gz" && ! -e "../01.raw_reads/$b.2.fastq" ]] \
18
- && gunzip "../01.raw_reads/$b.2.fastq.gz"
19
- miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
20
-
21
14
  # Clean existing files
22
15
  exists "$b".[12].* && rm "$b".[12].*
23
16
 
17
+ # Gzip (if necessary)
18
+ for s in 1 2 ; do
19
+ in="../01.raw_reads/${b}.${s}.fastq"
20
+ if [[ -s "$in" ]] ; then
21
+ gzip -9f "$in"
22
+ miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
23
+ fi
24
+ done
25
+
24
26
  # Tag
25
- FastQ.tag.rb -i "../01.raw_reads/$b.1.fastq" -p "$b-" -s "/1" -o "$b.1.fastq"
26
- [[ -e "../01.raw_reads/$b.2.fastq" ]] \
27
- && FastQ.tag.rb -i "../01.raw_reads/$b.2.fastq" -p "$b-" -s "/2" \
28
- -o "$b.2.fastq"
29
-
30
- # Trim
31
- SolexaQA++ dynamictrim "$b".[12].fastq -h 20 -d .
32
- SolexaQA++ lengthsort "$b".[12].fastq.trimmed -l 50 -d .
33
-
34
- # Clean adapters
35
- if [[ -e "$b.2.fastq.trimmed.paired" ]] ; then
36
- scythe -a "$MIGA/utils/adapters.fa" "$b.1.fastq.trimmed.paired" \
37
- > "$b.1.clipped.all.fastq"
38
- scythe -a "$MIGA/utils/adapters.fa" "$b.2.fastq.trimmed.paired" \
39
- > "$b.2.clipped.all.fastq"
40
- SolexaQA++ lengthsort "$b".[12].clipped.all.fastq -l 50 -d .
41
- rm "$b".[12].clipped.all.fastq
42
- [[ -e "$b".1.clipped.all.fastq.single ]] \
43
- && mv "$b.1.clipped.all.fastq.single" "$b.1.clipped.single.fastq"
44
- [[ -e "$b".2.clipped.all.fastq.single ]] \
45
- && mv "$b.2.clipped.all.fastq.single" "$b.2.clipped.single.fastq"
46
- mv "$b.1.clipped.all.fastq.paired" "$b.1.clipped.fastq"
47
- mv "$b.2.clipped.all.fastq.paired" "$b.2.clipped.fastq"
48
- rm -f "$b.1.clipped.all.fastq.summary.txt"
27
+ in1="../01.raw_reads/$b.1.fastq.gz"
28
+ in2="../01.raw_reads/$b.2.fastq.gz"
29
+ FastQ.tag.rb -i "$in1" -p "$b-" -s "/1" -o "$b.1.fastq.gz"
30
+ [[ -e "$in2" ]] && FastQ.tag.rb -i "$in2" -p "$b-" -s "/2" -o "$b.2.fastq.gz"
31
+
32
+ # Multitrim
33
+ CMD="multitrim.py --zip gzip --level 9 --threads $CORES -o $b"
34
+ if [[ -s "$b.2.fastq.gz" ]] ; then
35
+ # Paired
36
+ $CMD -1 "$b.1.fastq.gz" -2 "$b.2.fastq.gz"
37
+ for s in 1 2 ; do
38
+ mv "$b/${s}.post_trim_${b}.${s}.fq.gz" "${b}.${s}.clipped.fastq.gz"
39
+ mv "$b/${s}.pre_trim_QC_${b}.${s}.html" "../03.read_quality/${b}.pre.${s}.html"
40
+ mv "$b/${s}.post_trim_QC_${b}.${s}.html" "../03.read_quality/${b}.post.${s}.html"
41
+ done
49
42
  else
50
- scythe -a "$MIGA/utils/adapters.fa" "$b.1.fastq.trimmed.single" \
51
- > "$b.1.clipped.all.fastq"
52
- SolexaQA++ lengthsort "$b.1.clipped.all.fastq" -l 50 -d .
53
- mv "$b.1.clipped.all.fastq.single" "$b.1.clipped.fastq"
43
+ # Unpaired
44
+ $CMD -u "$b.1.fastq.gz"
45
+ mv "$b/unpaired.post_trim_${b}.1.fq.gz" "${b}.1.clipped.fastq.gz"
46
+ mv "$b/unpaired.pre_trim_QC_${b}.1.html" "../03.read_quality/${b}.pre.1.html"
47
+ mv "$b/unpaired.post_trim_QC_${b}.1.html" "../03.read_quality/${b}.post.1.html"
54
48
  fi
55
- rm -f "$b".[12].*.discard
49
+ mv "$b/Subsample_Adapter_Detection.stats.txt" \
50
+ "../03.read_quality/$b.adapters.txt"
51
+
52
+ # Cleanup
53
+ rm -r "$b"
54
+ rm -f "$b".[12].fastq.gz
56
55
 
57
56
  # Finalize
58
57
  miga date > "$DATASET.done"
59
58
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
59
+