miga-base 0.7.26.3 → 1.0.0.sr1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. checksums.yaml +4 -4
  2. data/lib/miga/_data/aai-intax.blast.tsv.gz +0 -0
  3. data/lib/miga/_data/aai-intax.diamond.tsv.gz +0 -0
  4. data/lib/miga/_data/aai-novel.blast.tsv.gz +0 -0
  5. data/lib/miga/_data/aai-novel.diamond.tsv.gz +0 -0
  6. data/lib/miga/cli/action/doctor.rb +50 -19
  7. data/lib/miga/cli/action/doctor/base.rb +20 -18
  8. data/lib/miga/cli/action/init.rb +11 -7
  9. data/lib/miga/cli/action/init/files_helper.rb +1 -0
  10. data/lib/miga/cli/action/ncbi_get.rb +3 -3
  11. data/lib/miga/cli/action/tax_dist.rb +2 -2
  12. data/lib/miga/cli/action/wf.rb +5 -4
  13. data/lib/miga/daemon.rb +11 -4
  14. data/lib/miga/dataset/result.rb +10 -6
  15. data/lib/miga/json.rb +1 -2
  16. data/lib/miga/metadata.rb +5 -1
  17. data/lib/miga/parallel.rb +11 -6
  18. data/lib/miga/project.rb +8 -8
  19. data/lib/miga/project/base.rb +4 -4
  20. data/lib/miga/project/result.rb +2 -2
  21. data/lib/miga/sqlite.rb +7 -0
  22. data/lib/miga/version.rb +23 -9
  23. data/scripts/aai_distances.bash +16 -18
  24. data/scripts/ani_distances.bash +16 -17
  25. data/scripts/assembly.bash +31 -16
  26. data/scripts/haai_distances.bash +3 -27
  27. data/scripts/miga.bash +6 -4
  28. data/scripts/p.bash +1 -1
  29. data/scripts/read_quality.bash +9 -18
  30. data/scripts/trimmed_fasta.bash +14 -30
  31. data/scripts/trimmed_reads.bash +36 -36
  32. data/test/parallel_test.rb +31 -0
  33. data/test/project_test.rb +2 -1
  34. data/utils/distance/commands.rb +1 -0
  35. data/utils/distance/runner.rb +2 -4
  36. data/utils/enveomics/Manifest/Tasks/fasta.json +39 -3
  37. data/utils/enveomics/Manifest/Tasks/fastq.json +50 -2
  38. data/utils/enveomics/Manifest/Tasks/mapping.json +70 -0
  39. data/utils/enveomics/Manifest/Tasks/other.json +77 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +138 -1
  41. data/utils/enveomics/Manifest/categories.json +13 -4
  42. data/utils/enveomics/Scripts/Aln.cat.rb +206 -148
  43. data/utils/enveomics/Scripts/FastA.N50.pl +33 -29
  44. data/utils/enveomics/Scripts/FastA.fragment.rb +69 -61
  45. data/utils/enveomics/Scripts/FastA.sample.rb +61 -46
  46. data/utils/enveomics/Scripts/FastA.toFastQ.rb +69 -0
  47. data/utils/enveomics/Scripts/FastQ.maskQual.rb +89 -0
  48. data/utils/enveomics/Scripts/FastQ.tag.rb +59 -52
  49. data/utils/enveomics/Scripts/SRA.download.bash +6 -8
  50. data/utils/enveomics/Scripts/Table.prefScore.R +60 -0
  51. data/utils/enveomics/Scripts/aai.rb +3 -2
  52. data/utils/enveomics/Scripts/anir.rb +137 -0
  53. data/utils/enveomics/Scripts/lib/enveomics_rb/anir.rb +293 -0
  54. data/utils/enveomics/Scripts/lib/enveomics_rb/bm_set.rb +175 -0
  55. data/utils/enveomics/Scripts/lib/enveomics_rb/enveomics.rb +17 -17
  56. data/utils/enveomics/Scripts/lib/enveomics_rb/errors.rb +17 -0
  57. data/utils/enveomics/Scripts/lib/enveomics_rb/gmm_em.rb +30 -0
  58. data/utils/enveomics/Scripts/lib/enveomics_rb/match.rb +63 -0
  59. data/utils/enveomics/Scripts/lib/enveomics_rb/rbm.rb +49 -0
  60. data/utils/enveomics/Scripts/lib/enveomics_rb/stats.rb +3 -0
  61. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/rand.rb +31 -0
  62. data/utils/enveomics/Scripts/lib/enveomics_rb/stats/sample.rb +152 -0
  63. data/utils/enveomics/Scripts/lib/enveomics_rb/utils.rb +73 -0
  64. data/utils/enveomics/Scripts/rbm-legacy.rb +172 -0
  65. data/utils/enveomics/Scripts/rbm.rb +87 -133
  66. data/utils/enveomics/Scripts/sam.filter.rb +148 -0
  67. data/utils/enveomics/enveomics.R/DESCRIPTION +2 -2
  68. data/utils/enveomics/enveomics.R/NAMESPACE +1 -1
  69. data/utils/enveomics/enveomics.R/R/prefscore.R +79 -0
  70. data/utils/enveomics/enveomics.R/R/utils.R +30 -0
  71. data/utils/enveomics/enveomics.R/README.md +1 -0
  72. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +0 -1
  73. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +0 -1
  74. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +0 -1
  75. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +10 -2
  76. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +16 -4
  77. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +13 -3
  78. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +8 -2
  79. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +8 -2
  80. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +9 -2
  81. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +13 -5
  82. data/utils/enveomics/enveomics.R/man/enve.prefscore.Rd +50 -0
  83. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +9 -2
  84. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +23 -6
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +13 -4
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +8 -2
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +7 -2
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +14 -3
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +10 -2
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +8 -2
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +17 -9
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +6 -2
  93. data/utils/enveomics/enveomics.R/man/enve.selvector.Rd +23 -0
  94. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +14 -5
  95. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +19 -4
  96. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +11 -3
  97. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +11 -4
  98. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +26 -12
  99. data/utils/multitrim/Multitrim How-To.pdf +0 -0
  100. data/utils/multitrim/README.md +67 -0
  101. data/utils/multitrim/multitrim.py +1555 -0
  102. data/utils/multitrim/multitrim.yml +13 -0
  103. data/utils/requirements.txt +4 -3
  104. metadata +33 -6
  105. data/utils/enveomics/Scripts/lib/enveomics_rb/stat.rb +0 -30
data/lib/miga/project/result.rb CHANGED
@@ -55,12 +55,12 @@ module MiGA::Project::Result
55
55
  ##
56
56
  # Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
57
57
  def add_result_distances(base, _opts)
58
- return nil unless result_files_exist?(base, %w[.Rdata .log .txt])
58
+ return nil unless result_files_exist?(base, %w[.Rdata .txt])
59
59
 
60
60
  r = MiGA::Result.new("#{base}.json")
61
61
  r.add_file(:rdata, 'miga-project.Rdata')
62
62
  r.add_file(:matrix, 'miga-project.txt')
63
- r.add_file(:log, 'miga-project.log')
63
+ r.add_file(:log, 'miga-project.log') # Legacy file
64
64
  r.add_file(:hist, 'miga-project.hist')
65
65
  r
66
66
  end
data/lib/miga/sqlite.rb CHANGED
@@ -37,6 +37,7 @@ class MiGA::SQLite < MiGA::MiGA
37
37
  # Executes +cmd+ and returns the result
38
38
  def run(*cmd)
39
39
  busy_attempts ||= 0
40
+ io_attempts ||= 0
40
41
  y = nil
41
42
  SQLite3::Database.new(path) { |conn| y = conn.execute(*cmd) }
42
43
  y
@@ -44,6 +45,12 @@ class MiGA::SQLite < MiGA::MiGA
44
45
  busy_attempts += 1
45
46
  raise "Database busy #{path}: #{e.message}" if busy_attempts >= 3
46
47
 
48
+ sleep(1)
49
+ retry
50
+ rescue SQLite3::IOException => e
51
+ io_attempts += 1
52
+ raise "Database I/O error #{path}: #{e.message}" if io_attempts >= 3
53
+
47
54
  sleep(1)
48
55
  retry
49
56
  end
data/lib/miga/version.rb CHANGED
@@ -9,23 +9,33 @@ module MiGA
9
9
  # Current version of MiGA. An Array with three values:
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
- # - Integer representing minor changes that require new version number.
13
- VERSION = [0.7, 26, 3].freeze
12
+ # - String indicating release status:
13
+ # - rc* release candidate, not released as gem
14
+ # - sr* stable release, released as gem
15
+ VERSION = [1.0, 0, 'sr1'].freeze
14
16
 
15
17
  ##
16
18
  # Nickname for the current major.minor version.
17
- VERSION_NAME = 'lithograph'
19
+ VERSION_NAME = 'prima'
18
20
 
19
21
  ##
20
22
  # Date of the current gem release.
21
- VERSION_DATE = Date.new(2021, 3, 11)
23
+ VERSION_DATE = Date.new(2021, 4, 12)
22
24
 
23
25
  ##
24
- # Reference of MiGA.
25
- CITATION = 'Rodriguez-R et al (2018). ' \
26
- 'The Microbial Genomes Atlas (MiGA) webserver: taxonomic and gene ' \
27
- 'diversity analysis of Archaea and Bacteria at the whole genome level. ' \
28
- 'Nucleic Acids Research 46(W1):W282-W288. doi:10.1093/nar/gky467.'
26
+ # References of MiGA
27
+ CITATION = []
28
+ CITATION << <<~REF
29
+ Rodriguez-R et al (2018). The Microbial Genomes Atlas (MiGA) webserver:
30
+ taxonomic and gene diversity analysis of Archaea and Bacteria at the whole
31
+ genome level. Nucleic Acids Research 46(W1):W282-W288.
32
+ doi:10.1093/nar/gky467.
33
+ REF
34
+ CITATION << <<~REF
35
+ Rodriguez-R et al (2020). Classifying prokaryotic genomes using the
36
+ Microbial Genomes Atlas (MiGA) webserver. Bergey's Manual of Systematics
37
+ of Archaea and Bacteria.
38
+ REF
29
39
  end
30
40
 
31
41
  class MiGA::MiGA
@@ -58,6 +68,10 @@ class MiGA::MiGA
58
68
  ##
59
69
  # Reference of MiGA
60
70
  def self.CITATION
71
+ CITATION.map { |i| "- #{i}" }.join
72
+ end
73
+
74
+ def self.CITATION_ARRAY
61
75
  CITATION
62
76
  end
63
77
  end
data/scripts/aai_distances.bash CHANGED
@@ -9,34 +9,32 @@ DIR="$PROJECT/data/09.distances/02.aai"
9
9
  # Initialize
10
10
  miga_start_project_step "$DIR"
11
11
 
12
- echo -n "" > miga-project.log
13
- DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
14
-
15
12
  # Extract values
16
13
  rm -f miga-project.txt
14
+ SQL="SELECT seq1, seq2, aai, sd, n, omega from aai;"
15
+ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
17
16
  (
18
- echo "metric a b value sd n omega" | tr " " "\\t"
17
+ echo "a b value sd n omega" | tr " " "\\t"
19
18
  for i in $DS ; do
20
- echo "SELECT CASE WHEN omega!=0 THEN 'AAI' ELSE 'hAAI_AAI' END," \
21
- " seq1, seq2, aai, sd, n, omega from aai;" \
22
- | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
23
- echo "$i" >> miga-project.log
19
+ echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
24
20
  done
25
21
  ) | gzip -9c > miga-project.txt.gz
26
22
 
27
23
  # R-ify
28
- echo "
29
- aai <- read.table(gzfile('miga-project.txt.gz'), sep='\\t', h=T, as.is=TRUE);
30
- save(aai, file='miga-project.Rdata');
31
- if(sum(aai[,'a'] != aai[,'b']) > 0){
32
- h <- hist(aai[aai[,'a'] != aai[,'b'], 'value'], breaks=100, plot=FALSE);
24
+ cat <<R | R --vanilla
25
+ file <- gzfile('miga-project.txt.gz')
26
+ aai <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
+ save(aai, file = 'miga-project.Rdata')
28
+ if(sum(aai[, 'a'] != aai[, 'b']) > 0) {
29
+ h <- hist(aai[aai[, 'a'] != aai[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
+ len <- length(h[['breaks']])
33
31
  write.table(
34
- cbind(h[['breaks']][-length(h[['breaks']])],
35
- h[['breaks']][-1], h[['counts']]),
36
- file='miga-project.hist', quote=FALSE, sep='\\t',
37
- col.names=FALSE, row.names=FALSE);
32
+ cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
+ file = 'miga-project.hist', quote = FALSE, sep = '\t',
34
+ col.names = FALSE, row.names = FALSE
35
+ )
38
36
  }
39
- " | R --vanilla
37
+ R
40
38
 
41
39
  # Finalize
42
40
  miga_end_project_step "$DIR"
data/scripts/ani_distances.bash CHANGED
@@ -9,33 +9,32 @@ DIR="$PROJECT/data/09.distances/03.ani"
9
9
  # Initialize
10
10
  miga_start_project_step "$DIR"
11
11
 
12
- echo -n "" > miga-project.log
13
- DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
14
-
15
12
  # Extract values
16
13
  rm -f miga-project.txt
14
+ SQL="SELECT seq1, seq2, ani, sd, n, omega from ani;"
15
+ DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
17
16
  (
18
- echo "metric a b value sd n omega" | tr " " "\\t"
17
+ echo "a b value sd n omega" | tr " " "\\t"
19
18
  for i in $DS ; do
20
- echo "SELECT 'ANI', seq1, seq2, ani, sd, n, omega from ani ;" \
21
- | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
22
- echo "$i" >> miga-project.log
19
+ echo "$SQL" | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
23
20
  done
24
21
  ) | gzip -9c > miga-project.txt.gz
25
22
 
26
23
  # R-ify
27
- echo "
28
- ani <- read.table(gzfile('miga-project.txt.gz'), sep='\\t', h=T, as.is=TRUE);
29
- save(ani, file='miga-project.Rdata');
30
- if(sum(ani[,'a'] != ani[,'b']) > 0){
31
- h <- hist(ani[ani[,'a'] != ani[,'b'], 'value'], breaks=100, plot=FALSE);
24
+ cat <<R | R --vanilla
25
+ file <- gzfile('miga-project.txt.gz')
26
+ ani <- read.table(file, sep = '\t', header = TRUE, as.is = TRUE)
27
+ save(ani, file = 'miga-project.Rdata')
28
+ if(sum(ani[, 'a'] != ani[, 'b']) > 0) {
29
+ h <- hist(ani[ani[, 'a'] != ani[, 'b'], 'value'], breaks = 100, plot = FALSE)
30
+ len <- length(h[['breaks']])
32
31
  write.table(
33
- cbind(h[['breaks']][-length(h[['breaks']])],
34
- h[['breaks']][-1], h[['counts']]),
35
- file='miga-project.hist', quote=FALSE, sep='\\t',
36
- col.names=FALSE, row.names=FALSE);
32
+ cbind(h[['breaks']][-len], h[['breaks']][-1], h[['counts']]),
33
+ file = 'miga-project.hist', quote = FALSE, sep = '\t',
34
+ col.names = FALSE, row.names = FALSE
35
+ )
37
36
  }
38
- " | R --vanilla
37
+ R
39
38
 
40
39
  # Finalize
41
40
  miga_end_project_step "$DIR"
data/scripts/assembly.bash CHANGED
@@ -11,30 +11,44 @@ miga date > "$DATASET.start"
11
11
 
12
12
  # Interpose (if needed)
13
13
  TF="../04.trimmed_fasta"
14
- if [[ -s "$TF/$DATASET.1.fasta" \
15
- && -s "$TF/$DATASET.2.fasta" \
16
- && ! -s "$TF/$DATASET.CoupledReads.fa" ]] ; then
17
- FastA.interpose.pl "$TF/$DATASET.CoupledReads.fa" "$TF/$DATASET".[12].fasta
18
- gzip -9 -f "$TF/$DATASET.1.fasta"
19
- gzip -9 -f "$TF/$DATASET.2.fasta"
20
- miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
14
+ b=$DATASET
15
+ if [[ -s "$TF/${b}.2.fasta" || -s "$TF/${b}.2.fasta.gz" ]] ; then
16
+ cr="$TF/${b}.CoupledReads.fa"
17
+ if [[ ! -s "$cr" && ! -s "${cr}.gz" ]] ; then
18
+ for s in 1 2 ; do
19
+ if [[ -s "$TF/${b}.${s}.fasta" ]] ; then
20
+ ln -s "$TF/${b}.${s}.fasta" "${b}.${s}.tmp"
21
+ else
22
+ gzip -cd "$TF/${b}.${s}.fasta.gz" > "${b}.${s}.tmp"
23
+ fi
24
+ done
25
+ FastA.interpose.pl "$cr" "$b".[12].tmp
26
+ rm "$b".[12].tmp
27
+ miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
28
+ fi
21
29
  fi
22
30
 
31
+ # Gzip (if needed)
32
+ for i in SingleReads CoupledReads ; do
33
+ base="$TF/${DATASET}.${i}.fa"
34
+ if [[ -e "$base" && ! -s "${base}.gz" ]] ; then
35
+ gzip -9f "$base"
36
+ miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f
37
+ fi
38
+ done
39
+
23
40
  # Assemble
24
- FA="$TF/$DATASET.CoupledReads.fa"
25
- [[ -e "$FA" ]] || FA="$FA.gz"
26
- [[ -e "$FA" ]] || FA="../04.trimmed_fasta/$DATASET.SingleReads.fa"
27
- [[ -e "$FA" ]] || FA="$FA.gz"
41
+ FA="$TF/${DATASET}.CoupledReads.fa.gz"
42
+ [[ -e "$FA" ]] || FA="$TF/${DATASET}.SingleReads.fa.gz"
28
43
  RD="r"
29
44
  [[ $FA == *.SingleReads.fa* ]] && RD="l"
30
- idba_ud --pre_correction -$RD "$FA" -o "$DATASET" --num_threads "$CORES" || true
45
+ gzip -cd "$FA" \
46
+ | idba_ud --pre_correction -$RD /dev/stdin \
47
+ -o "$DATASET" --num_threads "$CORES" || true
31
48
  [[ -s "$DATASET/contig.fa" ]] || exit 1
32
49
 
33
50
  # Clean
34
- (
35
- cd "$DATASET"
36
- rm kmer graph-*.fa align-* local-contig-*.fa contig-*.fa
37
- )
51
+ ( cd "$DATASET" && rm kmer graph-*.fa align-* local-contig-*.fa contig-*.fa )
38
52
 
39
53
  # Extract
40
54
  if [[ -s "$DATASET/scaffold.fa" ]] ; then
@@ -49,3 +63,4 @@ FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2>=1000{print $1}' \
49
63
  # Finalize
50
64
  miga date > "$DATASET.done"
51
65
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
66
+
data/scripts/haai_distances.bash CHANGED
@@ -12,34 +12,10 @@ miga_start_project_step "$DIR"
12
12
  # Cleanup databases
13
13
  ruby -I "$MIGA/lib" "$MIGA/utils/cleanup-databases.rb" "$PROJECT" "$CORES"
14
14
 
15
- # Run hAAI
15
+ # No real need for hAAI distributions at all
16
16
  echo -n "" > miga-project.log
17
- DS=$(miga ls -P "$PROJECT" --ref --no-multi --active)
18
-
19
- # Extract values
20
- rm -f miga-project.txt
21
- (
22
- echo "metric a b value sd n omega" | tr " " "\\t"
23
- for i in $DS ; do
24
- echo "SELECT 'hAAI', seq1, seq2, aai, sd, n, omega from aai ;" \
25
- | sqlite3 "$DIR/$i.db" | tr "\\|" "\\t"
26
- echo "$i" >> miga-project.log
27
- done
28
- ) | gzip -9c > miga-project.txt.gz
29
-
30
- # R-ify
31
- echo "
32
- haai <- read.table(gzfile('miga-project.txt.gz'), sep='\\t', h=T, as.is=TRUE);
33
- save(haai, file='miga-project.Rdata');
34
- if(sum(haai[,'a'] != haai[,'b']) > 0){
35
- h <- hist(haai[haai[,'a'] != haai[,'b'], 'value'], breaks=100, plot=FALSE);
36
- write.table(
37
- cbind(h[['breaks']][-length(h[['breaks']])],
38
- h[['breaks']][-1], h[['counts']]),
39
- file='miga-project.hist', quote=FALSE, sep='\\t',
40
- col.names=FALSE, row.names=FALSE);
41
- }
42
- " | R --vanilla
17
+ echo -n "" > miga-project.txt
18
+ echo "aai <- NULL; save(aai, file = 'miga-project.Rdata')" | R --vanilla
43
19
 
44
20
  # Finalize
45
21
  miga_end_project_step "$DIR"
data/scripts/miga.bash CHANGED
@@ -7,9 +7,11 @@ SCRIPT=${SCRIPT:-$(basename "$0" .bash)}
7
7
  # shellcheck source=/dev/null
8
8
  . "$MIGA_HOME/.miga_rc"
9
9
 
10
- # Ensure submodules are first in PATH
11
- export PATH="$MIGA/bin:$MIGA/utils/enveomics/Scripts:$PATH"
12
- export PATH="$MIGA/utils/FastAAI/FastAAI:$PATH"
10
+ # Ensure MiGA & submodules are first in PATH
11
+ export PATH="$MIGA/bin:$PATH"
12
+ for util in enveomics/Scripts FastAAI/FastAAI multitrim ; do
13
+ export PATH="$MIGA/utils/$util:$PATH"
14
+ done
13
15
 
14
16
  # Ancillary functions
15
17
  function exists { [[ -e "$1" ]] ; }
@@ -38,7 +40,7 @@ if [[ "$SCRIPT" != "d" && "$SCRIPT" != "p" ]] ; then
38
40
  echo ""
39
41
  echo "######[ $SCRIPT ]######"
40
42
  echo "# Date: $(miga date)"
41
- echo "# Host: $(hostname)"
43
+ echo "# Host: $(hostname) [$CORES]"
42
44
  echo "# MiGA: $MIGA"
43
45
  echo "# Project: $PROJECT"
44
46
  if [[ -n $DATASET ]] ; then
data/scripts/p.bash CHANGED
@@ -12,7 +12,7 @@ while true ; do
12
12
  if [[ "$res" == "$last_res" ]] ; then
13
13
  let k=$k+1
14
14
  if [[ $k -gt 10 ]] ; then
15
- miga new --update -P "$PROJECT" \
15
+ miga edit -P "$PROJECT" \
16
16
  -m "run_$res=false,warn=Too many failed attempts to run $res"
17
17
  fi
18
18
  else
data/scripts/read_quality.bash CHANGED
@@ -6,28 +6,19 @@ SCRIPT="read_quality"
6
6
  . "$MIGA/scripts/miga.bash" || exit 1
7
7
  cd "$PROJECT/data/03.read_quality"
8
8
 
9
- b=$DATASET
10
-
11
9
  # Initialize
12
10
  miga date > "$DATASET.start"
13
11
 
14
- # FastQC
15
- [[ -d "$b.fastqc" ]] || mkdir "$b.fastqc"
16
- fastqc "../02.trimmed_reads/$b".[12].clipped.fastq -o "$b.fastqc"
17
-
18
- # SolexaQA++
19
- [[ -d "$b.solexaqa" ]] || mkdir "$b.solexaqa"
20
- exists "../02.trimmed_reads/$b".[12].*.pdf \
21
- && mv "../02.trimmed_reads/$b".[12].*.pdf "$b.solexaqa/"
22
-
23
- # Clean 02.trimmed_reads
24
- rm -f "../02.trimmed_reads/$b".[12].fastq_trimmed.segments
25
- rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.paired
26
- rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed.single
27
- rm -f "../02.trimmed_reads/$b".[12].fastq.trimmed
28
- rm -f "../02.trimmed_reads/$b".[12].fastq
29
- miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
12
+ # Gzip (if necessary)
13
+ for s in 1 2 ; do
14
+ in="../02.trimmed_reads/${DATASET}.${s}.clipped.fastq"
15
+ if [[ -s "$in" ]] ; then
16
+ gzip -9f "$in"
17
+ miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
18
+ fi
19
+ done
30
20
 
31
21
  # Finalize
32
22
  miga date > "$DATASET.done"
33
23
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
24
+
data/scripts/trimmed_fasta.bash CHANGED
@@ -11,43 +11,27 @@ b=$DATASET
11
11
  # Initialize
12
12
  miga date > "$DATASET.start"
13
13
 
14
- # Gunzip (if necessary)
15
- for sis in 1 2 ; do
16
- for ext in clipped clipped.single ; do
17
- [[ -e "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz" \
18
- && ! -e "../02.trimmed_reads/$b.$sis.${ext}.fastq" ]] \
19
- && gzip -d "../02.trimmed_reads/$b.$sis.${ext}.fastq.gz"
20
- done
14
+ # FastQ -> FastA
15
+ for s in 1 2 ; do
16
+ in="../02.trimmed_reads/${b}.${s}.clipped.fastq.gz"
17
+ [[ -s "$in" ]] \
18
+ && FastQ.maskQual.rb -i "$in" -o "${b}.1.fasta" --fasta --qual 18
21
19
  done
22
- miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
23
20
 
24
- # FastQ -> FastA
25
- FQ2A="$MIGA/utils/enveomics/Scripts/FastQ.toFastA.awk"
26
- awk -f "$FQ2A" < "../02.trimmed_reads/$b.1.clipped.fastq" > "$b.1.fasta"
27
- if [[ -e "../02.trimmed_reads/$b.2.clipped.fastq" ]] ; then
28
- awk -f "$FQ2A" < "../02.trimmed_reads/$b.2.clipped.fastq" > "$b.2.fasta"
29
- FastA.interpose.pl "$b.CoupledReads.fa" "$b".[12].fasta
30
- gzip -9 -f "$b.2.fasta"
31
- gzip -9 -f "$b.1.fasta"
32
- awk -f "$FQ2A" < "../02.trimmed_reads/$b".[12].clipped.single.fastq \
33
- > "$b.SingleReads.fa"
34
- gzip -9 -f "$b.SingleReads.fa"
21
+ # Interpose
22
+ if [[ -e "${b}.2.fasta" ]] ; then
23
+ FastA.interpose.pl "${b}.CoupledReads.fa" "$b".[12].fasta
35
24
  else
36
- mv "$b.1.fasta" "$b.SingleReads.fa"
25
+ mv "${b}.1.fasta" "${b}.SingleReads.fa"
37
26
  fi
38
27
 
39
- # Compress input at 01.raw_reads and 02.trimmed_reads
40
- for sis in 1 2 ; do
41
- [[ -e "../01.raw_reads/$b.$sis.fastq" ]] \
42
- && gzip -9 -f "../01.raw_reads/$b.$sis.fastq"
43
- [[ -e "../02.trimmed_reads/$b.$sis.clipped.fastq" ]] \
44
- && gzip -9 -f "../02.trimmed_reads/$b.$sis.clipped.fastq"
45
- [[ -e "../02.trimmed_reads/$b.$sis.clipped.single.fastq" ]] \
46
- && gzip -9 -f "../02.trimmed_reads/$b.$sis.clipped.single.fastq"
28
+ # Gzip
29
+ for x in 1.fasta 2.fasta SingleReads.fa CoupledReads.fa ; do
30
+ in="${b}.${x}"
31
+ [[ -e "$in" ]] && gzip -9f "$in"
47
32
  done
48
- miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
49
- miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_reads -f
50
33
 
51
34
  # Finalize
52
35
  miga date > "$DATASET.done"
53
36
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
37
+
data/scripts/trimmed_reads.bash CHANGED
@@ -11,49 +11,49 @@ b=$DATASET
11
11
  # Initialize
12
12
  miga date > "$DATASET.start"
13
13
 
14
- # Unzip (if necessary)
15
- [[ -e "../01.raw_reads/$b.1.fastq.gz" && ! -e "../01.raw_reads/$b.1.fastq" ]] \
16
- && gunzip "../01.raw_reads/$b.1.fastq.gz"
17
- [[ -e "../01.raw_reads/$b.2.fastq.gz" && ! -e "../01.raw_reads/$b.2.fastq" ]] \
18
- && gunzip "../01.raw_reads/$b.2.fastq.gz"
19
- miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
20
-
21
14
  # Clean existing files
22
15
  exists "$b".[12].* && rm "$b".[12].*
23
16
 
17
+ # Gzip (if necessary)
18
+ for s in 1 2 ; do
19
+ in="../01.raw_reads/${b}.${s}.fastq"
20
+ if [[ -s "$in" ]] ; then
21
+ gzip -9f "$in"
22
+ miga add_result -P "$PROJECT" -D "$DATASET" -r raw_reads -f
23
+ fi
24
+ done
25
+
24
26
  # Tag
25
- FastQ.tag.rb -i "../01.raw_reads/$b.1.fastq" -p "$b-" -s "/1" -o "$b.1.fastq"
26
- [[ -e "../01.raw_reads/$b.2.fastq" ]] \
27
- && FastQ.tag.rb -i "../01.raw_reads/$b.2.fastq" -p "$b-" -s "/2" \
28
- -o "$b.2.fastq"
29
-
30
- # Trim
31
- SolexaQA++ dynamictrim "$b".[12].fastq -h 20 -d .
32
- SolexaQA++ lengthsort "$b".[12].fastq.trimmed -l 50 -d .
33
-
34
- # Clean adapters
35
- if [[ -e "$b.2.fastq.trimmed.paired" ]] ; then
36
- scythe -a "$MIGA/utils/adapters.fa" "$b.1.fastq.trimmed.paired" \
37
- > "$b.1.clipped.all.fastq"
38
- scythe -a "$MIGA/utils/adapters.fa" "$b.2.fastq.trimmed.paired" \
39
- > "$b.2.clipped.all.fastq"
40
- SolexaQA++ lengthsort "$b".[12].clipped.all.fastq -l 50 -d .
41
- rm "$b".[12].clipped.all.fastq
42
- [[ -e "$b".1.clipped.all.fastq.single ]] \
43
- && mv "$b.1.clipped.all.fastq.single" "$b.1.clipped.single.fastq"
44
- [[ -e "$b".2.clipped.all.fastq.single ]] \
45
- && mv "$b.2.clipped.all.fastq.single" "$b.2.clipped.single.fastq"
46
- mv "$b.1.clipped.all.fastq.paired" "$b.1.clipped.fastq"
47
- mv "$b.2.clipped.all.fastq.paired" "$b.2.clipped.fastq"
48
- rm -f "$b.1.clipped.all.fastq.summary.txt"
27
+ in1="../01.raw_reads/$b.1.fastq.gz"
28
+ in2="../01.raw_reads/$b.2.fastq.gz"
29
+ FastQ.tag.rb -i "$in1" -p "$b-" -s "/1" -o "$b.1.fastq.gz"
30
+ [[ -e "$in2" ]] && FastQ.tag.rb -i "$in2" -p "$b-" -s "/2" -o "$b.2.fastq.gz"
31
+
32
+ # Multitrim
33
+ CMD="multitrim.py --zip gzip --level 9 --threads $CORES -o $b"
34
+ if [[ -s "$b.2.fastq.gz" ]] ; then
35
+ # Paired
36
+ $CMD -1 "$b.1.fastq.gz" -2 "$b.2.fastq.gz"
37
+ for s in 1 2 ; do
38
+ mv "$b/${s}.post_trim_${b}.${s}.fq.gz" "${b}.${s}.clipped.fastq.gz"
39
+ mv "$b/${s}.pre_trim_QC_${b}.${s}.html" "../03.read_quality/${b}.pre.${s}.html"
40
+ mv "$b/${s}.post_trim_QC_${b}.${s}.html" "../03.read_quality/${b}.post.${s}.html"
41
+ done
49
42
  else
50
- scythe -a "$MIGA/utils/adapters.fa" "$b.1.fastq.trimmed.single" \
51
- > "$b.1.clipped.all.fastq"
52
- SolexaQA++ lengthsort "$b.1.clipped.all.fastq" -l 50 -d .
53
- mv "$b.1.clipped.all.fastq.single" "$b.1.clipped.fastq"
43
+ # Unpaired
44
+ $CMD -u "$b.1.fastq.gz"
45
+ mv "$b/unpaired.post_trim_${b}.1.fq.gz" "${b}.1.clipped.fastq.gz"
46
+ mv "$b/unpaired.pre_trim_QC_${b}.1.html" "../03.read_quality/${b}.pre.1.html"
47
+ mv "$b/unpaired.post_trim_QC_${b}.1.html" "../03.read_quality/${b}.post.1.html"
54
48
  fi
55
- rm -f "$b".[12].*.discard
49
+ mv "$b/Subsample_Adapter_Detection.stats.txt" \
50
+ "../03.read_quality/$b.adapters.txt"
51
+
52
+ # Cleanup
53
+ rm -r "$b"
54
+ rm -f "$b".[12].fastq.gz
56
55
 
57
56
  # Finalize
58
57
  miga date > "$DATASET.done"
59
58
  miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f
59
+