miga-base 1.2.17.1 → 1.2.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. checksums.yaml +4 -4
  2. data/lib/miga/remote_dataset/download.rb +1 -1
  3. data/lib/miga/remote_dataset.rb +9 -4
  4. data/lib/miga/version.rb +2 -2
  5. data/utils/enveomics/Manifest/Tasks/mapping.json +39 -11
  6. data/utils/enveomics/Manifest/Tasks/remote.json +2 -1
  7. data/utils/enveomics/Scripts/BedGraph.tad.rb +98 -53
  8. data/utils/enveomics/Scripts/SRA.download.bash +14 -2
  9. data/utils/enveomics/Tests/low-cov.bg.gz +0 -0
  10. data/utils/enveomics/enveomics.R/DESCRIPTION +5 -5
  11. data/utils/enveomics/enveomics.R/R/autoprune.R +99 -87
  12. data/utils/enveomics/enveomics.R/R/barplot.R +116 -97
  13. data/utils/enveomics/enveomics.R/R/cliopts.R +65 -59
  14. data/utils/enveomics/enveomics.R/R/df2dist.R +96 -58
  15. data/utils/enveomics/enveomics.R/R/growthcurve.R +166 -148
  16. data/utils/enveomics/enveomics.R/R/recplot.R +201 -136
  17. data/utils/enveomics/enveomics.R/R/recplot2.R +371 -304
  18. data/utils/enveomics/enveomics.R/R/tribs.R +318 -263
  19. data/utils/enveomics/enveomics.R/R/utils.R +30 -20
  20. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +4 -3
  21. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +2 -2
  22. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +3 -3
  23. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +7 -4
  24. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +7 -4
  25. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +4 -0
  26. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +25 -17
  27. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +10 -0
  28. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +8 -2
  29. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +14 -0
  30. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +20 -1
  31. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +2 -3
  32. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +5 -2
  33. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +50 -42
  34. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +5 -2
  35. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +3 -0
  36. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +3 -0
  37. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +3 -0
  38. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +3 -0
  39. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +9 -4
  40. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +3 -0
  41. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +3 -3
  42. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +0 -2
  43. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +4 -0
  44. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +5 -0
  45. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +11 -7
  46. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +5 -1
  47. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +3 -0
  48. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +2 -2
  49. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +3 -3
  50. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +2 -2
  51. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +3 -0
  52. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +3 -0
  53. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +6 -3
  54. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +2 -2
  55. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +3 -0
  56. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +3 -0
  57. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +3 -0
  58. metadata +3 -37
  59. data/utils/enveomics/Pipelines/assembly.pbs/CONFIG.mock.bash +0 -69
  60. data/utils/enveomics/Pipelines/assembly.pbs/FastA.N50.pl +0 -1
  61. data/utils/enveomics/Pipelines/assembly.pbs/FastA.filterN.pl +0 -1
  62. data/utils/enveomics/Pipelines/assembly.pbs/FastA.length.pl +0 -1
  63. data/utils/enveomics/Pipelines/assembly.pbs/README.md +0 -189
  64. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-2.bash +0 -112
  65. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-3.bash +0 -23
  66. data/utils/enveomics/Pipelines/assembly.pbs/RUNME-4.bash +0 -44
  67. data/utils/enveomics/Pipelines/assembly.pbs/RUNME.bash +0 -50
  68. data/utils/enveomics/Pipelines/assembly.pbs/kSelector.R +0 -37
  69. data/utils/enveomics/Pipelines/assembly.pbs/newbler.pbs +0 -68
  70. data/utils/enveomics/Pipelines/assembly.pbs/newbler_preparator.pl +0 -49
  71. data/utils/enveomics/Pipelines/assembly.pbs/soap.pbs +0 -80
  72. data/utils/enveomics/Pipelines/assembly.pbs/stats.pbs +0 -57
  73. data/utils/enveomics/Pipelines/assembly.pbs/velvet.pbs +0 -63
  74. data/utils/enveomics/Pipelines/blast.pbs/01.pbs.bash +0 -38
  75. data/utils/enveomics/Pipelines/blast.pbs/02.pbs.bash +0 -73
  76. data/utils/enveomics/Pipelines/blast.pbs/03.pbs.bash +0 -21
  77. data/utils/enveomics/Pipelines/blast.pbs/BlastTab.recover_job.pl +0 -72
  78. data/utils/enveomics/Pipelines/blast.pbs/CONFIG.mock.bash +0 -98
  79. data/utils/enveomics/Pipelines/blast.pbs/FastA.split.pl +0 -1
  80. data/utils/enveomics/Pipelines/blast.pbs/README.md +0 -127
  81. data/utils/enveomics/Pipelines/blast.pbs/RUNME.bash +0 -109
  82. data/utils/enveomics/Pipelines/blast.pbs/TASK.check.bash +0 -128
  83. data/utils/enveomics/Pipelines/blast.pbs/TASK.dry.bash +0 -16
  84. data/utils/enveomics/Pipelines/blast.pbs/TASK.eo.bash +0 -22
  85. data/utils/enveomics/Pipelines/blast.pbs/TASK.pause.bash +0 -26
  86. data/utils/enveomics/Pipelines/blast.pbs/TASK.run.bash +0 -89
  87. data/utils/enveomics/Pipelines/blast.pbs/sentinel.pbs.bash +0 -29
  88. data/utils/enveomics/Pipelines/idba.pbs/README.md +0 -49
  89. data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash +0 -95
  90. data/utils/enveomics/Pipelines/idba.pbs/run.pbs +0 -56
  91. data/utils/enveomics/Pipelines/trim.pbs/README.md +0 -54
  92. data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash +0 -70
  93. data/utils/enveomics/Pipelines/trim.pbs/run.pbs +0 -130
@@ -1,49 +0,0 @@
1
- @author: Luis Miguel Rodriguez-R <lmrodriguezr at gmail dot com>
2
-
3
- @update: Feb-26-2015
4
-
5
- @license: artistic 2.0
6
-
7
- @status: auto
8
-
9
- @pbs: yes
10
-
11
- # IMPORTANT
12
-
13
- This pipeline was developed for the [PACE cluster](http://pace.gatech.edu/). You
14
- are free to use it in other platforms with adequate adjustments.
15
-
16
- # PURPOSE
17
-
18
- Performs assembly using IDBA-UD, designed for Single-Cell Genomics and Metagenomics.
19
-
20
- # HELP
21
-
22
- 1. Files preparation:
23
-
24
- 1.1. Obtain the enveomics package in the cluster. You can use:
25
- `git clone https://github.com/lmrodriguezr/enveomics.git`
26
-
27
- 1.2. Prepare the trimmed reads (e.g., use trim.bs) in interposed FastA format. Files
28
- must be raw, not zipped or packaged. Filenames must conform the format:
29
- <name>.CoupledReads.fa, where <name> is the name of the sample. Locate all the
30
- files within a folder named 04.trimmed_fasta, within your project folder. If you
31
- used trim.pbs, no further action is necessary.
32
-
33
- 2. Pipeline execution:
34
-
35
- 2.1. Simply execute `./RUNME.bash <dir> <data_type>`, where `<dir>` is the folder containing
36
- the 04.trimmed_fasta folder, and `<data_type>` is a supported type of data (see help
37
- message running `./RUNME.bash` without arguments).
38
-
39
- 3. What to expect:
40
-
41
- By the end of the run, you should find the folder *05.assembly*, including the following
42
- files for each dataset:
43
-
44
- 3.1. `<dataset>`: The IDBA output folder.
45
-
46
- 3.2. `<dataset>.AllContigs.fna`: All contigs longer than 200bp in FastA format.
47
-
48
- 3.2. `<dataset>.LargeContigs.fna`: Contigs longer than 500bp in FastA format.
49
-
@@ -1,95 +0,0 @@
1
- #!/bin/bash
2
-
3
- if [[ "$1" == "" || "$1" == "-h" || "$2" == "" ]] ; then
4
- echo "
5
- Usage: ./RUNME.bash folder data_type [max_jobs]
6
-
7
- folder Path to the folder containing the 04.trimmed_fasta folder. The
8
- trimmed reads must be in interposed FastA format, and filenames
9
- must follow the format: <name>.CoupledReads.fa, where <name> is
10
- the name of the sample. If non-paired, the filenames must follow
11
- the format: <name>.SingleReads.fa. If both suffixes are found
12
- for the same <name> prefix, they are both used.
13
- data_type Type of datasets in the project. One of: mg (for metagenomes),
14
- scg (for single-cell genomes), g (for traditional genomes), or t
15
- (for transcriptomes).
16
- max_jobs (optional) Maximum number of jobs to run in parallel. This
17
- number can be increased, but bear in mind that this process is
18
- highly I/O-intensive, and likely to crash or significantly slow
19
- down the hard drive if many jobs are running simultaneously. By
20
- default: 5.
21
- " >&2
22
- exit 1
23
- fi
24
- TYPE=$2
25
- if [[ "$TYPE" != "g" && "$TYPE" != "mg" && "$TYPE" != "scg" \
26
- && "$TYPE" != "t" ]] ; then
27
- echo "Unsupported data type: $TYPE." >&2
28
- exit 1
29
- fi
30
- if [[ "$3" == "" ]] ; then
31
- MAX=5
32
- else
33
- let MAX=$3+0
34
- fi
35
-
36
- dir=$(readlink -f $1)
37
- pac=$(dirname $(readlink -f $0))
38
- cwd=$(pwd)
39
-
40
- cd $dir
41
- if [[ ! -e 04.trimmed_fasta ]] ; then
42
- echo "Cannot locate the 04.trimmed_fasta directory, aborting..." >&2
43
- exit 1
44
- fi
45
- for i in 05.assembly ; do
46
- [[ -d $i ]] || mkdir $i
47
- done
48
-
49
- k=0
50
- for i in $dir/04.trimmed_fasta/*.SingleReads.fa ; do
51
- b=$(basename $i .SingleReads.fa)
52
- touch $dir/04.trimmed_fasta/$b.CoupledReads.fa
53
- done
54
-
55
- for i in $dir/04.trimmed_fasta/*.CoupledReads.fa ; do
56
- b=$(basename $i .CoupledReads.fa)
57
- [[ -d $dir/05.assembly/$b ]] && continue
58
- EXTRA=""
59
- EXTRA_MSG=""
60
- if [[ $k -ge $MAX ]] ; then
61
- let prek=$k-$MAX
62
- EXTRA="-W depend=afterany:${jids[$prek]}"
63
- EXTRA_MSG=" (waiting for ${jids[$prek]})"
64
- fi
65
-
66
- # Predict time (in hours)
67
- SIZE_M=$(($(ls -pl 04.trimmed_fasta/$b.CoupledReads.fa \
68
- | awk '{print $5}')/1000000))
69
- let TIME_H=6+$SIZE_M*2/1000
70
- let RAM_G=20+$SIZE_M*20/1000
71
-
72
- # Find the right queue
73
- if [[ $TIME_H -lt 12 ]] ; then
74
- QUEUE="-q iw-shared-6 -l walltime=12:00:00"
75
- elif [[ $TIME_H -lt 120 ]] ; then
76
- QUEUE="-q microcluster -l walltime=120:00:00"
77
- else
78
- QUEUE="-q microcluster -l walltime=2000:00:00"
79
- fi
80
-
81
- # Launch job
82
- mkdir $dir/05.assembly/$b
83
- OPTS="SAMPLE=$b,FOLDER=$dir,TYPE=$TYPE"
84
- if [[ -s $dir/04.trimmed_fasta/$b.SingleReads.fa ]] ; then
85
- OPTS="$OPTS,FA=$dir/04.trimmed_fasta/$b.SingleReads.fa"
86
- [[ -s $dir/04.trimmed_fasta/$b.CoupledReads.fa ]] \
87
- && OPTS="$OPTS,FA_RL2=$dir/04.trimmed_fasta/$b.CoupledReads.fa"
88
- else
89
- OPTS="$OPTS,FA=$dir/04.trimmed_fasta/$b.CoupledReads.fa"
90
- fi
91
- jids[$k]=$(qsub -v "$OPTS" -N "IDBA-$b" -l "mem=${RAM_G}g" \
92
- $QUEUE $EXTRA $pac/run.pbs | grep .)
93
- echo "$b: ${jids[$k]}$EXTRA_MSG"
94
- let k=$k+1
95
- done
@@ -1,56 +0,0 @@
1
- #!/bin/bash
2
- #PBS -l nodes=1:ppn=10
3
- #PBS -k eo
4
-
5
- module load idba/1.1.1
6
-
7
- b=$SAMPLE
8
- shared=/nv/gpfs-gateway-pace1/project/bio-konstantinidis/shared3
9
- enve=$shared/apps/enveomics/Scripts
10
- THR=10
11
-
12
- #---------------------------------------------------------
13
-
14
- echo "==[ 05.assembly: $(date) ]"
15
- cd $FOLDER/05.assembly
16
-
17
- CMD=""
18
- case "$TYPE" in
19
- *g)
20
- CMD="idba_ud" ;;
21
- t)
22
- CMD="idba_tran" ;;
23
- *)
24
- echo "Unsupported data type: $TYPE" >&2
25
- exit 1
26
- ;;
27
- esac
28
- CMD="$CMD --pre_correction -r $FA -o $SAMPLE --num_threads $THR"
29
- [[ -n "$FA_RL2" ]] && CMD="$CMD --read_level_2 $FA_RL2"
30
- [[ -n "$FA_RL3" ]] && CMD="$CMD --read_level_3 $FA_RL3"
31
- [[ -n "$FA_RL4" ]] && CMD="$CMD --read_level_4 $FA_RL4"
32
- [[ -n "$FA_RL5" ]] && CMD="$CMD --read_level_5 $FA_RL5"
33
-
34
- time $CMD
35
-
36
- rm $SAMPLE/kmer
37
- rm $SAMPLE/graph-*.fa
38
- rm $SAMPLE/align-*
39
- rm $SAMPLE/local-contig-*.fa
40
- rm $SAMPLE/contig-*.fa
41
-
42
- if [[ -s $SAMPLE/scaffold.fa ]] ; then
43
- ln -s $SAMPLE/scaffold.fa $SAMPLE.AllContigs.fna
44
- else
45
- ln -s $SAMPLE/contig.fa $SAMPLE.AllContigs.fna
46
- fi
47
- time $enve/FastA.length.pl $SAMPLE.AllContigs.fna | awk '$2>=500{print $1}' \
48
- > $SAMPLE.LargeContigs.ids
49
- time $enve/FastA.filter.pl $SAMPLE.LargeContigs.ids $SAMPLE.AllContigs.fna \
50
- > $SAMPLE.LargeContigs.fna
51
- rm $SAMPLE.LargeContigs.ids
52
-
53
- #---------------------------------------------------------
54
-
55
- echo "Done: $(date)."
56
-
@@ -1,54 +0,0 @@
1
- @author: Luis Miguel Rodriguez-R <lmrodriguezr at gmail dot com>
2
-
3
- @update: Oct-30-2014
4
-
5
- @license: artistic 2.0
6
-
7
- @status: auto
8
-
9
- @pbs: yes
10
-
11
- # IMPORTANT
12
-
13
- This pipeline was developed for the [PACE cluster](http://pace.gatech.edu/). You
14
- are free to use it in other platforms with adequate adjustments.
15
-
16
- # PURPOSE
17
-
18
- Performs various trimming and quality-control analyses over raw reads.
19
-
20
- # HELP
21
-
22
- 1. Files preparation:
23
-
24
- 1.1. Obtain the enveomics package in the cluster. You can use:
25
- `git clone https://github.com/lmrodriguezr/enveomics.git`
26
-
27
- 1.2. Prepare the raw reads in FastQ format. Files must be raw, not zipped or packaged.
28
- Filenames must conform the format: <name>.<sis>.fastq, where <name> is the name
29
- of the sample, and <sis> is 1 or 2 indicating which sister read the file contains.
30
- Use only '1' as <sis> if you have single reads.
31
-
32
- 1.3. Gather all the FastQ files into the same folder.
33
-
34
- 2. Pipeline execution:
35
-
36
- 2.1. Simply execute `./RUNME.bash <dir>`, where <dir> is the folder containing
37
- the FastQ files.
38
-
39
- 3. What to expect:
40
-
41
- By the end of the run, you should find the following folders:
42
-
43
- 3.1. *01.raw_reads*: Gzip'ed raw FastQ files.
44
-
45
- 3.2. *02.trimmed_reads*: Trimmed and clipped reads. For each sample, there should be
46
- nine files for paired-end, and two for single-reads.
47
-
48
- 3.3. *03.read_quality*: Quality reports. For each sample, there should be two directories,
49
- one with SolexaQA++ information, another with FastQC information.
50
-
51
- 3.4. *04.trimmed_fasta*: Trimmed and clipped in FastA format (and gzip'ed, in the case of
52
- individual files for paired-end).
53
-
54
-
@@ -1,70 +0,0 @@
1
- #!/bin/bash
2
-
3
- if [[ "$1" == "" || "$1" == "-h" ]] ; then
4
- echo "
5
- Usage: ./RUNME.bash folder [clipper [max_jobs]]
6
-
7
- folder Path to the folder containing the raw reads. The raw reads must be in FastQ format,
8
- and filenames must follow the format: <name>.<sis>.fastq, where <name> is the name
9
- of the sample, and <sis> is 1 or 2 indicating which sister read the file contains.
10
- Use only '1' as <sis> if you have single reads.
11
- clipper (optional) One of: trimmomatic, scythe, or none. By default: scythe.
12
- max_jobs (optional) Maximum number of jobs to run in parallel. This number can be increased,
13
- but bear in mind that this process is highly I/O-intensive, and likely to crash or
14
- significantly slow down the hard drive if many jobs are running simultaneously. By
15
- default: 5.
16
- " >&2 ;
17
- exit 1 ;
18
- fi ;
19
- CLIPPER=$2
20
- if [[ "$CLIPPER" == "" ]] ; then
21
- CLIPPER="scythe"
22
- fi ;
23
- if [[ "$3" == "" ]] ; then
24
- MAX=5 ;
25
- else
26
- let MAX=$3+0 ;
27
- fi ;
28
-
29
- dir=$(readlink -f $1) ;
30
- pac=$(dirname $(readlink -f $0)) ;
31
- cwd=$(pwd) ;
32
-
33
- cd $dir ;
34
- for i in 01.raw_reads 02.trimmed_reads 03.read_quality 04.trimmed_fasta zz.info ; do
35
- if [[ ! -d $i ]] ; then mkdir $i ; fi ;
36
- done ;
37
-
38
- k=0 ;
39
- for i in $dir/*.1.fastq ; do
40
- EXTRA="" ;
41
- EXTRA_MSG="" ;
42
- if [[ $k -ge $MAX ]] ; then
43
- let prek=$k-$MAX ;
44
- EXTRA="-W depend=afterany:${jids[$prek]}" ;
45
- EXTRA_MSG=" (waiting for ${jids[$prek]})"
46
- fi ;
47
- b=$(basename $i .1.fastq) ;
48
- mv $b.[12].fastq 01.raw_reads/ ;
49
- # Predict time (in hours)
50
- SIZE_M=$(($(ls -pl 01.raw_reads/$b.1.fastq | awk '{print $5}')/1000000)) ;
51
- let TIME_H=$SIZE_M*5/1000 ;
52
- [[ -e 01.raw_reads/$b.2.fastq ]] || let TIME_H=$TIME_H/2 ;
53
- let RAM_G=$SIZE_M*8/1000 ;
54
- [[ $RAM_G -lt 10 ]] && RAM_G=10 ;
55
-
56
- # Find the right queue
57
- if [[ $TIME_H -lt 12 ]] ; then
58
- QUEUE="-q iw-shared-6 -l walltime=12:00:00" ;
59
- elif [[ $TIME_H -lt 120 ]] ; then
60
- QUEUE="-q microcluster -l walltime=120:00:00" ;
61
- else
62
- QUEUE="-q microcluster -l walltime=2000:00:00" ;
63
- fi ;
64
- # Launch job
65
- jids[$k]=$(qsub -v "SAMPLE=$b,FOLDER=$dir,CLIPPER=$CLIPPER" -N "Trim-$b" -l "mem=${RAM_G}g" $QUEUE $EXTRA $pac/run.pbs | grep .) ;
66
- echo "$b: ${jids[$k]}$EXTRA_MSG" ;
67
- let k=$k+1 ;
68
- done ;
69
-
70
-
@@ -1,130 +0,0 @@
1
- #!/bin/bash
2
- #PBS -l mem=10g
3
- #PBS -l nodes=1:ppn=1
4
- #PBS -k eo
5
-
6
- module load fastqc/0.11.2
7
- module load scythe/0.993
8
-
9
- shared=/gpfs/pace1/project/bio-konstantinidis/shared3
10
- b=$SAMPLE ;
11
- sqa=$shared/bin/SolexaQA++
12
- scythe=scythe
13
- enve=$shared/apps/enveomics/Scripts
14
- trim=$shared/apps/Trimmomatic-0.32/trimmomatic-0.32.jar
15
- SEadapters=$shared/apps/Trimmomatic-0.32/adapters/ALL-SE_PE.fa
16
- PEadapters=$shared/apps/Trimmomatic-0.32/adapters/ALL-PE.fa
17
-
18
- #---------------------------------------------------------
19
-
20
- echo "==[ 02.trimmed_reads: $(date) ]" ;
21
- cd $FOLDER/02.trimmed_reads ;
22
-
23
- time $enve/FastQ.tag.rb -i ../01.raw_reads/$b.1.fastq -p "$b-" -s "/1" -o $b.1.fastq ;
24
- [[ -e ../01.raw_reads/$b.2.fastq ]] && time $enve/FastQ.tag.rb -i ../01.raw_reads/$b.2.fastq -p "$b-" -s "/2" -o $b.2.fastq ;
25
-
26
- RAW_READS=$(cat $b.1.fastq | paste - - - - | wc -l | sed -e 's/ *//') ;
27
- RAW_LENGTH=$(head -n 40000 $b.1.fastq | paste - - - - | awk 'BEGIN{FS="\\t"}{SUM+=length($2)}END{print SUM/NR}') ;
28
-
29
- time $sqa dynamictrim $b.[12].fastq -h 20 -d . ;
30
- time $sqa lengthsort $b.[12].fastq.trimmed -l 50 -d . ;
31
-
32
- if [[ "$CLIPPER" == "trimmomatic" ]] ; then
33
- if [[ -e $b.2.fastq.trimmed.paired ]] ; then
34
- time java -jar $trim PE -threads 1 \
35
- $b.1.fastq.trimmed.paired \
36
- $b.2.fastq.trimmed.paired \
37
- $b.1.clipped.fastq $b.1.clipped.single.fastq \
38
- $b.2.clipped.fastq $b.2.clipped.single.fastq \
39
- ILLUMINACLIP:$PEadapters:2:30:10 MINLEN:50
40
- else
41
- time java -jar $trim SE -threads 1 \
42
- $b.1.fastq.trimmed.single $b.1.clipped.fastq \
43
- ILLUMINACLIP:$SEadapters:2:30:10 MINLEN:50
44
- fi ;
45
- elif [[ "$CLIPPER" == "scythe" ]]; then
46
- if [[ -e $b.2.fastq.trimmed.paired ]] ; then
47
- $scythe -a $PEadapters $b.1.fastq.trimmed.paired > $b.1.clipped.all.fastq ;
48
- $scythe -a $PEadapters $b.2.fastq.trimmed.paired > $b.2.clipped.all.fastq ;
49
- time $sqa lengthsort $b.[12].clipped.all.fastq -l 50 -d . ;
50
- rm $b.[12].clipped.all.fastq ;
51
- [[ -e $b.1.clipped.all.fastq.single ]] && mv $b.1.clipped.all.fastq.single $b.1.clipped.single.fastq ;
52
- [[ -e $b.2.clipped.all.fastq.single ]] && mv $b.2.clipped.all.fastq.single $b.2.clipped.single.fastq ;
53
- mv $b.1.clipped.all.fastq.paired $b.1.clipped.fastq ;
54
- mv $b.2.clipped.all.fastq.paired $b.2.clipped.fastq ;
55
- rm $b.1.clipped.all.fastq.summary.txt $b.1.clipped.all.fastq.summary.txt.pdf &>/dev/null ;
56
- else
57
- $scythe -a $PEadapters $b.1.fastq.trimmed.single > $b.1.clipped.all.fastq ;
58
- time $sqa lengthsort $b.1.clipped.all.fastq -l 50 -d . ;
59
- rm $b.1.clipped.all.fastq ;
60
- mv $b.1.clipped.all.fastq.single $b.1.clipped.fastq ;
61
- fi ;
62
- rm $b.[12].*.discard &>/dev/null ;
63
- else
64
- if [[ -e $b.2.fastq.trimmed.paired ]] ; then
65
- ln -s $b.1.fastq.trimmed.paired $b.1.clipped.fastq ;
66
- ln -s $b.2.fastq.trimmed.paired $b.2.clipped.fastq ;
67
- else
68
- ln -s $b.1.fastq.trimmed.single $b.1.clipped.fastq ;
69
- fi ;
70
- fi ;
71
-
72
- TRIMMED_READS=$(cat $b.1.clipped.fastq | paste - - - - | wc -l | sed -e 's/ *//') ;
73
- TRIMMED_LENGTH=$(head -n 40000 $b.1.clipped.fastq | paste - - - - | awk 'BEGIN{FS="\\t"}{SUM+=length($2)}END{print SUM/NR}') ;
74
-
75
- #---------------------------------------------------------
76
-
77
- echo "==[ 03.read_quality: $(date) ]" ;
78
- cd $FOLDER/03.read_quality ;
79
- if [ ! -d $b.fastqc ] ; then mkdir $b.fastqc ; fi ;
80
- perl $(which fastqc) ../02.trimmed_reads/$b.[12].clipped.fastq -o $b.fastqc ;
81
-
82
- if [ ! -d $b ] ; then mkdir $b ; fi ;
83
- time $sqa analysis ../01.raw_reads/$b.[12].fastq -h 20 -d $b -v -m ;
84
- rm $b/*.segments ;
85
- mv ../02.trimmed_reads/$b.[12].fastq_trimmed.segments* $b/
86
- mv ../02.trimmed_reads/$b.[12].fastq.trimmed.summary.txt* $b/
87
-
88
-
89
- cd $FOLDER/02.trimmed_reads ;
90
- rm $b.[12].fastq.trimmed.discard ;
91
- rm $b.[12].fastq.trimmed ;
92
- rm $b.[12].fastq ;
93
-
94
- #---------------------------------------------------------
95
-
96
- echo "==[ 04.trimmed_fasta: $(date) ]" ;
97
- cd $FOLDER/04.trimmed_fasta ;
98
- cat ../02.trimmed_reads/$b.1.clipped.fastq | paste - - - - | awk 'BEGIN{FS="\\t"}{print ">"substr($1,2)"\\n"$2}' > $b.1.fasta ;
99
- if [[ -e ../02.trimmed_reads/$b.2.clipped.fastq ]] ; then
100
- cat ../02.trimmed_reads/$b.2.clipped.fastq | paste - - - - | awk 'BEGIN{FS="\\t"}{print ">"substr($1,2)"\\n"$2}' > $b.2.fasta ;
101
- time $enve/FastA.interpose.pl $b.CoupledReads.fa $b.[12].fasta ;
102
- time gzip $b.2.fasta ;
103
- time gzip $b.1.fasta ;
104
- else
105
- mv $b.1.fasta $b.SingleReads.fa ;
106
- fi ;
107
-
108
- #---------------------------------------------------------
109
-
110
- echo "==[ zz.info: $(date) ]" ;
111
- cd $FOLDER/zz.info ;
112
- echo "
113
- RAW_LENGTH: $RAW_LENGTH
114
- RAW_READS: $RAW_READS
115
- TRIMMED_LENGTH: $TRIMMED_LENGTH
116
- TRIMMED_READS: $TRIMMED_READS
117
- " > $b.summary.txt ;
118
-
119
- #---------------------------------------------------------
120
-
121
- echo "==[ 01.raw_reads: $(date) ]"
122
- cd $FOLDER/01.raw_reads ;
123
- for i in $b.[12].fastq ; do
124
- time gzip $i ;
125
- done ;
126
-
127
- #---------------------------------------------------------
128
-
129
- echo "Done: $(date)." ;
130
-