RubyGems - miga-base - Versions diffs - 1.2.17.1 → 1.2.17.3 - Mend

miga-base 1.2.17.1 → 1.2.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

data/utils/enveomics/Pipelines/idba.pbs/README.md DELETED Viewed

@@ -1,49 +0,0 @@
-@author: Luis Miguel Rodriguez-R <lmrodriguezr at gmail dot com>
-@update: Feb-26-2015
-@license: artistic 2.0
-@status: auto
-@pbs: yes
-# IMPORTANT
-This pipeline was developed for the [PACE cluster](http://pace.gatech.edu/).  You
-are free to use it in other platforms with adequate adjustments.
-# PURPOSE
-Performs assembly using IDBA-UD, designed for Single-Cell Genomics and Metagenomics.
-# HELP
-1. Files preparation:
-   1.1. Obtain the enveomics package in the cluster. You can use:
-      `git clone https://github.com/lmrodriguezr/enveomics.git`
-   1.2. Prepare the trimmed reads (e.g., use trim.pbs) in interposed FastA format. Files
-      must be raw, not zipped or packaged. Filenames must conform to the format:
-      <name>.CoupledReads.fa, where <name> is the name of the sample. Place all the
-      files within a folder named 04.trimmed_fasta, within your project folder. If you
-      used trim.pbs, no further action is necessary.
-2. Pipeline execution:
-   2.1. Simply execute `./RUNME.bash <dir> <data_type>`, where `<dir>` is the folder containing
-      the 04.trimmed_fasta folder, and `<data_type>` is a supported type of data (see help
-      message running `./RUNME.bash` without arguments).
-3. What to expect:
-   By the end of the run, you should find the folder *05.assembly*, including the following
-   files for each dataset:
-   3.1. `<dataset>`: The IDBA output folder.
-   3.2. `<dataset>.AllContigs.fna`: All contigs longer than 200bp in FastA format.
-   3.3. `<dataset>.LargeContigs.fna`: Contigs of 500bp or longer in FastA format.

data/utils/enveomics/Pipelines/idba.pbs/RUNME.bash DELETED Viewed

@@ -1,95 +0,0 @@
-#!/bin/bash
-if [[ "$1" == "" || "$1" == "-h" || "$2" == "" ]] ; then
-   echo "
-   Usage: ./RUNME.bash folder data_type [max_jobs]
-   folder	Path to the folder containing the 04.trimmed_fasta folder. The
-		trimmed reads must be in interposed FastA format, and filenames
-		must follow the format: <name>.CoupledReads.fa, where <name> is
-		the name of the sample. If non-paired, the filenames must follow
-		the format: <name>.SingleReads.fa. If both suffixes are found
-		for the same <name> prefix, they are both used.
-   data_type	Type of datasets in the project. One of: mg (for metagenomes),
-		scg (for single-cell genomes), g (for traditional genomes), or t
-		(for transcriptomes).
-   max_jobs	(optional) Maximum number of jobs to run in parallel. This
-		number can be increased, but bear in mind that this process is
-		highly I/O-intensive, and likely to crash or significantly slow
-		down the hard drive if many jobs are running simultaneously. By
-		default: 5.
-   " >&2
-   exit 1
-fi
-TYPE=$2
-if [[ "$TYPE" != "g" && "$TYPE" != "mg" && "$TYPE" != "scg" \
-		     && "$TYPE" != "t" ]] ; then
-   echo "Unsupported data type: $TYPE." >&2
-   exit 1
-fi
-if [[ "$3" == "" ]] ; then
-   MAX=5
-else
-   let MAX=$3+0
-fi
-dir=$(readlink -f $1)
-pac=$(dirname $(readlink -f $0))
-cwd=$(pwd)
-cd $dir
-if [[ ! -e 04.trimmed_fasta ]] ; then
-   echo "Cannot locate the 04.trimmed_fasta directory, aborting..." >&2
-   exit 1
-fi
-for i in 05.assembly ; do
-   [[ -d $i ]] || mkdir $i
-done
-k=0
-for i in $dir/04.trimmed_fasta/*.SingleReads.fa ; do
-   b=$(basename $i .SingleReads.fa)
-   touch $dir/04.trimmed_fasta/$b.CoupledReads.fa
-done
-for i in $dir/04.trimmed_fasta/*.CoupledReads.fa ; do
-   b=$(basename $i .CoupledReads.fa)
-   [[ -d $dir/05.assembly/$b ]] && continue
-   EXTRA=""
-   EXTRA_MSG=""
-   if [[ $k -ge $MAX ]] ; then
-      let prek=$k-$MAX
-      EXTRA="-W depend=afterany:${jids[$prek]}"
-      EXTRA_MSG=" (waiting for ${jids[$prek]})"
-   fi
-   # Predict time (in hours)
-   SIZE_M=$(($(ls -pl 04.trimmed_fasta/$b.CoupledReads.fa \
-	       | awk '{print $5}')/1000000))
-   let TIME_H=6+$SIZE_M*2/1000
-   let RAM_G=20+$SIZE_M*20/1000
-   # Find the right queue
-   if [[ $TIME_H -lt 12 ]] ; then
-      QUEUE="-q iw-shared-6 -l walltime=12:00:00"
-   elif [[ $TIME_H -lt 120 ]] ; then
-      QUEUE="-q microcluster -l walltime=120:00:00"
-   else
-      QUEUE="-q microcluster -l walltime=2000:00:00"
-   fi
-   # Launch job
-   mkdir $dir/05.assembly/$b
-   OPTS="SAMPLE=$b,FOLDER=$dir,TYPE=$TYPE"
-   if [[ -s $dir/04.trimmed_fasta/$b.SingleReads.fa ]] ; then
-      OPTS="$OPTS,FA=$dir/04.trimmed_fasta/$b.SingleReads.fa"
-      [[ -s $dir/04.trimmed_fasta/$b.CoupledReads.fa ]] \
-	 && OPTS="$OPTS,FA_RL2=$dir/04.trimmed_fasta/$b.CoupledReads.fa"
-   else
-      OPTS="$OPTS,FA=$dir/04.trimmed_fasta/$b.CoupledReads.fa"
-   fi
-   jids[$k]=$(qsub -v "$OPTS" -N "IDBA-$b" -l "mem=${RAM_G}g" \
-	       $QUEUE $EXTRA $pac/run.pbs | grep .)
-   echo "$b: ${jids[$k]}$EXTRA_MSG"
-   let k=$k+1
-done

data/utils/enveomics/Pipelines/idba.pbs/run.pbs DELETED Viewed

@@ -1,56 +0,0 @@
-#!/bin/bash
-#PBS -l nodes=1:ppn=10
-#PBS -k eo
-module load idba/1.1.1
-b=$SAMPLE
-shared=/nv/gpfs-gateway-pace1/project/bio-konstantinidis/shared3
-enve=$shared/apps/enveomics/Scripts
-THR=10
-#---------------------------------------------------------
-echo "==[ 05.assembly: $(date) ]"
-cd $FOLDER/05.assembly
-CMD=""
-case "$TYPE" in
-*g)
-   CMD="idba_ud" ;;
-t)
-   CMD="idba_tran" ;;
-*)
-   echo "Unsupported data type: $TYPE" >&2
-   exit 1
-   ;;
-esac
-CMD="$CMD --pre_correction -r $FA -o $SAMPLE --num_threads $THR"
-[[ -n "$FA_RL2" ]] && CMD="$CMD --read_level_2 $FA_RL2"
-[[ -n "$FA_RL3" ]] && CMD="$CMD --read_level_3 $FA_RL3"
-[[ -n "$FA_RL4" ]] && CMD="$CMD --read_level_4 $FA_RL4"
-[[ -n "$FA_RL5" ]] && CMD="$CMD --read_level_5 $FA_RL5"
-time $CMD
-rm $SAMPLE/kmer
-rm $SAMPLE/graph-*.fa
-rm $SAMPLE/align-*
-rm $SAMPLE/local-contig-*.fa
-rm $SAMPLE/contig-*.fa
-if [[ -s $SAMPLE/scaffold.fa ]] ; then
-   ln -s $SAMPLE/scaffold.fa $SAMPLE.AllContigs.fna
-else
-   ln -s $SAMPLE/contig.fa $SAMPLE.AllContigs.fna
-fi
-time $enve/FastA.length.pl $SAMPLE.AllContigs.fna | awk '$2>=500{print $1}' \
-   > $SAMPLE.LargeContigs.ids
-time $enve/FastA.filter.pl $SAMPLE.LargeContigs.ids $SAMPLE.AllContigs.fna \
-   > $SAMPLE.LargeContigs.fna
-rm $SAMPLE.LargeContigs.ids
-#---------------------------------------------------------
-echo "Done: $(date)."

data/utils/enveomics/Pipelines/trim.pbs/README.md DELETED Viewed

@@ -1,54 +0,0 @@
-@author: Luis Miguel Rodriguez-R <lmrodriguezr at gmail dot com>
-@update: Oct-30-2014
-@license: artistic 2.0
-@status: auto
-@pbs: yes
-# IMPORTANT
-This pipeline was developed for the [PACE cluster](http://pace.gatech.edu/).  You
-are free to use it in other platforms with adequate adjustments.
-# PURPOSE
-Performs various trimming and quality-control analyses over raw reads.
-# HELP
-1. Files preparation:
-   1.1. Obtain the enveomics package in the cluster. You can use:
-      `git clone https://github.com/lmrodriguezr/enveomics.git`
-   1.2. Prepare the raw reads in FastQ format. Files must be raw, not zipped or packaged.
-      Filenames must conform to the format: <name>.<sis>.fastq, where <name> is the name
-      of the sample, and <sis> is 1 or 2, indicating which sister read the file contains.
-      Use only '1' as <sis> if you have single reads.
-   1.3. Gather all the FastQ files into the same folder.
-2. Pipeline execution:
-   2.1. Simply execute `./RUNME.bash <dir>`, where <dir> is the folder containing
-      the FastQ files.
-3. What to expect:
-   By the end of the run, you should find the following folders:
-   3.1. *01.raw_reads*: Gzip'ed raw FastQ files.
-   3.2. *02.trimmed_reads*: Trimmed and clipped reads. For each sample, there should be
-      nine files for paired-end, and two for single-reads.
-   3.3. *03.read_quality*: Quality reports. For each sample, there should be two directories,
-      one with SolexaQA++ information, another with FastQC information.
-   3.4. *04.trimmed_fasta*: Trimmed and clipped reads in FastA format (and gzip'ed, in the
-      case of individual files for paired-end).

data/utils/enveomics/Pipelines/trim.pbs/RUNME.bash DELETED Viewed

@@ -1,70 +0,0 @@
-#!/bin/bash
-if [[ "$1" == "" || "$1" == "-h" ]] ; then
-   echo "
-   Usage: ./RUNME.bash folder [clipper [max_jobs]]
-   folder	Path to the folder containing the raw reads. The raw reads must be in FastQ format,
-   		and filenames must follow the format: <name>.<sis>.fastq, where <name> is the name
-		of the sample, and <sis> is 1 or 2 indicating which sister read the file contains.
-		Use only '1' as <sis> if you have single reads.
-   clipper	(optional) One of: trimmomatic, scythe, or none. By default: scythe.
-   max_jobs	(optional) Maximum number of jobs to run in parallel. This number can be increased,
-   		but bear in mind that this process is highly I/O-intensive, and likely to crash or
-		significantly slow down the hard drive if many jobs are running simultaneously. By
-		default: 5.
-   " >&2 ;
-   exit 1 ;
-fi ;
-CLIPPER=$2
-if [[ "$CLIPPER" == "" ]] ; then
-   CLIPPER="scythe"
-fi ;
-if [[ "$3" == "" ]] ; then
-   MAX=5 ;
-else
-   let MAX=$3+0 ;
-fi ;
-dir=$(readlink -f $1) ;
-pac=$(dirname $(readlink -f $0)) ;
-cwd=$(pwd) ;
-cd $dir ;
-for i in 01.raw_reads 02.trimmed_reads 03.read_quality 04.trimmed_fasta zz.info ; do
-   if [[ ! -d $i ]] ; then mkdir $i ; fi ;
-done ;
-k=0 ;
-for i in $dir/*.1.fastq ; do
-   EXTRA="" ;
-   EXTRA_MSG="" ;
-   if [[ $k -ge $MAX ]] ; then
-      let prek=$k-$MAX ;
-      EXTRA="-W depend=afterany:${jids[$prek]}" ;
-      EXTRA_MSG=" (waiting for ${jids[$prek]})"
-   fi ;
-   b=$(basename $i .1.fastq) ;
-   mv $b.[12].fastq 01.raw_reads/ ;
-   # Predict time (in hours)
-   SIZE_M=$(($(ls -pl 01.raw_reads/$b.1.fastq | awk '{print $5}')/1000000)) ;
-   let TIME_H=$SIZE_M*5/1000 ;
-   [[ -e 01.raw_reads/$b.2.fastq ]] || let TIME_H=$TIME_H/2 ;
-   let RAM_G=$SIZE_M*8/1000 ;
-   [[ $RAM_G -lt 10 ]] && RAM_G=10 ;
-   # Find the right queue
-   if [[ $TIME_H -lt 12 ]] ; then
-      QUEUE="-q iw-shared-6 -l walltime=12:00:00" ;
-   elif [[ $TIME_H -lt 120 ]] ; then
-      QUEUE="-q microcluster -l walltime=120:00:00" ;
-   else
-      QUEUE="-q microcluster -l walltime=2000:00:00" ;
-   fi ;
-   # Launch job
-   jids[$k]=$(qsub -v "SAMPLE=$b,FOLDER=$dir,CLIPPER=$CLIPPER" -N "Trim-$b" -l "mem=${RAM_G}g" $QUEUE $EXTRA $pac/run.pbs | grep .) ;
-   echo "$b: ${jids[$k]}$EXTRA_MSG" ;
-   let k=$k+1 ;
-done ;

data/utils/enveomics/Pipelines/trim.pbs/run.pbs DELETED Viewed

@@ -1,130 +0,0 @@
-#!/bin/bash
-#PBS -l mem=10g
-#PBS -l nodes=1:ppn=1
-#PBS -k eo
-module load fastqc/0.11.2
-module load scythe/0.993
-shared=/gpfs/pace1/project/bio-konstantinidis/shared3
-b=$SAMPLE ;
-sqa=$shared/bin/SolexaQA++
-scythe=scythe
-enve=$shared/apps/enveomics/Scripts
-trim=$shared/apps/Trimmomatic-0.32/trimmomatic-0.32.jar
-SEadapters=$shared/apps/Trimmomatic-0.32/adapters/ALL-SE_PE.fa
-PEadapters=$shared/apps/Trimmomatic-0.32/adapters/ALL-PE.fa
-#---------------------------------------------------------
-echo "==[ 02.trimmed_reads: $(date) ]" ;
-cd $FOLDER/02.trimmed_reads ;
-time $enve/FastQ.tag.rb -i ../01.raw_reads/$b.1.fastq -p "$b-" -s "/1" -o $b.1.fastq ;
-[[ -e ../01.raw_reads/$b.2.fastq ]] && time $enve/FastQ.tag.rb -i ../01.raw_reads/$b.2.fastq -p "$b-" -s "/2" -o $b.2.fastq ;
-RAW_READS=$(cat $b.1.fastq | paste - - - - | wc -l | sed -e 's/ *//') ;
-RAW_LENGTH=$(head -n 40000 $b.1.fastq | paste - - - - | awk 'BEGIN{FS="\\t"}{SUM+=length($2)}END{print SUM/NR}') ;
-time $sqa dynamictrim $b.[12].fastq -h 20 -d . ;
-time $sqa lengthsort $b.[12].fastq.trimmed -l 50 -d . ;
-if [[ "$CLIPPER" == "trimmomatic" ]] ; then
-   if [[ -e $b.2.fastq.trimmed.paired ]] ; then
-      time java -jar $trim PE -threads 1 \
-	 $b.1.fastq.trimmed.paired \
-	 $b.2.fastq.trimmed.paired \
-	 $b.1.clipped.fastq $b.1.clipped.single.fastq \
-	 $b.2.clipped.fastq $b.2.clipped.single.fastq \
-	 ILLUMINACLIP:$PEadapters:2:30:10 MINLEN:50
-   else
-      time java -jar $trim SE -threads 1 \
-	 $b.1.fastq.trimmed.single $b.1.clipped.fastq \
-	 ILLUMINACLIP:$SEadapters:2:30:10 MINLEN:50
-   fi ;
-elif [[ "$CLIPPER" == "scythe" ]]; then
-   if [[ -e $b.2.fastq.trimmed.paired ]] ; then
-      $scythe -a $PEadapters $b.1.fastq.trimmed.paired > $b.1.clipped.all.fastq ;
-      $scythe -a $PEadapters $b.2.fastq.trimmed.paired > $b.2.clipped.all.fastq ;
-      time $sqa lengthsort $b.[12].clipped.all.fastq -l 50 -d . ;
-      rm $b.[12].clipped.all.fastq ;
-      [[ -e $b.1.clipped.all.fastq.single ]] && mv $b.1.clipped.all.fastq.single $b.1.clipped.single.fastq ;
-      [[ -e $b.2.clipped.all.fastq.single ]] && mv $b.2.clipped.all.fastq.single $b.2.clipped.single.fastq ;
-      mv $b.1.clipped.all.fastq.paired $b.1.clipped.fastq ;
-      mv $b.2.clipped.all.fastq.paired $b.2.clipped.fastq ;
-      rm $b.1.clipped.all.fastq.summary.txt $b.1.clipped.all.fastq.summary.txt.pdf &>/dev/null ;
-   else
-      $scythe -a $PEadapters $b.1.fastq.trimmed.single > $b.1.clipped.all.fastq ;
-      time $sqa lengthsort $b.1.clipped.all.fastq -l 50 -d . ;
-      rm $b.1.clipped.all.fastq ;
-      mv $b.1.clipped.all.fastq.single $b.1.clipped.fastq ;
-   fi ;
-   rm $b.[12].*.discard &>/dev/null ;
-else
-   if [[ -e $b.2.fastq.trimmed.paired ]] ; then
-      ln -s $b.1.fastq.trimmed.paired $b.1.clipped.fastq ;
-      ln -s $b.2.fastq.trimmed.paired $b.2.clipped.fastq ;
-   else
-      ln -s $b.1.fastq.trimmed.single $b.1.clipped.fastq ;
-   fi ;
-fi ;
-TRIMMED_READS=$(cat $b.1.clipped.fastq | paste - - - - | wc -l | sed -e 's/ *//') ;
-TRIMMED_LENGTH=$(head -n 40000 $b.1.clipped.fastq | paste - - - - | awk 'BEGIN{FS="\\t"}{SUM+=length($2)}END{print SUM/NR}') ;
-#---------------------------------------------------------
-echo "==[ 03.read_quality: $(date) ]" ;
-cd $FOLDER/03.read_quality ;
-if [ ! -d $b.fastqc ] ; then mkdir $b.fastqc ; fi ;
-perl $(which fastqc) ../02.trimmed_reads/$b.[12].clipped.fastq -o $b.fastqc ;
-if [ ! -d $b ] ; then mkdir $b ; fi ;
-time $sqa analysis ../01.raw_reads/$b.[12].fastq -h 20 -d $b -v -m ;
-rm $b/*.segments ;
-mv ../02.trimmed_reads/$b.[12].fastq_trimmed.segments* $b/
-mv ../02.trimmed_reads/$b.[12].fastq.trimmed.summary.txt* $b/
-cd $FOLDER/02.trimmed_reads ;
-rm $b.[12].fastq.trimmed.discard ;
-rm $b.[12].fastq.trimmed ;
-rm $b.[12].fastq ;
-#---------------------------------------------------------
-echo "==[ 04.trimmed_fasta: $(date) ]" ;
-cd $FOLDER/04.trimmed_fasta ;
-cat ../02.trimmed_reads/$b.1.clipped.fastq | paste - - - - | awk 'BEGIN{FS="\\t"}{print ">"substr($1,2)"\\n"$2}' > $b.1.fasta ;
-if [[ -e ../02.trimmed_reads/$b.2.clipped.fastq ]] ; then
-   cat ../02.trimmed_reads/$b.2.clipped.fastq | paste - - - - | awk 'BEGIN{FS="\\t"}{print ">"substr($1,2)"\\n"$2}' > $b.2.fasta ;
-   time $enve/FastA.interpose.pl $b.CoupledReads.fa $b.[12].fasta ;
-   time gzip $b.2.fasta ;
-   time gzip $b.1.fasta ;
-else
-   mv $b.1.fasta $b.SingleReads.fa ;
-fi ;
-#---------------------------------------------------------
-echo "==[  zz.info: $(date) ]" ;
-cd $FOLDER/zz.info ;
-echo "
-RAW_LENGTH:      $RAW_LENGTH
-RAW_READS:       $RAW_READS
-TRIMMED_LENGTH:  $TRIMMED_LENGTH
-TRIMMED_READS:   $TRIMMED_READS
-" > $b.summary.txt ;
-#---------------------------------------------------------
-echo "==[ 01.raw_reads: $(date) ]"
-cd $FOLDER/01.raw_reads ;
-for i in $b.[12].fastq ; do
-   time gzip $i ;
-done ;
-#---------------------------------------------------------
-echo "Done: $(date)." ;