bio-ngs 0.3.2.alpha.01
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +39 -0
- data/Gemfile.lock +81 -0
- data/LICENSE.txt +28 -0
- data/README.rdoc +240 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/biongs +35 -0
- data/bio-ngs.gemspec +215 -0
- data/ext/mkrf_conf.rb +87 -0
- data/lib/bio-ngs.rb +54 -0
- data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
- data/lib/bio/appl/ngs/blast.rb +36 -0
- data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
- data/lib/bio/appl/ngs/cufflinks.rb +489 -0
- data/lib/bio/appl/ngs/fastx.rb +170 -0
- data/lib/bio/appl/ngs/samtools.rb +118 -0
- data/lib/bio/appl/ngs/sff_extract.rb +23 -0
- data/lib/bio/appl/ngs/tophat.rb +158 -0
- data/lib/bio/ngs/converter.rb +100 -0
- data/lib/bio/ngs/core_ext.rb +12 -0
- data/lib/bio/ngs/db.rb +66 -0
- data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
- data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
- data/lib/bio/ngs/db/models.rb +1 -0
- data/lib/bio/ngs/db/models/homology.rb +8 -0
- data/lib/bio/ngs/db/models/ontology.rb +16 -0
- data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
- data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
- data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
- data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
- data/lib/bio/ngs/ext/versions.yaml +73 -0
- data/lib/bio/ngs/graphics.rb +189 -0
- data/lib/bio/ngs/homology.rb +102 -0
- data/lib/bio/ngs/ontology.rb +103 -0
- data/lib/bio/ngs/quality.rb +64 -0
- data/lib/bio/ngs/record.rb +50 -0
- data/lib/bio/ngs/task.rb +46 -0
- data/lib/bio/ngs/utils.rb +176 -0
- data/lib/development_tasks.rb +34 -0
- data/lib/enumerable.rb +37 -0
- data/lib/tasks/bwa.thor +126 -0
- data/lib/tasks/convert.thor +454 -0
- data/lib/tasks/history.thor +51 -0
- data/lib/tasks/homology.thor +121 -0
- data/lib/tasks/ontology.thor +93 -0
- data/lib/tasks/project.thor +51 -0
- data/lib/tasks/quality.thor +142 -0
- data/lib/tasks/rna.thor +126 -0
- data/lib/tasks/sff_extract.thor +9 -0
- data/lib/templates/README.tt +43 -0
- data/lib/templates/db.tt +6 -0
- data/lib/wrapper.rb +225 -0
- data/spec/converter_qseq_spec.rb +56 -0
- data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
- data/spec/quality_spec.rb +40 -0
- data/spec/sff_extract_spec.rb +98 -0
- data/spec/spec_helper.rb +55 -0
- data/spec/tophat_spec.rb +99 -0
- data/spec/utils_spec.rb +22 -0
- data/test/conf/test_db.yml +4 -0
- data/test/data/blastoutput.xml +69 -0
- data/test/data/gene-GO.json +1 -0
- data/test/data/goa_uniprot +27 -0
- data/test/data/goslim_goa.obo +1763 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-ngs.rb +17 -0
- data/test/test_db.rb +21 -0
- data/test/test_homology.rb +102 -0
- data/test/test_ngs.rb +21 -0
- data/test/test_ontology.rb +74 -0
- data/test/test_utils.rb +29 -0
- metadata +460 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
#
|
|
2
|
+
# fastx.rb - The FASTX-Toolkit is a collection of command line tools for Short-Reads FASTA/FASTQ files preprocessing.
|
|
3
|
+
#
|
|
4
|
+
# Next-Generation sequencing machines usually produce FASTA or FASTQ files, containing multiple short-reads sequences (possibly with quality information).
|
|
5
|
+
#
|
|
6
|
+
# The main processing of such FASTA/FASTQ files is mapping (aka aligning) the sequences to reference genomes or other databases using specialized programs. Example of such mapping programs are: Blat, SHRiMP, LastZ, MAQ and many many others.
|
|
7
|
+
#
|
|
8
|
+
# However,
|
|
9
|
+
# It is sometimes more productive to preprocess the FASTA/FASTQ files before mapping the sequences to the genome - manipulating the sequences to produce better mapping results.
|
|
10
|
+
#
|
|
11
|
+
# The FASTX-Toolkit tools perform some of these preprocessing tasks.
|
|
12
|
+
# http://hannonlab.cshl.edu/fastx_toolkit/
|
|
13
|
+
#
|
|
14
|
+
# Copyright:: Copyright (C) 2011
|
|
15
|
+
# Raoul Bonnal <r@bioruby.org>
|
|
16
|
+
# License:: The Ruby License
|
|
17
|
+
#
|
|
18
|
+
# + Mapped
|
|
19
|
+
# - Not Yet Mapped
|
|
20
|
+
#
|
|
21
|
+
# - fastx_artifacts_filter
|
|
22
|
+
# - fastx_collapser
|
|
23
|
+
# + fastx_quality_stats
|
|
24
|
+
# - fastx_trimmer
|
|
25
|
+
# - fastx_barcode_splitter.pl
|
|
26
|
+
# - fastx_nucleotide_distribution_graph.sh
|
|
27
|
+
# - fastx_renamer
|
|
28
|
+
# - fastx_uncollapser
|
|
29
|
+
# - fastx_clipper
|
|
30
|
+
# - fastx_nucleotide_distribution_line_graph.sh
|
|
31
|
+
# - fastx_reverse_complement
|
|
32
|
+
# + fastq_coverage_graph.sh
|
|
33
|
+
# - fastq_masker
|
|
34
|
+
# + fastq_quality_boxplot_graph.sh
|
|
35
|
+
# - fastq_quality_converter
|
|
36
|
+
# - fastq_quality_filter
|
|
37
|
+
# - fastq_quality_trimmer
|
|
38
|
+
# - fastq_to_fasta
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
module Bio
|
|
43
|
+
module Ngs
|
|
44
|
+
module Fastx
|
|
45
|
+
|
|
46
|
+
# [-h] = This helpful help screen.
|
|
47
|
+
# [-t N] = Quality threshold - nucleotides with lower
|
|
48
|
+
# quality will be trimmed (from the end of the sequence).
|
|
49
|
+
# [-l N] = Minimum length - sequences shorter than this (after trimming)
|
|
50
|
+
# will be discarded. Default = 0 = no minimum length.
|
|
51
|
+
# [-z] = Compress output with GZIP.
|
|
52
|
+
# [-i INFILE] = FASTQ input file. default is STDIN.
|
|
53
|
+
# [-o OUTFILE] = FASTQ output file. default is STDOUT.
|
|
54
|
+
# [-v] = Verbose - report number of sequences.
|
|
55
|
+
# If [-o] is specified, report will be printed to STDOUT.
|
|
56
|
+
# If [-o] is not specified (and output goes to STDOUT),
|
|
57
|
+
# report will be printed to STDERR.
|
|
58
|
+
class Trim
  # Declarative wrapper for the FASTX-Toolkit program
  # "fastq_quality_trimmer". Every option below is paired with the tool's
  # short flag through :aliases.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("fastq_quality_trimmer")
  use_aliases

  # The wrapper ships its own defaults (length 20, quality 10); the tool's
  # help text lists 0 (no minimum length) as the default for -l, so the
  # description states the value that is actually applied here.
  # Descriptions are built from adjacent single-line literals so that no
  # source newlines or indentation leak into the generated help text.
  add_option :min_size, :type => :numeric, :default => 20, :aliases => "-l",
             :desc => "Minimum length - sequences shorter than this (after trimming) " \
                      "will be discarded. Default = 20; use 0 for no minimum length."
  add_option :min_quality, :type => :numeric, :default => 10, :aliases => "-t",
             :desc => "Quality threshold - nucleotides with lower " \
                      "quality will be trimmed (from the end of the sequence)."
  add_option :output, :type => :string, :aliases => "-o", :desc => "FASTQ output file.", :collapse => true
  add_option :input, :type => :string, :aliases => "-i", :desc => "FASTQ input file.", :collapse => true
  add_option :gzip, :type => :boolean, :aliases => "-z", :desc => "Compress output with GZIP."
  add_option :verbose, :type => :boolean, :aliases => "-v",
             :desc => "Verbose - report number of sequences. " \
                      "If [-o] is specified, report will be printed to STDOUT. " \
                      "If [-o] is not specified (and output goes to STDOUT), " \
                      "report will be printed to STDERR."
  # NOTE(review): -Q is not listed in the help text quoted above this class;
  # presumably it is the quality-score ASCII offset - confirm against the tool.
  add_option :quality_type, :type => :numeric, :default => 33, :aliases => "-Q", :desc => "Quality of fastq file"
end #Trim
|
|
75
|
+
|
|
76
|
+
# Solexa-Quality BoxPlot plotter
|
|
77
|
+
# Generates a solexa quality score box-plot graph
|
|
78
|
+
#
|
|
79
|
+
# Usage: /usr/local/bin/fastq_quality_boxplot_graph.sh [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]
|
|
80
|
+
#
|
|
81
|
+
# [-p] - Generate PostScript (.PS) file. Default is PNG image.
|
|
82
|
+
# [-i INPUT.TXT] - Input file. Should be the output of "solexa_quality_statistics" program.
|
|
83
|
+
# [-o OUTPUT] - Output file name. default is STDOUT.
|
|
84
|
+
# [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph.
|
|
85
|
+
class ReadsBoxPlot
  # Declarative wrapper for the "fastq_quality_boxplot_graph.sh" script,
  # which draws a quality-score box plot.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("fastq_quality_boxplot_graph.sh")
  use_aliases
  add_option :ps, :type => :boolean, :aliases => "-p", :desc => "Generate PostScript (.PS) file. Default is PNG image."
  # The script reads a statistics text file and writes a graph, not FASTQ
  # data; the descriptions follow the script's own usage text.
  add_option :output, :type => :string, :aliases => "-o", :desc => "Output file name (PNG image, or PostScript with -p)."
  add_option :input, :type => :string, :aliases => "-i", :desc => "Input file. Should be the output of \"solexa_quality_statistics\" program."
  add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
end #ReadsBoxPlot
|
|
94
|
+
|
|
95
|
+
# Solexa-Reads coverage plotter
|
|
96
|
+
# Generates a solexa line coverage graph
|
|
97
|
+
#
|
|
98
|
+
# Usage: /usr/local/bin/fastq_coverage_graph.sh [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]
|
|
99
|
+
#
|
|
100
|
+
# [-p] - Generate PostScript (.PS) file. Default is PNG image.
|
|
101
|
+
# [-i INPUT.TXT] - Input file. Should be the output of "solexa_quality_statistics" program.
|
|
102
|
+
# [-o OUTPUT] - Output file name. default is STDOUT.
|
|
103
|
+
# [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph.
|
|
104
|
+
class ReadsCoverage
  # Declarative wrapper for the "fastq_coverage_graph.sh" script, which
  # draws a line coverage graph.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("fastq_coverage_graph.sh")
  use_aliases
  add_option :ps, :type => :boolean, :aliases => "-p", :desc => "Generate PostScript (.PS) file. Default is PNG image."
  # The script reads a statistics text file and writes a graph, not FASTQ
  # data; the descriptions follow the script's own usage text.
  add_option :output, :type => :string, :aliases => "-o", :desc => "Output file name (PNG image, or PostScript with -p)."
  add_option :input, :type => :string, :aliases => "-i", :desc => "Input file. Should be the output of \"solexa_quality_statistics\" program."
  add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
end #ReadsCoverage
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# usage: fastx_quality_stats [-h] [-N] [-i INFILE] [-o OUTFILE]
|
|
116
|
+
# Part of FASTX Toolkit 0.0.13 by A. Gordon (gordon@cshl.edu)
|
|
117
|
+
#
|
|
118
|
+
# [-h] = This helpful help screen.
|
|
119
|
+
# [-i INFILE] = FASTQ input file. default is STDIN.
|
|
120
|
+
# [-o OUTFILE] = TEXT output file. default is STDOUT.
|
|
121
|
+
# [-N] = New output format (with more information per nucleotide/cycle).
|
|
122
|
+
#
|
|
123
|
+
# The *OLD* output TEXT file will have the following fields (one row per column):
|
|
124
|
+
# column = column number (1 to 36 for a 36-cycles read solexa file)
|
|
125
|
+
# count = number of bases found in this column.
|
|
126
|
+
# min = Lowest quality score value found in this column.
|
|
127
|
+
# max = Highest quality score value found in this column.
|
|
128
|
+
# sum = Sum of quality score values for this column.
|
|
129
|
+
# mean = Mean quality score value for this column.
|
|
130
|
+
# Q1 = 1st quartile quality score.
|
|
131
|
+
# med = Median quality score.
|
|
132
|
+
# Q3 = 3rd quartile quality score.
|
|
133
|
+
# IQR = Inter-Quartile range (Q3-Q1).
|
|
134
|
+
# lW = 'Left-Whisker' value (for boxplotting).
|
|
135
|
+
# rW = 'Right-Whisker' value (for boxplotting).
|
|
136
|
+
# A_Count = Count of 'A' nucleotides found in this column.
|
|
137
|
+
# C_Count = Count of 'C' nucleotides found in this column.
|
|
138
|
+
# G_Count = Count of 'G' nucleotides found in this column.
|
|
139
|
+
# T_Count = Count of 'T' nucleotides found in this column.
|
|
140
|
+
# N_Count = Count of 'N' nucleotides found in this column.
|
|
141
|
+
# max-count = max. number of bases (in all cycles)
|
|
142
|
+
#
|
|
143
|
+
#
|
|
144
|
+
# The *NEW* output format:
|
|
145
|
+
# cycle (previously called 'column') = cycle number
|
|
146
|
+
# max-count
|
|
147
|
+
# For each nucleotide in the cycle (ALL/A/C/G/T/N):
|
|
148
|
+
# count = number of bases found in this column.
|
|
149
|
+
# min = Lowest quality score value found in this column.
|
|
150
|
+
# max = Highest quality score value found in this column.
|
|
151
|
+
# sum = Sum of quality score values for this column.
|
|
152
|
+
# mean = Mean quality score value for this column.
|
|
153
|
+
# Q1 = 1st quartile quality score.
|
|
154
|
+
# med = Median quality score.
|
|
155
|
+
# Q3 = 3rd quartile quality score.
|
|
156
|
+
# IQR = Inter-Quartile range (Q3-Q1).
|
|
157
|
+
# lW = 'Left-Whisker' value (for boxplotting).
|
|
158
|
+
# rW = 'Right-Whisker' value (for boxplotting).
|
|
159
|
+
class FastqStats
  # Declarative wrapper for the FASTX-Toolkit program "fastx_quality_stats",
  # which reads a FASTQ file and writes a per-column statistics text table.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("fastx_quality_stats")
  use_aliases
  # The tool's output is a TEXT statistics file, not FASTQ.
  add_option :output, :type => :string, :aliases => "-o", :desc => "TEXT output file.", :collapse => true
  add_option :input, :type => :string, :aliases => "-i", :desc => "FASTQ input file.", :collapse => true
  add_option :new_format, :type => :boolean, :aliases => "-N", :desc => "New output format (with more information per nucleotide/cycle)."
end #FastqStats
|
|
167
|
+
|
|
168
|
+
end #Fastx
|
|
169
|
+
end #Ngs
|
|
170
|
+
end #Bio
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
#
|
|
2
|
+
# samtools.rb - wrappers for the samtools "view" and "merge" sub-commands
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Raoul Bonnal <r@bioruby.org>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
#
|
|
10
|
+
# Program: samtools (Tools for alignments in the SAM format)
|
|
11
|
+
# Version: 0.1.16 (r963:234)
|
|
12
|
+
#
|
|
13
|
+
# Usage: samtools <command> [options]
|
|
14
|
+
#
|
|
15
|
+
# Command: view SAM<->BAM conversion
|
|
16
|
+
# sort sort alignment file
|
|
17
|
+
# pileup generate pileup output
|
|
18
|
+
# mpileup multi-way pileup
|
|
19
|
+
# depth compute the depth
|
|
20
|
+
# faidx index/extract FASTA
|
|
21
|
+
# tview text alignment viewer
|
|
22
|
+
# index index alignment
|
|
23
|
+
# idxstats BAM index stats (r595 or later)
|
|
24
|
+
# fixmate fix mate information
|
|
25
|
+
# glfview print GLFv3 file
|
|
26
|
+
# flagstat simple stats
|
|
27
|
+
# calmd recalculate MD/NM tags and '=' bases
|
|
28
|
+
# merge merge sorted alignments
|
|
29
|
+
# rmdup remove PCR duplicates
|
|
30
|
+
# reheader replace BAM header
|
|
31
|
+
# cat concatenate BAMs
|
|
32
|
+
# targetcut cut fosmid regions (for fosmid pool only)
|
|
33
|
+
# phase phase heterozygotes
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
module Bio
|
|
37
|
+
module Ngs
|
|
38
|
+
module Samtools
|
|
39
|
+
|
|
40
|
+
# Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]
|
|
41
|
+
#
|
|
42
|
+
# Options: -b output BAM
|
|
43
|
+
# -h print header for the SAM output
|
|
44
|
+
# -H print header only (no alignments)
|
|
45
|
+
# -S input is SAM
|
|
46
|
+
# -u uncompressed BAM output (force -b)
|
|
47
|
+
# -1 fast compression (force -b)
|
|
48
|
+
# -x output FLAG in HEX (samtools-C specific)
|
|
49
|
+
# -X output FLAG in string (samtools-C specific)
|
|
50
|
+
# -c print only the count of matching records
|
|
51
|
+
# -L FILE output alignments overlapping the input BED FILE [null]
|
|
52
|
+
# -t FILE list of reference names and lengths (force -S) [null]
|
|
53
|
+
# -T FILE reference sequence file (force -S) [null]
|
|
54
|
+
# -o FILE output file name [stdout]
|
|
55
|
+
# -R FILE list of read groups to be outputted [null]
|
|
56
|
+
# -f INT required flag, 0 for unset [0]
|
|
57
|
+
# -F INT filtering flag, 0 for unset [0]
|
|
58
|
+
# -q INT minimum mapping quality [0]
|
|
59
|
+
# -l STR only output reads in library STR [null]
|
|
60
|
+
# -r STR only output reads in read group STR [null]
|
|
61
|
+
# -? longer help
|
|
62
|
+
class View
  # Declarative wrapper for "samtools view". Each option is paired with the
  # short flag documented in the usage text above this class.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("samtools")
  set_sub_program "view"
  use_aliases

  # Boolean switches. BAM output is switched on by default.
  add_option :bam_output, :type => :boolean, :aliases => "-b", :desc => "output BAM", :default => true
  add_option :print_header_alignment, :type => :boolean, :aliases => "-h", :desc => "print header for the SAM output"
  add_option :print_header_only, :type => :boolean, :aliases => "-H", :desc => "print header only (no alignments)"
  add_option :sam_input, :type => :boolean, :aliases => "-S", :desc => "input is SAM"
  add_option :uncompress, :type => :boolean, :aliases => "-u", :desc => "uncompressed BAM output (force -b)"
  add_option :compress, :type => :boolean, :aliases => "-1", :desc => "fast compression (force -b)"
  add_option :flag_hex, :type => :boolean, :aliases => "-x", :desc => "output FLAG in HEX (samtools-C specific)"
  add_option :flag_string, :type => :boolean, :aliases => "-X", :desc => "output FLAG in string (samtools-C specific)"

  # File arguments. The output file name is mandatory for this wrapper.
  add_option :output_alignment, :type => :string, :aliases => "-L", :desc => "output alignments overlapping the input BED FILE [null]"
  add_option :list_ref, :type => :string, :aliases => "-t", :desc => "list of reference names and lengths (force -S) [null]"
  add_option :ref_sequence, :type => :string, :aliases => "-T", :desc => "reference sequence file (force -S) [null]"
  add_option :output, :type => :string, :aliases => "-o", :desc => "output file name [stdout]", :required => true
  add_option :list_group, :type => :string, :aliases => "-R", :desc => "list of read groups to be outputted [null]"

  # Filters.
  add_option :required_flag, :type => :numeric, :aliases => "-f", :desc => "required flag, 0 for unset [0]"
  add_option :filtering_flag, :type => :numeric, :aliases => "-F", :desc => "filtering flag, 0 for unset [0]"
  add_option :min_map_qual, :type => :numeric, :aliases => "-q", :desc => "minimum mapping quality [0]"
  add_option :only_lib_reads, :type => :string, :aliases => "-l", :desc => "only output reads in library STR [null]"
  # The short flag needs its leading dash like every other alias ("-r").
  add_option :only_grp_reads, :type => :string, :aliases => "-r", :desc => "only output reads in read group STR [null]"
end #View
|
|
87
|
+
|
|
88
|
+
# Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
|
|
89
|
+
#
|
|
90
|
+
# Options: -n sort by read names
|
|
91
|
+
# -r attach RG tag (inferred from file names)
|
|
92
|
+
# -u uncompressed BAM output
|
|
93
|
+
# -f overwrite the output BAM if exist
|
|
94
|
+
# -1 compress level 1
|
|
95
|
+
# -R STR merge file in the specified region STR [all]
|
|
96
|
+
# -h FILE copy the header in FILE to <out.bam> [in1.bam]
|
|
97
|
+
#
|
|
98
|
+
# Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users
|
|
99
|
+
# must provide the correct header with -h, or uses Picard which properly maintains
|
|
100
|
+
# the header dictionary in merging.
|
|
101
|
+
#out, in1, in2, ... inx Must be passed as arguments
|
|
102
|
+
class Merge
  # Declarative wrapper for "samtools merge". Only the flags are declared
  # here; no option is declared for the output or input BAM files.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("samtools")
  set_sub_program "merge"
  use_aliases

  # One row per flag: option name, value type, short flag, help text.
  # Rows are registered in the order listed.
  [
    [:sort_by_read_name, :boolean, "-n", "sort by read names"],
    [:attach_rg,         :boolean, "-r", "attach RG tag (inferred from file names)"],
    [:uncompress,        :boolean, "-u", "uncompressed BAM output"],
    [:overwrite_output,  :boolean, "-f", "overwrite the output BAM if exist"],
    [:compress,          :boolean, "-1", "compress level 1"],
    [:merge_regions,     :string,  "-R", "merge file in the specified region STR [all]"],
    [:copy_header,       :string,  "-h", "copy the header in FILE to <out.bam> [in1.bam]"]
  ].each do |opt_name, opt_type, short_flag, help_text|
    add_option opt_name, :type => opt_type, :aliases => short_flag, :desc => help_text
  end
end #Merge
|
|
115
|
+
|
|
116
|
+
end #Samtools
|
|
117
|
+
end #Ngs
|
|
118
|
+
end #Bio
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
module Bio
|
|
2
|
+
module Ngs
|
|
3
|
+
class SffExtract
  # Declarative wrapper for the "sff_extract" program.
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("sff_extract")

  # Each pair is an option name and the settings hash handed to add_option.
  # Pairs are registered in the order listed; "min_left_clip" is the only
  # option declared without a short alias.
  [
    ["append",        {:type => :boolean, :aliases => "-a", :desc => "append output to existing files"}],
    ["xml_info",      {:type => :string,  :aliases => "-i", :desc => "extra info to write in the xml file"}],
    ["linker_file",   {:type => :string,  :aliases => "-l", :desc => "FASTA file with paired-end linker sequences"}],
    ["clip",          {:type => :boolean, :aliases => "-c", :desc => "clip (completely remove) ends with low qual and/or adaptor sequence"}],
    ["upper_case",    {:type => :boolean, :aliases => "-u", :desc => "all bases in upper case, including clipped ends"}],
    ["min_left_clip", {:type => :numeric, :desc => "if the left clip coming from the SFF is smaller than this value, override it"}],
    ["fastq",         {:type => :boolean, :aliases => "-Q", :desc => "store as FASTQ file instead of FASTA + FASTA quality file"}],
    ["out_basename",  {:type => :string,  :aliases => "-o", :desc => "base name for all output files"}],
    ["seq_file",      {:type => :string,  :aliases => "-s", :desc => "output sequence file name"}],
    ["qual_file",     {:type => :string,  :aliases => "-q", :desc => "output quality file name"}],
    ["xml_file",      {:type => :string,  :aliases => "-x", :desc => "output ancillary xml file name"}]
  ].each do |opt_name, settings|
    add_option opt_name, settings
  end
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
#
|
|
2
|
+
# tophat.rb - wrapper for TopHat, which maps short reads from spliced transcripts to whole genomes
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Raoul Bonnal <r@bioruby.org>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
#require 'bio/command'
|
|
12
|
+
#require 'shellwords'
|
|
13
|
+
#require 'thor'
|
|
14
|
+
#require 'bio/ngs/utils'
|
|
15
|
+
|
|
16
|
+
# TopHat maps short sequences from spliced transcripts to whole genomes.
|
|
17
|
+
#
|
|
18
|
+
# Usage:
|
|
19
|
+
# tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \
|
|
20
|
+
# [quals1,[quals2,...]] [quals1[,quals2,...]]
|
|
21
|
+
#
|
|
22
|
+
# Options:
|
|
23
|
+
# -v/--version
|
|
24
|
+
# -o/--output-dir <string> [ default: ./tophat_out ]
|
|
25
|
+
# -a/--min-anchor <int> [ default: 8 ]
|
|
26
|
+
# -m/--splice-mismatches <0-2> [ default: 0 ]
|
|
27
|
+
# -i/--min-intron-length <int> [ default: 50 ]
|
|
28
|
+
# -I/--max-intron-length <int> [ default: 500000 ]
|
|
29
|
+
# -g/--max-multihits <int> [ default: 20 ]
|
|
30
|
+
# -F/--min-isoform-fraction <float> [ default: 0.15 ]
|
|
31
|
+
# --max-insertion-length <int> [ default: 3 ]
|
|
32
|
+
# --max-deletion-length <int> [ default: 3 ]
|
|
33
|
+
# --solexa-quals
|
|
34
|
+
# --solexa1.3-quals (same as phred64-quals)
|
|
35
|
+
# --phred64-quals (same as solexa1.3-quals)
|
|
36
|
+
# -Q/--quals
|
|
37
|
+
# --integer-quals
|
|
38
|
+
# -C/--color (Solid - color space)
|
|
39
|
+
# --color-out
|
|
40
|
+
# --library-type <string> (fr-unstranded, fr-firststrand,
|
|
41
|
+
# fr-secondstrand)
|
|
42
|
+
# -p/--num-threads <int> [ default: 1 ]
|
|
43
|
+
# -G/--GTF <filename>
|
|
44
|
+
# -j/--raw-juncs <filename>
|
|
45
|
+
# --insertions <filename>
|
|
46
|
+
# --deletions <filename>
|
|
47
|
+
# -r/--mate-inner-dist <int>
|
|
48
|
+
# --mate-std-dev <int> [ default: 20 ]
|
|
49
|
+
# --no-novel-juncs
|
|
50
|
+
# --no-novel-indels
|
|
51
|
+
# --no-gtf-juncs
|
|
52
|
+
# --no-coverage-search
|
|
53
|
+
# --coverage-search
|
|
54
|
+
# --no-closure-search
|
|
55
|
+
# --closure-search
|
|
56
|
+
# --microexon-search
|
|
57
|
+
# --butterfly-search
|
|
58
|
+
# --no-butterfly-search
|
|
59
|
+
# --keep-tmp
|
|
60
|
+
# --tmp-dir <dirname> [ default: <output_dir>/tmp ]
|
|
61
|
+
# -z/--zpacker <program> [ default: gzip ]
|
|
62
|
+
# -X/--unmapped-fifo [ use mkfifo to compress more temporary files]
|
|
63
|
+
#
|
|
64
|
+
# Advanced Options:
|
|
65
|
+
# --initial-read-mismatches <int> [ default: 2 ]
|
|
66
|
+
# --segment-mismatches <int> [ default: 2 ]
|
|
67
|
+
# --segment-length <int> [ default: 25 ]
|
|
68
|
+
# --bowtie-n [ default: bowtie -v ]
|
|
69
|
+
# --min-closure-exon <int> [ default: 100 ]
|
|
70
|
+
# --min-closure-intron <int> [ default: 50 ]
|
|
71
|
+
# --max-closure-intron <int> [ default: 5000 ]
|
|
72
|
+
# --min-coverage-intron <int> [ default: 50 ]
|
|
73
|
+
# --max-coverage-intron <int> [ default: 20000 ]
|
|
74
|
+
# --min-segment-intron <int> [ default: 50 ]
|
|
75
|
+
# --max-segment-intron <int> [ default: 500000 ]
|
|
76
|
+
# --no-sort-bam [Output BAM is not coordinate-sorted]
|
|
77
|
+
# --no-convert-bam [Do not convert to bam format.
|
|
78
|
+
# Output is <output_dir>accepted_hit.sam.
|
|
79
|
+
# Implies --no-sort-bam.]
|
|
80
|
+
#
|
|
81
|
+
# SAM Header Options (for embedding sequencing run metadata in output):
|
|
82
|
+
# --rg-id <string> (read group ID)
|
|
83
|
+
# --rg-sample <string> (sample ID)
|
|
84
|
+
# --rg-library <string> (library ID)
|
|
85
|
+
# --rg-description <string> (descriptive string, no tabs allowed)
|
|
86
|
+
# --rg-platform-unit <string> (e.g Illumina lane ID)
|
|
87
|
+
# --rg-center <string> (sequencing center name)
|
|
88
|
+
# --rg-date <string> (ISO 8601 date of the sequencing run)
|
|
89
|
+
# --rg-platform <string> (Sequencing platform descriptor)
|
|
90
|
+
#
|
|
91
|
+
# for detailed help see http://tophat.cbcb.umd.edu/manual.html
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
module Bio
|
|
95
|
+
module Ngs
|
|
96
|
+
class Tophat
  # Declarative wrapper for the "tophat" program. Option names are the
  # tool's long option names; short flags, where the tool has them, are
  # given through :aliases.

  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("tophat")

  # General options.
  add_option "output-dir", :type => :string, :aliases => '-o'
  add_option "min-anchor", :type => :numeric, :aliases => '-a'
  add_option "splice-mismatches", :type => :numeric, :aliases => '-m'
  add_option "min-intron-length", :type => :numeric, :aliases => '-i'
  add_option "max-intron-length", :type => :numeric, :aliases => '-I'
  add_option "max-multihits", :type => :numeric, :aliases => '-g'
  # The tool's option is spelled with dashes only (-F/--min-isoform-fraction).
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F'
  add_option "max-insertion-length", :type => :numeric
  add_option "max-deletion-length", :type => :numeric

  # Quality encoding and colour space.
  add_option "solexa-quals", :type => :boolean
  add_option "solexa1.3-quals", :type => :boolean, :aliases => '--phred64-quals'
  add_option :quals, :type => :boolean, :aliases => '-Q'
  add_option "integer-quals", :type => :boolean
  add_option :color, :type => :boolean, :aliases => '-C'

  add_option "library-type", :type => :string
  add_option "num-threads", :type => :numeric, :aliases => '-p'

  # Annotation and junction inputs.
  add_option "GTF", :type => :string, :aliases => '-G'
  add_option "raw-juncs", :type => :string, :aliases => '-j'
  add_option :insertions, :type => :string
  add_option :deletions, :type => :string
  add_option "mate-inner-dist", :type => :numeric, :aliases => '-r'
  add_option "mate-std-dev", :type => :numeric

  # Search strategy switches.
  # NOTE(review): "allow-indels" and "fill-gaps" do not appear in the usage
  # text quoted at the top of this file; presumably they belong to another
  # TopHat release - confirm before relying on them.
  add_option "no-novel-juncs", :type => :boolean
  add_option "allow-indels", :type => :boolean
  add_option "no-novel-indels", :type => :boolean
  add_option "no-gtf-juncs", :type => :boolean
  add_option "no-coverage-search", :type => :boolean
  add_option "coverage-search", :type => :boolean
  add_option "no-closure-search", :type => :boolean
  add_option "closure-search", :type => :boolean
  add_option "fill-gaps", :type => :boolean
  add_option "microexon-search", :type => :boolean
  add_option "butterfly-search", :type => :boolean
  add_option "no-butterfly-search", :type => :boolean
  add_option "keep-tmp", :type => :boolean
  add_option "tmp-dir", :type => :string

  # Advanced options.
  add_option "segment-mismatches", :type => :numeric
  add_option "segment-length", :type => :numeric
  add_option "min-closure-exon", :type => :numeric
  add_option "min-closure-intron", :type => :numeric
  add_option "max-closure-intron", :type => :numeric
  add_option "min-coverage-intron", :type => :numeric
  add_option "max-coverage-intron", :type => :numeric
  add_option "min-segment-intron", :type => :numeric
  add_option "max-segment-intron", :type => :numeric

  # SAM header options.
  add_option "rg-id", :type => :string
  add_option "rg-sample", :type => :string
  add_option "rg-library", :type => :string
  add_option "rg-description", :type => :string
  add_option "rg-platform-unit", :type => :string
  add_option "rg-center", :type => :string
  add_option "rg-date", :type => :string
  add_option "rg-platform", :type => :string

  # Options listed in the usage text that were not declared before. They
  # are appended so the declaration order of the options above is unchanged.
  add_option "color-out", :type => :boolean
  add_option "zpacker", :type => :string, :aliases => '-z'
  add_option "unmapped-fifo", :type => :boolean, :aliases => '-X'
  add_option "initial-read-mismatches", :type => :numeric
  add_option "bowtie-n", :type => :boolean
  add_option "no-sort-bam", :type => :boolean
  add_option "no-convert-bam", :type => :boolean

end #Tophat
|
|
157
|
+
end #Ngs
|
|
158
|
+
end #Bio
|