bio-ngs 0.3.2.alpha.01
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +39 -0
- data/Gemfile.lock +81 -0
- data/LICENSE.txt +28 -0
- data/README.rdoc +240 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/biongs +35 -0
- data/bio-ngs.gemspec +215 -0
- data/ext/mkrf_conf.rb +87 -0
- data/lib/bio-ngs.rb +54 -0
- data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
- data/lib/bio/appl/ngs/blast.rb +36 -0
- data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
- data/lib/bio/appl/ngs/cufflinks.rb +489 -0
- data/lib/bio/appl/ngs/fastx.rb +170 -0
- data/lib/bio/appl/ngs/samtools.rb +118 -0
- data/lib/bio/appl/ngs/sff_extract.rb +23 -0
- data/lib/bio/appl/ngs/tophat.rb +158 -0
- data/lib/bio/ngs/converter.rb +100 -0
- data/lib/bio/ngs/core_ext.rb +12 -0
- data/lib/bio/ngs/db.rb +66 -0
- data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
- data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
- data/lib/bio/ngs/db/models.rb +1 -0
- data/lib/bio/ngs/db/models/homology.rb +8 -0
- data/lib/bio/ngs/db/models/ontology.rb +16 -0
- data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
- data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
- data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
- data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
- data/lib/bio/ngs/ext/versions.yaml +73 -0
- data/lib/bio/ngs/graphics.rb +189 -0
- data/lib/bio/ngs/homology.rb +102 -0
- data/lib/bio/ngs/ontology.rb +103 -0
- data/lib/bio/ngs/quality.rb +64 -0
- data/lib/bio/ngs/record.rb +50 -0
- data/lib/bio/ngs/task.rb +46 -0
- data/lib/bio/ngs/utils.rb +176 -0
- data/lib/development_tasks.rb +34 -0
- data/lib/enumerable.rb +37 -0
- data/lib/tasks/bwa.thor +126 -0
- data/lib/tasks/convert.thor +454 -0
- data/lib/tasks/history.thor +51 -0
- data/lib/tasks/homology.thor +121 -0
- data/lib/tasks/ontology.thor +93 -0
- data/lib/tasks/project.thor +51 -0
- data/lib/tasks/quality.thor +142 -0
- data/lib/tasks/rna.thor +126 -0
- data/lib/tasks/sff_extract.thor +9 -0
- data/lib/templates/README.tt +43 -0
- data/lib/templates/db.tt +6 -0
- data/lib/wrapper.rb +225 -0
- data/spec/converter_qseq_spec.rb +56 -0
- data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
- data/spec/quality_spec.rb +40 -0
- data/spec/sff_extract_spec.rb +98 -0
- data/spec/spec_helper.rb +55 -0
- data/spec/tophat_spec.rb +99 -0
- data/spec/utils_spec.rb +22 -0
- data/test/conf/test_db.yml +4 -0
- data/test/data/blastoutput.xml +69 -0
- data/test/data/gene-GO.json +1 -0
- data/test/data/goa_uniprot +27 -0
- data/test/data/goslim_goa.obo +1763 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-ngs.rb +17 -0
- data/test/test_db.rb +21 -0
- data/test/test_homology.rb +102 -0
- data/test/test_ngs.rb +21 -0
- data/test/test_ontology.rb +74 -0
- data/test/test_utils.rb +29 -0
- metadata +460 -0
@@ -0,0 +1,170 @@
|
|
1
|
+
#
|
2
|
+
# fastx.rb - The FASTX-Toolkit is a collection of command line tools for Short-Reads FASTA/FASTQ files preprocessing.
|
3
|
+
#
|
4
|
+
# Next-Generation sequencing machines usually produce FASTA or FASTQ files, containing multiple short-reads sequences (possibly with quality information).
|
5
|
+
#
|
6
|
+
# The main processing of such FASTA/FASTQ files is mapping (aka aligning) the sequences to reference genomes or other databases using specialized programs. Examples of such mapping programs are: Blat, SHRiMP, LastZ, MAQ and many others.
|
7
|
+
#
|
8
|
+
# However,
|
9
|
+
# it is sometimes more productive to preprocess the FASTA/FASTQ files before mapping the sequences to the genome - manipulating the sequences to produce better mapping results.
|
10
|
+
#
|
11
|
+
# The FASTX-Toolkit tools perform some of these preprocessing tasks.
|
12
|
+
# http://hannonlab.cshl.edu/fastx_toolkit/
|
13
|
+
#
|
14
|
+
# Copyright:: Copyright (C) 2011
|
15
|
+
# Raoul Bonnal <r@bioruby.org>
|
16
|
+
# License:: The Ruby License
|
17
|
+
#
|
18
|
+
# + Mapped
|
19
|
+
# - Not Yet Mapped
|
20
|
+
#
|
21
|
+
# - fastx_artifacts_filter
|
22
|
+
# - fastx_collapser
|
23
|
+
# + fastx_quality_stats
|
24
|
+
# - fastx_trimmer
|
25
|
+
# - fastx_barcode_splitter.pl
|
26
|
+
# - fastx_nucleotide_distribution_graph.sh
|
27
|
+
# - fastx_renamer
|
28
|
+
# - fastx_uncollapser
|
29
|
+
# - fastx_clipper
|
30
|
+
# - fastx_nucleotide_distribution_line_graph.sh
|
31
|
+
# - fastx_reverse_complement
|
32
|
+
# + fastq_coverage_graph.sh
|
33
|
+
# - fastq_masker
|
34
|
+
# + fastq_quality_boxplot_graph.sh
|
35
|
+
# - fastq_quality_converter
|
36
|
+
# - fastq_quality_filter
|
37
|
+
# - fastq_quality_trimmer
|
38
|
+
# - fastq_to_fasta
|
39
|
+
|
40
|
+
|
41
|
+
|
42
|
+
module Bio
|
43
|
+
module Ngs
|
44
|
+
module Fastx
|
45
|
+
|
46
|
+
# [-h] = This helpful help screen.
|
47
|
+
# [-t N] = Quality threshold - nucleotides with lower
|
48
|
+
# quality will be trimmed (from the end of the sequence).
|
49
|
+
# [-l N] = Minimum length - sequences shorter than this (after trimming)
|
50
|
+
# will be discarded. Default = 0 = no minimum length.
|
51
|
+
# [-z] = Compress output with GZIP.
|
52
|
+
# [-i INFILE] = FASTQ input file. default is STDIN.
|
53
|
+
# [-o OUTFILE] = FASTQ output file. default is STDOUT.
|
54
|
+
# [-v] = Verbose - report number of sequences.
|
55
|
+
# If [-o] is specified, report will be printed to STDOUT.
|
56
|
+
# If [-o] is not specified (and output goes to STDOUT),
|
57
|
+
# report will be printed to STDERR.
|
58
|
+
class Trim
  # Command wrapper for the FASTX-Toolkit program "fastq_quality_trimmer".
  # Each add_option call below declares one command-line switch of that
  # program; the short switch is given through :aliases.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("fastq_quality_trimmer")
  # NOTE(review): presumably tells the wrapper to emit the short aliases
  # (-l, -t, ...) on the command line -- confirm in Bio::Command::Wrapper.
  use_aliases

  # The wrapper overrides the tool's own default (0) with 20, so the
  # description states both values instead of only the tool's default.
  add_option :min_size, :type => :numeric, :default => 20, :aliases => "-l",
             :desc => "Minimum length - sequences shorter than this (after trimming) will be discarded. Tool default = 0 = no minimum length; this wrapper defaults to 20."
  add_option :min_quality, :type => :numeric, :default => 10, :aliases => "-t",
             :desc => "Quality threshold - nucleotides with lower quality will be trimmed (from the end of the sequence)."
  add_option :output, :type => :string, :aliases => "-o", :desc => "FASTQ output file.", :collapse => true
  add_option :input, :type => :string, :aliases => "-i", :desc => "FASTQ input file.", :collapse => true
  add_option :gzip, :type => :boolean, :aliases => "-z", :desc => "Compress output with GZIP."
  add_option :verbose, :type => :boolean, :aliases => "-v",
             :desc => "Verbose - report number of sequences. If [-o] is specified, report will be printed to STDOUT. If [-o] is not specified (and output goes to STDOUT), report will be printed to STDERR."
  # NOTE(review): -Q is not listed in the usage text quoted above this class;
  # it looks like the ASCII offset of the quality scores (33) -- confirm.
  add_option :quality_type, :type => :numeric, :default => 33, :aliases => "-Q", :desc => "Quality of fastq file"
end #Trim
|
75
|
+
|
76
|
+
# Solexa-Quality BoxPlot plotter
|
77
|
+
# Generates a solexa quality score box-plot graph
|
78
|
+
#
|
79
|
+
# Usage: /usr/local/bin/fastq_quality_boxplot_graph.sh [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]
|
80
|
+
#
|
81
|
+
# [-p] - Generate PostScript (.PS) file. Default is PNG image.
|
82
|
+
# [-i INPUT.TXT] - Input file. Should be the output of "solexa_quality_statistics" program.
|
83
|
+
# [-o OUTPUT] - Output file name. default is STDOUT.
|
84
|
+
# [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph.
|
85
|
+
class ReadsBoxPlot
  # Command wrapper for the FASTX-Toolkit script
  # "fastq_quality_boxplot_graph.sh", which draws a quality score box-plot.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("fastq_quality_boxplot_graph.sh")
  use_aliases
  add_option :ps, :type => :boolean, :aliases => "-p", :desc => "Generate PostScript (.PS) file. Default is PNG image."
  # The script reads a statistics table and writes a graph, not FASTQ files;
  # the descriptions follow the script's own usage text.
  add_option :output, :type => :string, :aliases => "-o", :desc => "Output file name. default is STDOUT."
  add_option :input, :type => :string, :aliases => "-i", :desc => "Input file. Should be the output of \"solexa_quality_statistics\" program."
  add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
end #ReadsBoxPlot
|
94
|
+
|
95
|
+
# Solexa-Reads coverage plotter
|
96
|
+
# Generates a solexa line coverage graph
|
97
|
+
#
|
98
|
+
# Usage: /usr/local/bin/fastq_coverage_graph.sh [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]
|
99
|
+
#
|
100
|
+
# [-p] - Generate PostScript (.PS) file. Default is PNG image.
|
101
|
+
# [-i INPUT.TXT] - Input file. Should be the output of "solexa_quality_statistics" program.
|
102
|
+
# [-o OUTPUT] - Output file name. default is STDOUT.
|
103
|
+
# [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph.
|
104
|
+
class ReadsCoverage
  # Command wrapper for the FASTX-Toolkit script "fastq_coverage_graph.sh",
  # which draws a line coverage graph.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("fastq_coverage_graph.sh")
  use_aliases
  add_option :ps, :type => :boolean, :aliases => "-p", :desc => "Generate PostScript (.PS) file. Default is PNG image."
  # The script reads a statistics table and writes a graph, not FASTQ files;
  # the descriptions follow the script's own usage text.
  add_option :output, :type => :string, :aliases => "-o", :desc => "Output file name. default is STDOUT."
  add_option :input, :type => :string, :aliases => "-i", :desc => "Input file. Should be the output of \"solexa_quality_statistics\" program."
  add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
end #ReadsCoverage
|
113
|
+
|
114
|
+
|
115
|
+
# usage: fastx_quality_stats [-h] [-N] [-i INFILE] [-o OUTFILE]
|
116
|
+
# Part of FASTX Toolkit 0.0.13 by A. Gordon (gordon@cshl.edu)
|
117
|
+
#
|
118
|
+
# [-h] = This helpful help screen.
|
119
|
+
# [-i INFILE] = FASTQ input file. default is STDIN.
|
120
|
+
# [-o OUTFILE] = TEXT output file. default is STDOUT.
|
121
|
+
# [-N] = New output format (with more information per nucleotide/cycle).
|
122
|
+
#
|
123
|
+
# The *OLD* output TEXT file will have the following fields (one row per column):
|
124
|
+
# column = column number (1 to 36 for a 36-cycles read solexa file)
|
125
|
+
# count = number of bases found in this column.
|
126
|
+
# min = Lowest quality score value found in this column.
|
127
|
+
# max = Highest quality score value found in this column.
|
128
|
+
# sum = Sum of quality score values for this column.
|
129
|
+
# mean = Mean quality score value for this column.
|
130
|
+
# Q1 = 1st quartile quality score.
|
131
|
+
# med = Median quality score.
|
132
|
+
# Q3 = 3rd quartile quality score.
|
133
|
+
# IQR = Inter-Quartile range (Q3-Q1).
|
134
|
+
# lW = 'Left-Whisker' value (for boxplotting).
|
135
|
+
# rW = 'Right-Whisker' value (for boxplotting).
|
136
|
+
# A_Count = Count of 'A' nucleotides found in this column.
|
137
|
+
# C_Count = Count of 'C' nucleotides found in this column.
|
138
|
+
# G_Count = Count of 'G' nucleotides found in this column.
|
139
|
+
# T_Count = Count of 'T' nucleotides found in this column.
|
140
|
+
# N_Count = Count of 'N' nucleotides found in this column.
|
141
|
+
# max-count = max. number of bases (in all cycles)
|
142
|
+
#
|
143
|
+
#
|
144
|
+
# The *NEW* output format:
|
145
|
+
# cycle (previously called 'column') = cycle number
|
146
|
+
# max-count
|
147
|
+
# For each nucleotide in the cycle (ALL/A/C/G/T/N):
|
148
|
+
# count = number of bases found in this column.
|
149
|
+
# min = Lowest quality score value found in this column.
|
150
|
+
# max = Highest quality score value found in this column.
|
151
|
+
# sum = Sum of quality score values for this column.
|
152
|
+
# mean = Mean quality score value for this column.
|
153
|
+
# Q1 = 1st quartile quality score.
|
154
|
+
# med = Median quality score.
|
155
|
+
# Q3 = 3rd quartile quality score.
|
156
|
+
# IQR = Inter-Quartile range (Q3-Q1).
|
157
|
+
# lW = 'Left-Whisker' value (for boxplotting).
|
158
|
+
# rW = 'Right-Whisker' value (for boxplotting).
|
159
|
+
class FastqStats
  # Command wrapper for the FASTX-Toolkit program "fastx_quality_stats".
  # It reads a FASTQ file and writes a TEXT table of per-column statistics.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("fastx_quality_stats")
  use_aliases
  # The output is a text report (see the field list above), not FASTQ.
  add_option :output, :type => :string, :aliases => "-o", :desc => "TEXT output file.", :collapse => true
  add_option :input, :type => :string, :aliases => "-i", :desc => "FASTQ input file.", :collapse => true
  add_option :new_format, :type => :boolean, :aliases => "-N", :desc => "New output format (with more information per nucleotide/cycle)."
end #FastqStats
|
167
|
+
|
168
|
+
end #Fastx
|
169
|
+
end #Ngs
|
170
|
+
end #Bio
|
@@ -0,0 +1,118 @@
|
|
1
|
+
#
|
2
|
+
# samtools.rb - Command wrappers for samtools sub-commands (view, merge)
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2011
|
5
|
+
# Raoul Bonnal <r@bioruby.org>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
#
|
9
|
+
#
|
10
|
+
# Program: samtools (Tools for alignments in the SAM format)
|
11
|
+
# Version: 0.1.16 (r963:234)
|
12
|
+
#
|
13
|
+
# Usage: samtools <command> [options]
|
14
|
+
#
|
15
|
+
# Command: view SAM<->BAM conversion
|
16
|
+
# sort sort alignment file
|
17
|
+
# pileup generate pileup output
|
18
|
+
# mpileup multi-way pileup
|
19
|
+
# depth compute the depth
|
20
|
+
# faidx index/extract FASTA
|
21
|
+
# tview text alignment viewer
|
22
|
+
# index index alignment
|
23
|
+
# idxstats BAM index stats (r595 or later)
|
24
|
+
# fixmate fix mate information
|
25
|
+
# glfview print GLFv3 file
|
26
|
+
# flagstat simple stats
|
27
|
+
# calmd recalculate MD/NM tags and '=' bases
|
28
|
+
# merge merge sorted alignments
|
29
|
+
# rmdup remove PCR duplicates
|
30
|
+
# reheader replace BAM header
|
31
|
+
# cat concatenate BAMs
|
32
|
+
# targetcut cut fosmid regions (for fosmid pool only)
|
33
|
+
# phase phase heterozygotes
|
34
|
+
|
35
|
+
|
36
|
+
module Bio
|
37
|
+
module Ngs
|
38
|
+
module Samtools
|
39
|
+
|
40
|
+
# Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]
|
41
|
+
#
|
42
|
+
# Options: -b output BAM
|
43
|
+
# -h print header for the SAM output
|
44
|
+
# -H print header only (no alignments)
|
45
|
+
# -S input is SAM
|
46
|
+
# -u uncompressed BAM output (force -b)
|
47
|
+
# -1 fast compression (force -b)
|
48
|
+
# -x output FLAG in HEX (samtools-C specific)
|
49
|
+
# -X output FLAG in string (samtools-C specific)
|
50
|
+
# -c print only the count of matching records
|
51
|
+
# -L FILE output alignments overlapping the input BED FILE [null]
|
52
|
+
# -t FILE list of reference names and lengths (force -S) [null]
|
53
|
+
# -T FILE reference sequence file (force -S) [null]
|
54
|
+
# -o FILE output file name [stdout]
|
55
|
+
# -R FILE list of read groups to be outputted [null]
|
56
|
+
# -f INT required flag, 0 for unset [0]
|
57
|
+
# -F INT filtering flag, 0 for unset [0]
|
58
|
+
# -q INT minimum mapping quality [0]
|
59
|
+
# -l STR only output reads in library STR [null]
|
60
|
+
# -r STR only output reads in read group STR [null]
|
61
|
+
# -? longer help
|
62
|
+
class View
  # Command wrapper for the "view" sub-command of samtools
  # (SAM<->BAM conversion). Each add_option declares one switch of
  # `samtools view`; the short switch is given through :aliases.
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("samtools")
  set_sub_program "view"
  # NOTE(review): presumably tells the wrapper to emit the short aliases on
  # the command line -- confirm in Bio::Command::Wrapper.
  use_aliases

  # Boolean switches.
  add_option :bam_output, :type => :boolean, :aliases => "-b", :desc => "output BAM", :default => true
  add_option :print_header_alignment, :type => :boolean, :aliases => "-h", :desc => "print header for the SAM output"
  add_option :print_header_only, :type => :boolean, :aliases => "-H", :desc => "print header only (no alignments)"
  add_option :sam_input, :type => :boolean, :aliases => "-S", :desc => "input is SAM"
  add_option :uncompress, :type => :boolean, :aliases => "-u", :desc => "uncompressed BAM output (force -b)"
  add_option :compress, :type => :boolean, :aliases => "-1", :desc => "fast compression (force -b)"
  add_option :flag_hex, :type => :boolean, :aliases => "-x", :desc => "output FLAG in HEX (samtools-C specific)"
  add_option :flag_string, :type => :boolean, :aliases => "-X", :desc => "output FLAG in string (samtools-C specific)"

  # Switches taking a file name, a number or a string.
  add_option :output_alignment, :type => :string, :aliases => "-L", :desc => "output alignments overlapping the input BED FILE [null]"
  add_option :list_ref, :type => :string, :aliases => "-t", :desc => "list of reference names and lengths (force -S) [null]"
  add_option :ref_sequence, :type => :string, :aliases => "-T", :desc => "reference sequence file (force -S) [null]"
  add_option :output, :type => :string, :aliases => "-o", :desc => "output file name [stdout]", :required => true
  add_option :list_group, :type => :string, :aliases => "-R", :desc => "list of read groups to be outputted [null]"
  add_option :required_flag, :type => :numeric, :aliases => "-f", :desc => "required flag, 0 for unset [0]"
  add_option :filtering_flag, :type => :numeric, :aliases => "-F", :desc => "filtering flag, 0 for unset [0]"
  add_option :min_map_qual, :type => :numeric, :aliases => "-q", :desc => "minimum mapping quality [0]"
  add_option :only_lib_reads, :type => :string, :aliases => "-l", :desc => "only output reads in library STR [null]"
  # The alias needs its leading dash like every other switch ("-r", not "r").
  add_option :only_grp_reads, :type => :string, :aliases => "-r", :desc => "only output reads in read group STR [null]"
end #View
|
87
|
+
|
88
|
+
# Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
|
89
|
+
#
|
90
|
+
# Options: -n sort by read names
|
91
|
+
# -r attach RG tag (inferred from file names)
|
92
|
+
# -u uncompressed BAM output
|
93
|
+
# -f overwrite the output BAM if exist
|
94
|
+
# -1 compress level 1
|
95
|
+
# -R STR merge file in the specified region STR [all]
|
96
|
+
# -h FILE copy the header in FILE to <out.bam> [in1.bam]
|
97
|
+
#
|
98
|
+
# Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users
|
99
|
+
# must provide the correct header with -h, or use Picard which properly maintains
|
100
|
+
# the header dictionary in merging.
|
101
|
+
#out, in1, in2, ... inx Must be passed as arguments
|
102
|
+
class Merge
  # Command wrapper for the "merge" sub-command of samtools.
  # The output BAM and the input BAMs are not options: they must be passed
  # as plain arguments (out, in1, in2, ...).
  include Bio::Command::Wrapper
  set_program Bio::Ngs::Utils.binary("samtools")
  set_sub_program "merge"
  use_aliases

  # Switches without a value: [option name, short switch, description].
  [
    [:sort_by_read_name, "-n", "sort by read names"],
    [:attach_rg,         "-r", "attach RG tag (inferred from file names)"],
    [:uncompress,        "-u", "uncompressed BAM output"],
    [:overwrite_output,  "-f", "overwrite the output BAM if exist"],
    [:compress,          "-1", "compress level 1"]
  ].each do |opt_name, short_switch, help_text|
    add_option opt_name, :type => :boolean, :aliases => short_switch, :desc => help_text
  end

  # Switches followed by a string value, declared after the boolean ones.
  [
    [:merge_regions, "-R", "merge file in the specified region STR [all]"],
    [:copy_header,   "-h", "copy the header in FILE to <out.bam> [in1.bam]"]
  ].each do |opt_name, short_switch, help_text|
    add_option opt_name, :type => :string, :aliases => short_switch, :desc => help_text
  end
end #Merge
|
115
|
+
|
116
|
+
end #Samtools
|
117
|
+
end #Ngs
|
118
|
+
end #Bio
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Bio
|
2
|
+
module Ngs
|
3
|
+
class SffExtract
  # Command wrapper for the "sff_extract" program. Every option is declared
  # by its long name; all but "min_left_clip" also carry a short alias.
  include Bio::Command::Wrapper

  set_program(Bio::Ngs::Utils.binary("sff_extract"))

  add_option("append",
             :type => :boolean, :aliases => "-a",
             :desc => "append output to existing files")
  add_option("xml_info",
             :type => :string, :aliases => "-i",
             :desc => "extra info to write in the xml file")
  add_option("linker_file",
             :type => :string, :aliases => "-l",
             :desc => "FASTA file with paired-end linker sequences")
  add_option("clip",
             :type => :boolean, :aliases => "-c",
             :desc => "clip (completely remove) ends with low qual and/or adaptor sequence")
  add_option("upper_case",
             :type => :boolean, :aliases => "-u",
             :desc => "all bases in upper case, including clipped ends")
  # Declared without a short alias.
  add_option("min_left_clip",
             :type => :numeric,
             :desc => "if the left clip coming from the SFF is smaller than this value, override it")
  add_option("fastq",
             :type => :boolean, :aliases => "-Q",
             :desc => "store as FASTQ file instead of FASTA + FASTA quality file")
  add_option("out_basename",
             :type => :string, :aliases => "-o",
             :desc => "base name for all output files")
  add_option("seq_file",
             :type => :string, :aliases => "-s",
             :desc => "output sequence file name")
  add_option("qual_file",
             :type => :string, :aliases => "-q",
             :desc => "output quality file name")
  add_option("xml_file",
             :type => :string, :aliases => "-x",
             :desc => "output ancillary xml file name")
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
#
|
2
|
+
# tophat.rb - Command wrapper for TopHat, which maps short sequences from spliced transcripts to whole genomes
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2011
|
5
|
+
# Raoul Bonnal <r@bioruby.org>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
|
11
|
+
#require 'bio/command'
|
12
|
+
#require 'shellwords'
|
13
|
+
#require 'thor'
|
14
|
+
#require 'bio/ngs/utils'
|
15
|
+
|
16
|
+
# TopHat maps short sequences from spliced transcripts to whole genomes.
|
17
|
+
#
|
18
|
+
# Usage:
|
19
|
+
# tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \
|
20
|
+
# [quals1,[quals2,...]] [quals1[,quals2,...]]
|
21
|
+
#
|
22
|
+
# Options:
|
23
|
+
# -v/--version
|
24
|
+
# -o/--output-dir <string> [ default: ./tophat_out ]
|
25
|
+
# -a/--min-anchor <int> [ default: 8 ]
|
26
|
+
# -m/--splice-mismatches <0-2> [ default: 0 ]
|
27
|
+
# -i/--min-intron-length <int> [ default: 50 ]
|
28
|
+
# -I/--max-intron-length <int> [ default: 500000 ]
|
29
|
+
# -g/--max-multihits <int> [ default: 20 ]
|
30
|
+
# -F/--min-isoform-fraction <float> [ default: 0.15 ]
|
31
|
+
# --max-insertion-length <int> [ default: 3 ]
|
32
|
+
# --max-deletion-length <int> [ default: 3 ]
|
33
|
+
# --solexa-quals
|
34
|
+
# --solexa1.3-quals (same as phred64-quals)
|
35
|
+
# --phred64-quals (same as solexa1.3-quals)
|
36
|
+
# -Q/--quals
|
37
|
+
# --integer-quals
|
38
|
+
# -C/--color (Solid - color space)
|
39
|
+
# --color-out
|
40
|
+
# --library-type <string> (fr-unstranded, fr-firststrand,
|
41
|
+
# fr-secondstrand)
|
42
|
+
# -p/--num-threads <int> [ default: 1 ]
|
43
|
+
# -G/--GTF <filename>
|
44
|
+
# -j/--raw-juncs <filename>
|
45
|
+
# --insertions <filename>
|
46
|
+
# --deletions <filename>
|
47
|
+
# -r/--mate-inner-dist <int>
|
48
|
+
# --mate-std-dev <int> [ default: 20 ]
|
49
|
+
# --no-novel-juncs
|
50
|
+
# --no-novel-indels
|
51
|
+
# --no-gtf-juncs
|
52
|
+
# --no-coverage-search
|
53
|
+
# --coverage-search
|
54
|
+
# --no-closure-search
|
55
|
+
# --closure-search
|
56
|
+
# --microexon-search
|
57
|
+
# --butterfly-search
|
58
|
+
# --no-butterfly-search
|
59
|
+
# --keep-tmp
|
60
|
+
# --tmp-dir <dirname> [ default: <output_dir>/tmp ]
|
61
|
+
# -z/--zpacker <program> [ default: gzip ]
|
62
|
+
# -X/--unmapped-fifo [ use mkfifo to compress more temporary files]
|
63
|
+
#
|
64
|
+
# Advanced Options:
|
65
|
+
# --initial-read-mismatches <int> [ default: 2 ]
|
66
|
+
# --segment-mismatches <int> [ default: 2 ]
|
67
|
+
# --segment-length <int> [ default: 25 ]
|
68
|
+
# --bowtie-n [ default: bowtie -v ]
|
69
|
+
# --min-closure-exon <int> [ default: 100 ]
|
70
|
+
# --min-closure-intron <int> [ default: 50 ]
|
71
|
+
# --max-closure-intron <int> [ default: 5000 ]
|
72
|
+
# --min-coverage-intron <int> [ default: 50 ]
|
73
|
+
# --max-coverage-intron <int> [ default: 20000 ]
|
74
|
+
# --min-segment-intron <int> [ default: 50 ]
|
75
|
+
# --max-segment-intron <int> [ default: 500000 ]
|
76
|
+
# --no-sort-bam [Output BAM is not coordinate-sorted]
|
77
|
+
# --no-convert-bam [Do not convert to bam format.
|
78
|
+
# Output is <output_dir>accepted_hit.sam.
|
79
|
+
# Implies --no-sort-bam.]
|
80
|
+
#
|
81
|
+
# SAM Header Options (for embedding sequencing run metadata in output):
|
82
|
+
# --rg-id <string> (read group ID)
|
83
|
+
# --rg-sample <string> (sample ID)
|
84
|
+
# --rg-library <string> (library ID)
|
85
|
+
# --rg-description <string> (descriptive string, no tabs allowed)
|
86
|
+
# --rg-platform-unit <string> (e.g Illumina lane ID)
|
87
|
+
# --rg-center <string> (sequencing center name)
|
88
|
+
# --rg-date <string> (ISO 8601 date of the sequencing run)
|
89
|
+
# --rg-platform <string> (Sequencing platform descriptor)
|
90
|
+
#
|
91
|
+
# for detailed help see http://tophat.cbcb.umd.edu/manual.html
|
92
|
+
|
93
|
+
|
94
|
+
module Bio
|
95
|
+
module Ngs
|
96
|
+
class Tophat
  # Command wrapper for the "tophat" program. Options are declared by the
  # long names from the tophat usage text; where tophat has a short switch
  # it is given through :aliases.
  #
  # Switches listed in the usage text but not declared here: --color-out,
  # -z/--zpacker, -X/--unmapped-fifo, --initial-read-mismatches, --bowtie-n,
  # --no-sort-bam, --no-convert-bam.

  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("tophat")

  add_option "output-dir", :type => :string, :aliases => '-o'
  add_option "min-anchor", :type => :numeric, :aliases => '-a'
  add_option "splice-mismatches", :type => :numeric, :aliases => '-m'
  add_option "min-intron-length", :type => :numeric, :aliases => '-i'
  add_option "max-intron-length", :type => :numeric, :aliases => '-I'
  add_option "max-multihits", :type => :numeric, :aliases => '-g'
  # tophat spells this switch with dashes only (--min-isoform-fraction);
  # the previous "min-isoform_fraction" did not match it.
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F'
  add_option "max-insertion-length", :type => :numeric
  add_option "max-deletion-length", :type => :numeric
  add_option "solexa-quals", :type => :boolean
  # --phred64-quals is documented by tophat as the same switch.
  add_option "solexa1.3-quals", :type => :boolean, :aliases => '--phred64-quals'
  add_option :quals, :type => :boolean, :aliases => '-Q'
  add_option "integer-quals", :type => :boolean
  add_option :color, :type => :boolean, :aliases => '-C'
  add_option "library-type", :type => :string
  add_option "num-threads", :type => :numeric, :aliases => '-p'
  add_option "GTF", :type => :string, :aliases => '-G'
  add_option "raw-juncs", :type => :string, :aliases => '-j'
  add_option :insertions, :type => :string
  add_option :deletions, :type => :string
  add_option "mate-inner-dist", :type => :numeric, :aliases => '-r'
  add_option "mate-std-dev", :type => :numeric
  add_option "no-novel-juncs", :type => :boolean
  add_option "allow-indels", :type => :boolean
  add_option "no-novel-indels", :type => :boolean
  add_option "no-gtf-juncs", :type => :boolean
  add_option "no-coverage-search", :type => :boolean
  add_option "coverage-search", :type => :boolean
  add_option "no-closure-search", :type => :boolean
  add_option "closure-search", :type => :boolean
  add_option "fill-gaps", :type => :boolean
  add_option "microexon-search", :type => :boolean
  add_option "butterfly-search", :type => :boolean
  add_option "no-butterfly-search", :type => :boolean
  add_option "keep-tmp", :type => :boolean
  add_option "tmp-dir", :type => :string

  # Advanced options.
  add_option "segment-mismatches", :type => :numeric
  add_option "segment-length", :type => :numeric
  add_option "min-closure-exon", :type => :numeric
  add_option "min-closure-intron", :type => :numeric
  add_option "max-closure-intron", :type => :numeric
  add_option "min-coverage-intron", :type => :numeric
  add_option "max-coverage-intron", :type => :numeric
  add_option "min-segment-intron", :type => :numeric
  add_option "max-segment-intron", :type => :numeric

  # SAM header options (sequencing run metadata embedded in the output).
  add_option "rg-id", :type => :string
  add_option "rg-sample", :type => :string
  add_option "rg-library", :type => :string
  add_option "rg-description", :type => :string
  add_option "rg-platform-unit", :type => :string
  add_option "rg-center", :type => :string
  add_option "rg-date", :type => :string
  add_option "rg-platform", :type => :string

end #Tophat
|
157
|
+
end #Ngs
|
158
|
+
end #Bio
|