bio-ngs 0.3.2.alpha.01

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,170 @@
1
+ #
2
+ # fastx.rb - The FASTX-Toolkit is a collection of command line tools for Short-Reads FASTA/FASTQ files preprocessing.
3
+ #
4
+ # Next-Generation sequencing machines usually produce FASTA or FASTQ files, containing multiple short-reads sequences (possibly with quality information).
5
+ #
6
+ # The main processing of such FASTA/FASTQ files is mapping (aka aligning) the sequences to reference genomes or other databases using specialized programs. Examples of such mapping programs are: Blat, SHRiMP, LastZ, MAQ and many others.
7
+ #
8
+ # However, it is sometimes more productive to preprocess the FASTA/FASTQ files before mapping the sequences to the genome -
9
+ # manipulating the sequences to produce better mapping results.
10
+ #
11
+ # The FASTX-Toolkit tools perform some of these preprocessing tasks.
12
+ # http://hannonlab.cshl.edu/fastx_toolkit/
13
+ #
14
+ # Copyright:: Copyright (C) 2011
15
+ # Raoul Bonnal <r@bioruby.org>
16
+ # License:: The Ruby License
17
+ #
18
+ # + Mapped
19
+ # - Not Yet Mapped
20
+ #
21
+ # - fastx_artifacts_filter
22
+ # - fastx_collapser
23
+ # + fastx_quality_stats
24
+ # - fastx_trimmer
25
+ # - fastx_barcode_splitter.pl
26
+ # - fastx_nucleotide_distribution_graph.sh
27
+ # - fastx_renamer
28
+ # - fastx_uncollapser
29
+ # - fastx_clipper
30
+ # - fastx_nucleotide_distribution_line_graph.sh
31
+ # - fastx_reverse_complement
32
+ # + fastq_coverage_graph.sh
33
+ # - fastq_masker
34
+ # + fastq_quality_boxplot_graph.sh
35
+ # - fastq_quality_converter
36
+ # - fastq_quality_filter
37
+ # - fastq_quality_trimmer
38
+ # - fastq_to_fasta
39
+
40
+
41
+
42
+ module Bio
43
+ module Ngs
44
+ module Fastx
45
+
46
+ # [-h] = This helpful help screen.
47
+ # [-t N] = Quality threshold - nucleotides with lower
48
+ # quality will be trimmed (from the end of the sequence).
49
+ # [-l N] = Minimum length - sequences shorter than this (after trimming)
50
+ # will be discarded. Default = 0 = no minimum length.
51
+ # [-z] = Compress output with GZIP.
52
+ # [-i INFILE] = FASTQ input file. default is STDIN.
53
+ # [-o OUTFILE] = FASTQ output file. default is STDOUT.
54
+ # [-v] = Verbose - report number of sequences.
55
+ # If [-o] is specified, report will be printed to STDOUT.
56
+ # If [-o] is not specified (and output goes to STDOUT),
57
+ # report will be printed to STDERR.
58
+ class Trim
59
+ include Bio::Command::Wrapper
60
+ set_program Bio::Ngs::Utils.binary("fastq_quality_trimmer")
61
+ use_aliases
62
+ add_option :min_size, :type=>:numeric, :default=>20, :aliases => "-l", :desc=>"Minimum length - sequences shorter than this (after trimming)
63
+ will be discarded. Default = 0 = no minimum length."
64
+ add_option :min_quality, :type=>:numeric, :default=>10, :aliases => "-t", :desc=>"Quality threshold - nucleotides with lower
65
+ quality will be trimmed (from the end of the sequence)."
66
+ add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file.", :collapse=>true
67
+ add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file.", :collapse=>true
68
+ add_option :gzip, :type => :boolean, :aliases => "-z", :desc => "Compress output with GZIP."
69
+ add_option :verbose, :type => :boolean, :aliases => "-v", :desc => "[-v] = Verbose - report number of sequences.
70
+ If [-o] is specified, report will be printed to STDOUT.
71
+ If [-o] is not specified (and output goes to STDOUT),
72
+ report will be printed to STDERR."
73
+ add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
74
+ end #Trim
75
+
76
+ # Solexa-Quality BoxPlot plotter
77
+ # Generates a solexa quality score box-plot graph
78
+ #
79
+ # Usage: /usr/local/bin/fastq_quality_boxplot_graph.sh [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]
80
+ #
81
+ # [-p] - Generate PostScript (.PS) file. Default is PNG image.
82
+ # [-i INPUT.TXT] - Input file. Should be the output of "solexa_quality_statistics" program.
83
+ # [-o OUTPUT] - Output file name. default is STDOUT.
84
+ # [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph.
85
+ class ReadsBoxPlot
86
+ include Bio::Command::Wrapper
87
+ set_program Bio::Ngs::Utils.binary("fastq_quality_boxplot_graph.sh")
88
+ use_aliases
89
+ add_option :ps, :type => :boolean, :aliases => "-p", :desc => "Generate PostScript (.PS) file. Default is PNG image."
90
+ add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file."
91
+ add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file."
92
+ add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
93
+ end #ReadsBoxPlot
94
+
95
+ # Solexa-Reads coverage plotter
96
+ # Generates a solexa line coverage graph
97
+ #
98
+ # Usage: /usr/local/bin/fastq_coverage_graph.sh [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]
99
+ #
100
+ # [-p] - Generate PostScript (.PS) file. Default is PNG image.
101
+ # [-i INPUT.TXT] - Input file. Should be the output of "solexa_quality_statistics" program.
102
+ # [-o OUTPUT] - Output file name. default is STDOUT.
103
+ # [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph.
104
+ class ReadsCoverage
105
+ include Bio::Command::Wrapper
106
+ set_program Bio::Ngs::Utils.binary("fastq_coverage_graph.sh")
107
+ use_aliases
108
+ add_option :ps, :type => :boolean, :aliases => "-p", :desc => "Generate PostScript (.PS) file. Default is PNG image."
109
+ add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file."
110
+ add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file."
111
+ add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
112
+ end #ReadsCoverage
113
+
114
+
115
+ # usage: fastx_quality_stats [-h] [-N] [-i INFILE] [-o OUTFILE]
116
+ # Part of FASTX Toolkit 0.0.13 by A. Gordon (gordon@cshl.edu)
117
+ #
118
+ # [-h] = This helpful help screen.
119
+ # [-i INFILE] = FASTQ input file. default is STDIN.
120
+ # [-o OUTFILE] = TEXT output file. default is STDOUT.
121
+ # [-N] = New output format (with more information per nucleotide/cycle).
122
+ #
123
+ # The *OLD* output TEXT file will have the following fields (one row per column):
124
+ # column = column number (1 to 36 for a 36-cycles read solexa file)
125
+ # count = number of bases found in this column.
126
+ # min = Lowest quality score value found in this column.
127
+ # max = Highest quality score value found in this column.
128
+ # sum = Sum of quality score values for this column.
129
+ # mean = Mean quality score value for this column.
130
+ # Q1 = 1st quartile quality score.
131
+ # med = Median quality score.
132
+ # Q3 = 3rd quartile quality score.
133
+ # IQR = Inter-Quartile range (Q3-Q1).
134
+ # lW = 'Left-Whisker' value (for boxplotting).
135
+ # rW = 'Right-Whisker' value (for boxplotting).
136
+ # A_Count = Count of 'A' nucleotides found in this column.
137
+ # C_Count = Count of 'C' nucleotides found in this column.
138
+ # G_Count = Count of 'G' nucleotides found in this column.
139
+ # T_Count = Count of 'T' nucleotides found in this column.
140
+ # N_Count = Count of 'N' nucleotides found in this column.
141
+ # max-count = max. number of bases (in all cycles)
142
+ #
143
+ #
144
+ # The *NEW* output format:
145
+ # cycle (previously called 'column') = cycle number
146
+ # max-count
147
+ # For each nucleotide in the cycle (ALL/A/C/G/T/N):
148
+ # count = number of bases found in this column.
149
+ # min = Lowest quality score value found in this column.
150
+ # max = Highest quality score value found in this column.
151
+ # sum = Sum of quality score values for this column.
152
+ # mean = Mean quality score value for this column.
153
+ # Q1 = 1st quartile quality score.
154
+ # med = Median quality score.
155
+ # Q3 = 3rd quartile quality score.
156
+ # IQR = Inter-Quartile range (Q3-Q1).
157
+ # lW = 'Left-Whisker' value (for boxplotting).
158
+ # rW = 'Right-Whisker' value (for boxplotting).
159
+ class FastqStats
160
+ include Bio::Command::Wrapper
161
+ set_program Bio::Ngs::Utils.binary("fastx_quality_stats")
162
+ use_aliases
163
+ add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file.", :collapse=>true
164
+ add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file.", :collapse=>true
165
+ add_option :new_format, :type => :boolean, :aliases => "-N", :desc => "New output format (with more information per nucleotide/cycle)."
166
+ end #ReadsCoverage
167
+
168
+ end #Fastx
169
+ end #Ngs
170
+ end #Bio
@@ -0,0 +1,118 @@
1
+ #
2
+ # samtools.rb - option wrappers for the samtools "view" and "merge" sub-commands
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+ #
10
+ # Program: samtools (Tools for alignments in the SAM format)
11
+ # Version: 0.1.16 (r963:234)
12
+ #
13
+ # Usage: samtools <command> [options]
14
+ #
15
+ # Command: view SAM<->BAM conversion
16
+ # sort sort alignment file
17
+ # pileup generate pileup output
18
+ # mpileup multi-way pileup
19
+ # depth compute the depth
20
+ # faidx index/extract FASTA
21
+ # tview text alignment viewer
22
+ # index index alignment
23
+ # idxstats BAM index stats (r595 or later)
24
+ # fixmate fix mate information
25
+ # glfview print GLFv3 file
26
+ # flagstat simple stats
27
+ # calmd recalculate MD/NM tags and '=' bases
28
+ # merge merge sorted alignments
29
+ # rmdup remove PCR duplicates
30
+ # reheader replace BAM header
31
+ # cat concatenate BAMs
32
+ # targetcut cut fosmid regions (for fosmid pool only)
33
+ # phase phase heterozygotes
34
+
35
+
36
+ module Bio
37
+ module Ngs
38
+ module Samtools
39
+
40
+ # Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]
41
+ #
42
+ # Options: -b output BAM
43
+ # -h print header for the SAM output
44
+ # -H print header only (no alignments)
45
+ # -S input is SAM
46
+ # -u uncompressed BAM output (force -b)
47
+ # -1 fast compression (force -b)
48
+ # -x output FLAG in HEX (samtools-C specific)
49
+ # -X output FLAG in string (samtools-C specific)
50
+ # -c print only the count of matching records
51
+ # -L FILE output alignments overlapping the input BED FILE [null]
52
+ # -t FILE list of reference names and lengths (force -S) [null]
53
+ # -T FILE reference sequence file (force -S) [null]
54
+ # -o FILE output file name [stdout]
55
+ # -R FILE list of read groups to be outputted [null]
56
+ # -f INT required flag, 0 for unset [0]
57
+ # -F INT filtering flag, 0 for unset [0]
58
+ # -q INT minimum mapping quality [0]
59
+ # -l STR only output reads in library STR [null]
60
+ # -r STR only output reads in read group STR [null]
61
+ # -? longer help
62
+ class View
63
+ include Bio::Command::Wrapper
64
+ set_program Bio::Ngs::Utils.binary("samtools")
65
+ set_sub_program "view"
66
+ use_aliases
67
+ add_option :bam_output, :type => :boolean, :aliases => "-b", :desc => "output BAM", :default => true
68
+ add_option :print_header_alignment, :type => :boolean, :aliases => "-h", :desc => "print header for the SAM output"
69
+ add_option :print_header_only, :type => :boolean, :aliases => "-H", :desc => "print header only (no alignments)"
70
+ add_option :sam_input, :type => :boolean, :aliases => "-S", :desc => "input is SAM"
71
+ add_option :uncompress, :type => :boolean, :aliases => "-u", :desc => "uncompressed BAM output (force -b)"
72
+ add_option :compress, :type => :boolean , :aliases => "-1", :desc => "fast compression (force -b)"
73
+ add_option :flag_hex, :type => :boolean, :aliases => "-x", :desc => "output FLAG in HEX (samtools-C specific)"
74
+ add_option :flag_string, :type => :boolean, :aliases => "-X", :desc => "output FLAS is string (samtools-C specific)"
75
+ add_option :output_alignment, :type => :string, :aliases => "-L", :desc => "output alignments overlapping the input BED FILE [null]"
76
+ add_option :list_ref, :type => :string, :aliases => "-t", :desc => "list of reference names and lengths (force -S) [null]"
77
+ add_option :ref_sequence, :type => :string, :aliases => "-T", :desc => "reference sequence file (force -S) [null]"
78
+ add_option :output, :type => :string, :aliases => "-o", :desc => "output file name [stdout]", :required => true
79
+ add_option :list_group, :type => :string, :aliases => "-R", :desc => "list of read groups to be outputted [null]"
80
+ add_option :required_flag, :type => :numeric, :aliases => "-f", :desc => "required flag, 0 for unset [0]"
81
+ add_option :filtering_flag, :type => :numeric, :aliases => "-F", :desc => "filtering flag, 0 for unset [0]"
82
+ add_option :min_map_qual, :type => :numeric, :aliases => "-q", :desc => "minimum mapping quality [0]"
83
+ add_option :only_lib_reads, :type => :string, :aliases => "-l", :desc => "only output reads in library STR [null]"
84
+ add_option :only_grp_reads, :type => :string, :aliases => "r", :desc => "only output reads in read group STR [null]"
85
+
86
+ end #View
87
+
88
+ # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
89
+ #
90
+ # Options: -n sort by read names
91
+ # -r attach RG tag (inferred from file names)
92
+ # -u uncompressed BAM output
93
+ # -f overwrite the output BAM if exist
94
+ # -1 compress level 1
95
+ # -R STR merge file in the specified region STR [all]
96
+ # -h FILE copy the header in FILE to <out.bam> [in1.bam]
97
+ #
98
+ # Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users
99
+ # must provide the correct header with -h, or uses Picard which properly maintains
100
+ # the header dictionary in merging.
101
+ # <out.bam> <in1.bam> <in2.bam> [...] are positional arguments and must be passed as arguments, not as options.
102
+ class Merge # option wrapper for the "samtools merge" sub-command
103
+ include Bio::Command::Wrapper
104
+ set_program Bio::Ngs::Utils.binary("samtools")
105
+ set_sub_program "merge"
106
+ use_aliases
107
+ add_option :sort_by_read_name, :type => :boolean, :aliases => "-n", :desc => "sort by read names"
108
+ add_option :attach_rg, :type => :boolean, :aliases => "-r", :desc => "attach RG tag (inferred from file names)"
109
+ add_option :uncompress, :type => :boolean, :aliases => "-u", :desc => "uncompressed BAM output"
110
+ add_option :overwrite_output, :type => :boolean, :aliases => "-f", :desc => "overwrite the output BAM if exist"
111
+ add_option :compress, :type => :boolean , :aliases => "-1", :desc => "compress level 1"
112
+ add_option :merge_regions, :type => :string, :aliases => "-R", :desc => "merge file in the specified region STR [all]"
113
+ add_option :copy_header, :type => :string, :aliases => "-h", :desc => "copy the header in FILE to <out.bam> [in1.bam]"
114
+ end #Merge
115
+
116
+ end #Samtools
117
+ end #Ngs
118
+ end #Bio
@@ -0,0 +1,23 @@
1
+ module Bio
2
+ module Ngs
3
+ class SffExtract
4
+ # Option wrapper for the sff_extract executable; each add_option below declares one of its flags.
5
+ include Bio::Command::Wrapper
6
+ # The executable path is resolved through Bio::Ngs::Utils.binary.
7
+ set_program Bio::Ngs::Utils.binary("sff_extract")
8
+ add_option "append", :type => :boolean, :aliases => "-a", :desc => "append output to existing files"
9
+ add_option "xml_info", :type => :string, :aliases => "-i", :desc => "extra info to write in the xml file"
10
+ add_option "linker_file", :type => :string, :aliases => "-l", :desc => "FASTA file with paired-end linker sequences"
11
+ add_option "clip", :type => :boolean, :aliases => "-c", :desc => "clip (completely remove) ends with low qual and/or adaptor sequence"
12
+ add_option "upper_case", :type => :boolean, :aliases => "-u", :desc => "all bases in upper case, including clipped ends"
13
+ add_option "min_left_clip", :type => :numeric, :desc => "if the left clip coming from the SFF is smaller than this value, override it"
14
+ add_option "fastq", :type => :boolean, :aliases => "-Q", :desc => "store as FASTQ file instead of FASTA + FASTA quality file"
15
+ add_option "out_basename", :type => :string, :aliases => "-o", :desc => "base name for all output files"
16
+ add_option "seq_file", :type => :string, :aliases => "-s", :desc => "output sequence file name"
17
+ add_option "qual_file", :type => :string, :aliases => "-q", :desc => "output quality file name"
18
+ add_option "xml_file", :type => :string, :aliases => "-x", :desc => "output ancillary xml file name"
19
+ # NOTE(review): use_aliases is not called in this class and min_left_clip has no short alias - confirm both are intended.
20
+
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,158 @@
1
+ #
2
+ # tophat.rb - option wrapper for the TopHat spliced read mapper
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+
11
+ #require 'bio/command'
12
+ #require 'shellwords'
13
+ #require 'thor'
14
+ #require 'bio/ngs/utils'
15
+
16
+ # TopHat maps short sequences from spliced transcripts to whole genomes.
17
+ #
18
+ # Usage:
19
+ # tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \
20
+ # [quals1,[quals2,...]] [quals1[,quals2,...]]
21
+ #
22
+ # Options:
23
+ # -v/--version
24
+ # -o/--output-dir <string> [ default: ./tophat_out ]
25
+ # -a/--min-anchor <int> [ default: 8 ]
26
+ # -m/--splice-mismatches <0-2> [ default: 0 ]
27
+ # -i/--min-intron-length <int> [ default: 50 ]
28
+ # -I/--max-intron-length <int> [ default: 500000 ]
29
+ # -g/--max-multihits <int> [ default: 20 ]
30
+ # -F/--min-isoform-fraction <float> [ default: 0.15 ]
31
+ # --max-insertion-length <int> [ default: 3 ]
32
+ # --max-deletion-length <int> [ default: 3 ]
33
+ # --solexa-quals
34
+ # --solexa1.3-quals (same as phred64-quals)
35
+ # --phred64-quals (same as solexa1.3-quals)
36
+ # -Q/--quals
37
+ # --integer-quals
38
+ # -C/--color (Solid - color space)
39
+ # --color-out
40
+ # --library-type <string> (fr-unstranded, fr-firststrand,
41
+ # fr-secondstrand)
42
+ # -p/--num-threads <int> [ default: 1 ]
43
+ # -G/--GTF <filename>
44
+ # -j/--raw-juncs <filename>
45
+ # --insertions <filename>
46
+ # --deletions <filename>
47
+ # -r/--mate-inner-dist <int>
48
+ # --mate-std-dev <int> [ default: 20 ]
49
+ # --no-novel-juncs
50
+ # --no-novel-indels
51
+ # --no-gtf-juncs
52
+ # --no-coverage-search
53
+ # --coverage-search
54
+ # --no-closure-search
55
+ # --closure-search
56
+ # --microexon-search
57
+ # --butterfly-search
58
+ # --no-butterfly-search
59
+ # --keep-tmp
60
+ # --tmp-dir <dirname> [ default: <output_dir>/tmp ]
61
+ # -z/--zpacker <program> [ default: gzip ]
62
+ # -X/--unmapped-fifo [ use mkfifo to compress more temporary files]
63
+ #
64
+ # Advanced Options:
65
+ # --initial-read-mismatches <int> [ default: 2 ]
66
+ # --segment-mismatches <int> [ default: 2 ]
67
+ # --segment-length <int> [ default: 25 ]
68
+ # --bowtie-n [ default: bowtie -v ]
69
+ # --min-closure-exon <int> [ default: 100 ]
70
+ # --min-closure-intron <int> [ default: 50 ]
71
+ # --max-closure-intron <int> [ default: 5000 ]
72
+ # --min-coverage-intron <int> [ default: 50 ]
73
+ # --max-coverage-intron <int> [ default: 20000 ]
74
+ # --min-segment-intron <int> [ default: 50 ]
75
+ # --max-segment-intron <int> [ default: 500000 ]
76
+ # --no-sort-bam [Output BAM is not coordinate-sorted]
77
+ # --no-convert-bam [Do not convert to bam format.
78
+ # Output is <output_dir>accepted_hit.sam.
79
+ # Implies --no-sort-bam.]
80
+ #
81
+ # SAM Header Options (for embedding sequencing run metadata in output):
82
+ # --rg-id <string> (read group ID)
83
+ # --rg-sample <string> (sample ID)
84
+ # --rg-library <string> (library ID)
85
+ # --rg-description <string> (descriptive string, no tabs allowed)
86
+ # --rg-platform-unit <string> (e.g Illumina lane ID)
87
+ # --rg-center <string> (sequencing center name)
88
+ # --rg-date <string> (ISO 8601 date of the sequencing run)
89
+ # --rg-platform <string> (Sequencing platform descriptor)
90
+ #
91
+ # for detailed help see http://tophat.cbcb.umd.edu/manual.html
92
+
93
+
94
+ module Bio
95
+ module Ngs
96
+ class Tophat
97
+
98
+ include Bio::Command::Wrapper
99
+
100
+ set_program Bio::Ngs::Utils.binary("tophat")
101
+
102
+ add_option "output-dir",:type => :string, :aliases => '-o'
103
+ add_option "min-anchor", :type => :numeric, :aliases => '-a'
104
+ add_option "splice-mismatches", :type => :numeric, :aliases => '-m'
105
+ add_option "min-intron-length", :type => :numeric , :aliases => '-i'
106
+ add_option "max-intron-length", :type => :numeric, :aliases => '-I'
107
+ add_option "max-multihits", :type => :numeric, :aliases => '-g'
108
+ add_option "min-isoform_fraction", :type => :numeric, :aliases => '-F'
109
+ add_option "max-insertion-length", :type => :numeric
110
+ add_option "max-deletion-length", :type => :numeric
111
+ add_option "solexa-quals", :type => :boolean
112
+ add_option "solexa1.3-quals", :type => :boolean, :aliases => '--phred64-quals'
113
+ add_option :quals, :type => :boolean, :aliases => '-Q'
114
+ add_option "integer-quals", :type => :boolean
115
+ add_option :color, :type => :boolean, :aliases => '-C'
116
+ add_option "library-type", :type => :string
117
+ add_option "num-threads", :type => :numeric, :aliases => '-p'
118
+ add_option "GTF", :type => :string, :aliases => '-G'
119
+ add_option "raw-juncs", :type => :string, :aliases => '-j'
120
+ add_option :insertions, :type => :string
121
+ add_option :deletions, :type => :string
122
+ add_option "mate-inner-dist", :type=>:numeric, :aliases => '-r'
123
+ add_option "mate-std-dev", :type => :numeric
124
+ add_option "no-novel-juncs", :type => :boolean
125
+ add_option "allow-indels", :type => :boolean
126
+ add_option "no-novel-indels", :type => :boolean
127
+ add_option "no-gtf-juncs", :type => :boolean
128
+ add_option "no-coverage-search", :type => :boolean
129
+ add_option "coverage-search", :type => :boolean
130
+ add_option "no-closure-search", :type => :boolean
131
+ add_option "closure-search", :type => :boolean
132
+ add_option "fill-gaps", :type => :boolean
133
+ add_option "microexon-search", :type => :boolean
134
+ add_option "butterfly-search", :type => :boolean
135
+ add_option "no-butterfly-search", :type => :boolean
136
+ add_option "keep-tmp", :type => :boolean
137
+ add_option "tmp-dir", :type => :string
138
+ add_option "segment-mismatches", :type => :numeric
139
+ add_option "segment-length", :type => :numeric
140
+ add_option "min-closure-exon", :type => :numeric
141
+ add_option "min-closure-intron", :type => :numeric
142
+ add_option "max-closure-intron", :type => :numeric
143
+ add_option "min-coverage-intron", :type => :numeric
144
+ add_option "max-coverage-intron", :type => :numeric
145
+ add_option "min-segment-intron", :type => :numeric
146
+ add_option "max-segment-intron", :type => :numeric
147
+ add_option "rg-id", :type => :string
148
+ add_option "rg-sample", :type => :string
149
+ add_option "rg-library", :type => :string
150
+ add_option "rg-description", :type => :string
151
+ add_option "rg-platform-unit", :type => :string
152
+ add_option "rg-center", :type => :string
153
+ add_option "rg-date", :type => :string
154
+ add_option "rg-platform", :type => :string
155
+
156
+ end #That
157
+ end #Ngs
158
+ end #Bio