bio-ngs 0.3.2.alpha.01

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,170 @@
1
+ #
2
+ # fastx.rb - The FASTX-Toolkit is a collection of command line tools for Short-Reads FASTA/FASTQ files preprocessing.
3
+ #
4
+ # Next-Generation sequencing machines usually produce FASTA or FASTQ files, containing multiple short-reads sequences (possibly with quality information).
5
+ #
6
+ # The main processing of such FASTA/FASTQ files is mapping (aka aligning) the sequences to reference genomes or other databases using specialized programs. Example of such mapping programs are: Blat, SHRiMP, LastZ, MAQ and many many others.
7
+ #
8
+ # However,
9
+ # It is sometimes more productive to preprocess the FASTA/FASTQ files before mapping the sequences to the genome - manipulating the sequences to produce better mapping results.
10
+ #
11
+ # The FASTX-Toolkit tools perform some of these preprocessing tasks.
12
+ # http://hannonlab.cshl.edu/fastx_toolkit/
13
+ #
14
+ # Copyright:: Copyright (C) 2011
15
+ # Raoul Bonnal <r@bioruby.org>
16
+ # License:: The Ruby License
17
+ #
18
+ # + Mapped
19
+ # - Not Yet Mapped
20
+ #
21
+ # - fastx_artifacts_filter
22
+ # - fastx_collapser
23
+ # + fastx_quality_stats
24
+ # - fastx_trimmer
25
+ # - fastx_barcode_splitter.pl
26
+ # - fastx_nucleotide_distribution_graph.sh
27
+ # - fastx_renamer
28
+ # - fastx_uncollapser
29
+ # - fastx_clipper
30
+ # - fastx_nucleotide_distribution_line_graph.sh
31
+ # - fastx_reverse_complement
32
+ # + fastq_coverage_graph.sh
33
+ # - fastq_masker
34
+ # + fastq_quality_boxplot_graph.sh
35
+ # - fastq_quality_converter
36
+ # - fastq_quality_filter
37
+ # - fastq_quality_trimmer
38
+ # - fastq_to_fasta
39
+
40
+
41
+
42
+ module Bio
43
+ module Ngs
44
+ module Fastx
45
+
46
+ # [-h] = This helpful help screen.
47
+ # [-t N] = Quality threshold - nucleotides with lower
48
+ # quality will be trimmed (from the end of the sequence).
49
+ # [-l N] = Minimum length - sequences shorter than this (after trimming)
50
+ # will be discarded. Default = 0 = no minimum length.
51
+ # [-z] = Compress output with GZIP.
52
+ # [-i INFILE] = FASTQ input file. default is STDIN.
53
+ # [-o OUTFILE] = FASTQ output file. default is STDOUT.
54
+ # [-v] = Verbose - report number of sequences.
55
+ # If [-o] is specified, report will be printed to STDOUT.
56
+ # If [-o] is not specified (and output goes to STDOUT),
57
+ # report will be printed to STDERR.
58
# Wrapper class for the FASTX-Toolkit +fastq_quality_trimmer+ executable.
#
# Every add_option call declares one command line switch; the :aliases value
# is the short flag listed in the tool's usage text.
#
# NOTE(review): :collapse is consumed by Bio::Command::Wrapper, which is not
# visible from this file - presumably it controls how the flag and its value
# are joined on the command line; confirm in wrapper.rb.
class Trim
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("fastq_quality_trimmer")
  use_aliases

  # The wrapper ships its own default (20), so the help text no longer claims
  # "Default = 0"; it only explains what the value 0 means.
  add_option :min_size, :type => :numeric, :default => 20, :aliases => "-l",
             :desc => "Minimum length - sequences shorter than this (after trimming) " \
                      "will be discarded. 0 = no minimum length."
  add_option :min_quality, :type => :numeric, :default => 10, :aliases => "-t",
             :desc => "Quality threshold - nucleotides with lower quality " \
                      "will be trimmed (from the end of the sequence)."
  add_option :output, :type => :string, :aliases => "-o", :desc => "FASTQ output file.", :collapse => true
  add_option :input, :type => :string, :aliases => "-i", :desc => "FASTQ input file.", :collapse => true
  add_option :gzip, :type => :boolean, :aliases => "-z", :desc => "Compress output with GZIP."
  # Help text is kept on a single logical line so that source line breaks and
  # indentation do not leak into what the user sees.
  add_option :verbose, :type => :boolean, :aliases => "-v",
             :desc => "Verbose - report number of sequences. " \
                      "If [-o] is specified, report will be printed to STDOUT. " \
                      "If [-o] is not specified (and output goes to STDOUT), " \
                      "report will be printed to STDERR."
  add_option :quality_type, :type => :numeric, :default => 33, :aliases => "-Q", :desc => "Quality of fastq file"
end # Trim
75
+
76
+ # Solexa-Quality BoxPlot plotter
77
+ # Generates a solexa quality score box-plot graph
78
+ #
79
+ # Usage: /usr/local/bin/fastq_quality_boxplot_graph.sh [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]
80
+ #
81
+ # [-p] - Generate PostScript (.PS) file. Default is PNG image.
82
+ # [-i INPUT.TXT] - Input file. Should be the output of "solexa_quality_statistics" program.
83
+ # [-o OUTPUT] - Output file name. default is STDOUT.
84
+ # [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph.
85
# Wrapper class for the FASTX-Toolkit +fastq_quality_boxplot_graph.sh+ script.
#
# The option descriptions follow the script's usage text: the input is a
# statistics text file, not a FASTQ file, and the output is the generated
# graph.
class ReadsBoxPlot
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("fastq_quality_boxplot_graph.sh")
  use_aliases

  add_option :ps, :type => :boolean, :aliases => "-p", :desc => "Generate PostScript (.PS) file. Default is PNG image."
  # The descriptions used to read "FASTQ output/input file." (copied from the
  # trimmer); they now match the usage text of this script.
  add_option :output, :type => :string, :aliases => "-o", :desc => "Output file name. default is STDOUT."
  add_option :input, :type => :string, :aliases => "-i", :desc => "Input file. Should be the output of \"solexa_quality_statistics\" program."
  add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
end # ReadsBoxPlot
94
+
95
+ # Solexa-Reads coverage plotter
96
+ # Generates a solexa line coverage graph
97
+ #
98
+ # Usage: /usr/local/bin/fastq_coverage_graph.sh [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]
99
+ #
100
+ # [-p] - Generate PostScript (.PS) file. Default is PNG image.
101
+ # [-i INPUT.TXT] - Input file. Should be the output of "solexa_quality_statistics" program.
102
+ # [-o OUTPUT] - Output file name. default is STDOUT.
103
+ # [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph.
104
# Wrapper class for the +fastq_coverage_graph.sh+ script.
#
# The option descriptions follow the script's usage text: the input is a
# statistics text file, not a FASTQ file, and the output is the generated
# graph.
class ReadsCoverage
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("fastq_coverage_graph.sh")
  use_aliases

  add_option :ps, :type => :boolean, :aliases => "-p", :desc => "Generate PostScript (.PS) file. Default is PNG image."
  # The descriptions used to read "FASTQ output/input file." (copied from the
  # trimmer); they now match the usage text of this script.
  add_option :output, :type => :string, :aliases => "-o", :desc => "Output file name. default is STDOUT."
  add_option :input, :type => :string, :aliases => "-i", :desc => "Input file. Should be the output of \"solexa_quality_statistics\" program."
  add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
end # ReadsCoverage
113
+
114
+
115
+ # usage: fastx_quality_stats [-h] [-N] [-i INFILE] [-o OUTFILE]
116
+ # Part of FASTX Toolkit 0.0.13 by A. Gordon (gordon@cshl.edu)
117
+ #
118
+ # [-h] = This helpful help screen.
119
+ # [-i INFILE] = FASTQ input file. default is STDIN.
120
+ # [-o OUTFILE] = TEXT output file. default is STDOUT.
121
+ # [-N] = New output format (with more information per nucleotide/cycle).
122
+ #
123
+ # The *OLD* output TEXT file will have the following fields (one row per column):
124
+ # column = column number (1 to 36 for a 36-cycles read solexa file)
125
+ # count = number of bases found in this column.
126
+ # min = Lowest quality score value found in this column.
127
+ # max = Highest quality score value found in this column.
128
+ # sum = Sum of quality score values for this column.
129
+ # mean = Mean quality score value for this column.
130
+ # Q1 = 1st quartile quality score.
131
+ # med = Median quality score.
132
+ # Q3 = 3rd quartile quality score.
133
+ # IQR = Inter-Quartile range (Q3-Q1).
134
+ # lW = 'Left-Whisker' value (for boxplotting).
135
+ # rW = 'Right-Whisker' value (for boxplotting).
136
+ # A_Count = Count of 'A' nucleotides found in this column.
137
+ # C_Count = Count of 'C' nucleotides found in this column.
138
+ # G_Count = Count of 'G' nucleotides found in this column.
139
+ # T_Count = Count of 'T' nucleotides found in this column.
140
+ # N_Count = Count of 'N' nucleotides found in this column.
141
+ # max-count = max. number of bases (in all cycles)
142
+ #
143
+ #
144
+ # The *NEW* output format:
145
+ # cycle (previously called 'column') = cycle number
146
+ # max-count
147
+ # For each nucleotide in the cycle (ALL/A/C/G/T/N):
148
+ # count = number of bases found in this column.
149
+ # min = Lowest quality score value found in this column.
150
+ # max = Highest quality score value found in this column.
151
+ # sum = Sum of quality score values for this column.
152
+ # mean = Mean quality score value for this column.
153
+ # Q1 = 1st quartile quality score.
154
+ # med = Median quality score.
155
+ # Q3 = 3rd quartile quality score.
156
+ # IQR = Inter-Quartile range (Q3-Q1).
157
+ # lW = 'Left-Whisker' value (for boxplotting).
158
+ # rW = 'Right-Whisker' value (for boxplotting).
159
# Wrapper class for the FASTX-Toolkit +fastx_quality_stats+ executable.
#
# Reads a FASTQ file and writes a TEXT table of per-column quality
# statistics, as described in the tool's usage text.
#
# NOTE(review): :collapse is consumed by Bio::Command::Wrapper, which is not
# visible from this file; confirm its exact effect in wrapper.rb.
class FastqStats
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("fastx_quality_stats")
  use_aliases

  # The tool emits a text report, so the description no longer says "FASTQ".
  add_option :output, :type => :string, :aliases => "-o", :desc => "TEXT output file.", :collapse => true
  add_option :input, :type => :string, :aliases => "-i", :desc => "FASTQ input file.", :collapse => true
  add_option :new_format, :type => :boolean, :aliases => "-N", :desc => "New output format (with more information per nucleotide/cycle)."
end # FastqStats
167
+
168
+ end #Fastx
169
+ end #Ngs
170
+ end #Bio
@@ -0,0 +1,118 @@
1
+ #
2
+ # samtools.rb - Bio::Command::Wrapper bindings for the samtools sub-commands view and merge
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+ #
10
+ # Program: samtools (Tools for alignments in the SAM format)
11
+ # Version: 0.1.16 (r963:234)
12
+ #
13
+ # Usage: samtools <command> [options]
14
+ #
15
+ # Command: view SAM<->BAM conversion
16
+ # sort sort alignment file
17
+ # pileup generate pileup output
18
+ # mpileup multi-way pileup
19
+ # depth compute the depth
20
+ # faidx index/extract FASTA
21
+ # tview text alignment viewer
22
+ # index index alignment
23
+ # idxstats BAM index stats (r595 or later)
24
+ # fixmate fix mate information
25
+ # glfview print GLFv3 file
26
+ # flagstat simple stats
27
+ # calmd recalculate MD/NM tags and '=' bases
28
+ # merge merge sorted alignments
29
+ # rmdup remove PCR duplicates
30
+ # reheader replace BAM header
31
+ # cat concatenate BAMs
32
+ # targetcut cut fosmid regions (for fosmid pool only)
33
+ # phase phase heterozygotes
34
+
35
+
36
+ module Bio
37
+ module Ngs
38
+ module Samtools
39
+
40
+ # Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]
41
+ #
42
+ # Options: -b output BAM
43
+ # -h print header for the SAM output
44
+ # -H print header only (no alignments)
45
+ # -S input is SAM
46
+ # -u uncompressed BAM output (force -b)
47
+ # -1 fast compression (force -b)
48
+ # -x output FLAG in HEX (samtools-C specific)
49
+ # -X output FLAG in string (samtools-C specific)
50
+ # -c print only the count of matching records
51
+ # -L FILE output alignments overlapping the input BED FILE [null]
52
+ # -t FILE list of reference names and lengths (force -S) [null]
53
+ # -T FILE reference sequence file (force -S) [null]
54
+ # -o FILE output file name [stdout]
55
+ # -R FILE list of read groups to be outputted [null]
56
+ # -f INT required flag, 0 for unset [0]
57
+ # -F INT filtering flag, 0 for unset [0]
58
+ # -q INT minimum mapping quality [0]
59
+ # -l STR only output reads in library STR [null]
60
+ # -r STR only output reads in read group STR [null]
61
+ # -? longer help
62
# Wrapper class for the "view" sub-command of samtools.
#
# Each add_option call maps a descriptive option name to the single-letter
# switch listed in the samtools usage text. In this wrapper :bam_output
# defaults to true and :output is declared as required.
class View
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("samtools")
  set_sub_program "view"
  use_aliases

  add_option :bam_output, :type => :boolean, :aliases => "-b", :desc => "output BAM", :default => true
  add_option :print_header_alignment, :type => :boolean, :aliases => "-h", :desc => "print header for the SAM output"
  add_option :print_header_only, :type => :boolean, :aliases => "-H", :desc => "print header only (no alignments)"
  add_option :sam_input, :type => :boolean, :aliases => "-S", :desc => "input is SAM"
  add_option :uncompress, :type => :boolean, :aliases => "-u", :desc => "uncompressed BAM output (force -b)"
  add_option :compress, :type => :boolean, :aliases => "-1", :desc => "fast compression (force -b)"
  add_option :flag_hex, :type => :boolean, :aliases => "-x", :desc => "output FLAG in HEX (samtools-C specific)"
  # Description typo fixed: it used to read "output FLAS is string".
  add_option :flag_string, :type => :boolean, :aliases => "-X", :desc => "output FLAG in string (samtools-C specific)"
  add_option :output_alignment, :type => :string, :aliases => "-L", :desc => "output alignments overlapping the input BED FILE [null]"
  add_option :list_ref, :type => :string, :aliases => "-t", :desc => "list of reference names and lengths (force -S) [null]"
  add_option :ref_sequence, :type => :string, :aliases => "-T", :desc => "reference sequence file (force -S) [null]"
  add_option :output, :type => :string, :aliases => "-o", :desc => "output file name [stdout]", :required => true
  add_option :list_group, :type => :string, :aliases => "-R", :desc => "list of read groups to be outputted [null]"
  add_option :required_flag, :type => :numeric, :aliases => "-f", :desc => "required flag, 0 for unset [0]"
  add_option :filtering_flag, :type => :numeric, :aliases => "-F", :desc => "filtering flag, 0 for unset [0]"
  add_option :min_map_qual, :type => :numeric, :aliases => "-q", :desc => "minimum mapping quality [0]"
  add_option :only_lib_reads, :type => :string, :aliases => "-l", :desc => "only output reads in library STR [null]"
  # The alias was "r" without the leading dash, unlike every other switch.
  add_option :only_grp_reads, :type => :string, :aliases => "-r", :desc => "only output reads in read group STR [null]"
end # View
87
+
88
+ # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
89
+ #
90
+ # Options: -n sort by read names
91
+ # -r attach RG tag (inferred from file names)
92
+ # -u uncompressed BAM output
93
+ # -f overwrite the output BAM if exist
94
+ # -1 compress level 1
95
+ # -R STR merge file in the specified region STR [all]
96
+ # -h FILE copy the header in FILE to <out.bam> [in1.bam]
97
+ #
98
+ # Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users
99
+ # must provide the correct header with -h, or uses Picard which properly maintains
100
+ # the header dictionary in merging.
101
+ #out, in1, in2, ... inx Must be passed as arguments
102
# Binding for the "merge" sub-command of samtools.
#
# Only the switches are declared here; per the usage text the output BAM and
# the input BAM files are passed as plain arguments.
class Merge
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("samtools")
  set_sub_program "merge"
  use_aliases

  # One row per switch: option name, value type, short flag, help text.
  # An Array (not a Hash) keeps the declaration order of the original code.
  [
    [:sort_by_read_name, :boolean, "-n", "sort by read names"],
    [:attach_rg,         :boolean, "-r", "attach RG tag (inferred from file names)"],
    [:uncompress,        :boolean, "-u", "uncompressed BAM output"],
    [:overwrite_output,  :boolean, "-f", "overwrite the output BAM if exist"],
    [:compress,          :boolean, "-1", "compress level 1"],
    [:merge_regions,     :string,  "-R", "merge file in the specified region STR [all]"],
    [:copy_header,       :string,  "-h", "copy the header in FILE to <out.bam> [in1.bam]"]
  ].each do |opt_name, opt_type, flag, help|
    add_option opt_name, :type => opt_type, :aliases => flag, :desc => help
  end
end # Merge
115
+
116
+ end #Samtools
117
+ end #Ngs
118
+ end #Bio
@@ -0,0 +1,23 @@
1
+ module Bio
2
+ module Ngs
3
# Binding for the bundled +sff_extract+ program.
#
# Options are declared with their long names; a short flag is attached only
# where the declaration table provides one (min_left_clip has none).
class SffExtract
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("sff_extract")

  # One row per option: long name, value type, short flag (nil when the
  # option has no short form), help text.
  [
    ["append",        :boolean, "-a", "append output to existing files"],
    ["xml_info",      :string,  "-i", "extra info to write in the xml file"],
    ["linker_file",   :string,  "-l", "FASTA file with paired-end linker sequences"],
    ["clip",          :boolean, "-c", "clip (completely remove) ends with low qual and/or adaptor sequence"],
    ["upper_case",    :boolean, "-u", "all bases in upper case, including clipped ends"],
    ["min_left_clip", :numeric, nil,  "if the left clip coming from the SFF is smaller than this value, override it"],
    ["fastq",         :boolean, "-Q", "store as FASTQ file instead of FASTA + FASTA quality file"],
    ["out_basename",  :string,  "-o", "base name for all output files"],
    ["seq_file",      :string,  "-s", "output sequence file name"],
    ["qual_file",     :string,  "-q", "output quality file name"],
    ["xml_file",      :string,  "-x", "output ancillary xml file name"]
  ].each do |opt_name, opt_type, flag, help|
    settings = { :type => opt_type }
    # Leave :aliases out entirely when there is no short flag, exactly as a
    # declaration without that key would.
    settings[:aliases] = flag unless flag.nil?
    settings[:desc] = help
    add_option opt_name, settings
  end
end
22
+ end
23
+ end
@@ -0,0 +1,158 @@
1
+ #
2
+ # tophat.rb - Bio::Command::Wrapper binding for the TopHat spliced-read mapper
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+
11
+ #require 'bio/command'
12
+ #require 'shellwords'
13
+ #require 'thor'
14
+ #require 'bio/ngs/utils'
15
+
16
+ # TopHat maps short sequences from spliced transcripts to whole genomes.
17
+ #
18
+ # Usage:
19
+ # tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \
20
+ # [quals1,[quals2,...]] [quals1[,quals2,...]]
21
+ #
22
+ # Options:
23
+ # -v/--version
24
+ # -o/--output-dir <string> [ default: ./tophat_out ]
25
+ # -a/--min-anchor <int> [ default: 8 ]
26
+ # -m/--splice-mismatches <0-2> [ default: 0 ]
27
+ # -i/--min-intron-length <int> [ default: 50 ]
28
+ # -I/--max-intron-length <int> [ default: 500000 ]
29
+ # -g/--max-multihits <int> [ default: 20 ]
30
+ # -F/--min-isoform-fraction <float> [ default: 0.15 ]
31
+ # --max-insertion-length <int> [ default: 3 ]
32
+ # --max-deletion-length <int> [ default: 3 ]
33
+ # --solexa-quals
34
+ # --solexa1.3-quals (same as phred64-quals)
35
+ # --phred64-quals (same as solexa1.3-quals)
36
+ # -Q/--quals
37
+ # --integer-quals
38
+ # -C/--color (Solid - color space)
39
+ # --color-out
40
+ # --library-type <string> (fr-unstranded, fr-firststrand,
41
+ # fr-secondstrand)
42
+ # -p/--num-threads <int> [ default: 1 ]
43
+ # -G/--GTF <filename>
44
+ # -j/--raw-juncs <filename>
45
+ # --insertions <filename>
46
+ # --deletions <filename>
47
+ # -r/--mate-inner-dist <int>
48
+ # --mate-std-dev <int> [ default: 20 ]
49
+ # --no-novel-juncs
50
+ # --no-novel-indels
51
+ # --no-gtf-juncs
52
+ # --no-coverage-search
53
+ # --coverage-search
54
+ # --no-closure-search
55
+ # --closure-search
56
+ # --microexon-search
57
+ # --butterfly-search
58
+ # --no-butterfly-search
59
+ # --keep-tmp
60
+ # --tmp-dir <dirname> [ default: <output_dir>/tmp ]
61
+ # -z/--zpacker <program> [ default: gzip ]
62
+ # -X/--unmapped-fifo [ use mkfifo to compress more temporary files]
63
+ #
64
+ # Advanced Options:
65
+ # --initial-read-mismatches <int> [ default: 2 ]
66
+ # --segment-mismatches <int> [ default: 2 ]
67
+ # --segment-length <int> [ default: 25 ]
68
+ # --bowtie-n [ default: bowtie -v ]
69
+ # --min-closure-exon <int> [ default: 100 ]
70
+ # --min-closure-intron <int> [ default: 50 ]
71
+ # --max-closure-intron <int> [ default: 5000 ]
72
+ # --min-coverage-intron <int> [ default: 50 ]
73
+ # --max-coverage-intron <int> [ default: 20000 ]
74
+ # --min-segment-intron <int> [ default: 50 ]
75
+ # --max-segment-intron <int> [ default: 500000 ]
76
+ # --no-sort-bam [Output BAM is not coordinate-sorted]
77
+ # --no-convert-bam [Do not convert to bam format.
78
+ # Output is <output_dir>accepted_hit.sam.
79
+ # Implies --no-sort-bam.]
80
+ #
81
+ # SAM Header Options (for embedding sequencing run metadata in output):
82
+ # --rg-id <string> (read group ID)
83
+ # --rg-sample <string> (sample ID)
84
+ # --rg-library <string> (library ID)
85
+ # --rg-description <string> (descriptive string, no tabs allowed)
86
+ # --rg-platform-unit <string> (e.g Illumina lane ID)
87
+ # --rg-center <string> (sequencing center name)
88
+ # --rg-date <string> (ISO 8601 date of the sequencing run)
89
+ # --rg-platform <string> (Sequencing platform descriptor)
90
+ #
91
+ # for detailed help see http://tophat.cbcb.umd.edu/manual.html
92
+
93
+
94
+ module Bio
95
+ module Ngs
96
# Wrapper class for the +tophat+ executable.
#
# Options are declared under the long names from TopHat's usage text, with
# the short flag attached as an alias where one is given there. The
# <bowtie_index> and reads files are not options; per the usage text they
# are positional arguments.
class Tophat
  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("tophat")

  # General options
  add_option "output-dir", :type => :string, :aliases => '-o'
  add_option "min-anchor", :type => :numeric, :aliases => '-a'
  add_option "splice-mismatches", :type => :numeric, :aliases => '-m'
  add_option "min-intron-length", :type => :numeric, :aliases => '-i'
  add_option "max-intron-length", :type => :numeric, :aliases => '-I'
  add_option "max-multihits", :type => :numeric, :aliases => '-g'
  # Was "min-isoform_fraction"; the usage text shows -F/--min-isoform-fraction
  # with dashes throughout.
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F'
  add_option "max-insertion-length", :type => :numeric
  add_option "max-deletion-length", :type => :numeric

  # Quality encoding and colour space
  add_option "solexa-quals", :type => :boolean
  add_option "solexa1.3-quals", :type => :boolean, :aliases => '--phred64-quals'
  add_option :quals, :type => :boolean, :aliases => '-Q'
  add_option "integer-quals", :type => :boolean
  add_option :color, :type => :boolean, :aliases => '-C'

  # Library, threading and annotation inputs
  add_option "library-type", :type => :string
  add_option "num-threads", :type => :numeric, :aliases => '-p'
  add_option "GTF", :type => :string, :aliases => '-G'
  add_option "raw-juncs", :type => :string, :aliases => '-j'
  add_option :insertions, :type => :string
  add_option :deletions, :type => :string
  add_option "mate-inner-dist", :type => :numeric, :aliases => '-r'
  add_option "mate-std-dev", :type => :numeric

  # Search strategy switches
  add_option "no-novel-juncs", :type => :boolean
  add_option "allow-indels", :type => :boolean
  add_option "no-novel-indels", :type => :boolean
  add_option "no-gtf-juncs", :type => :boolean
  add_option "no-coverage-search", :type => :boolean
  add_option "coverage-search", :type => :boolean
  add_option "no-closure-search", :type => :boolean
  add_option "closure-search", :type => :boolean
  add_option "fill-gaps", :type => :boolean
  add_option "microexon-search", :type => :boolean
  add_option "butterfly-search", :type => :boolean
  add_option "no-butterfly-search", :type => :boolean
  add_option "keep-tmp", :type => :boolean
  add_option "tmp-dir", :type => :string

  # Advanced options
  add_option "segment-mismatches", :type => :numeric
  add_option "segment-length", :type => :numeric
  add_option "min-closure-exon", :type => :numeric
  add_option "min-closure-intron", :type => :numeric
  add_option "max-closure-intron", :type => :numeric
  add_option "min-coverage-intron", :type => :numeric
  add_option "max-coverage-intron", :type => :numeric
  add_option "min-segment-intron", :type => :numeric
  add_option "max-segment-intron", :type => :numeric

  # SAM header (read group) options
  add_option "rg-id", :type => :string
  add_option "rg-sample", :type => :string
  add_option "rg-library", :type => :string
  add_option "rg-description", :type => :string
  add_option "rg-platform-unit", :type => :string
  add_option "rg-center", :type => :string
  add_option "rg-date", :type => :string
  add_option "rg-platform", :type => :string
end # Tophat
157
+ end #Ngs
158
+ end #Bio