bio-ngs 0.3.2.alpha.01

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,51 @@
1
+ #
2
+ #
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
# Thor namespace that exposes previously recorded tasks as re-runnable
# commands.
#
# When this file is loaded, the history file is read and one numbered task
# ("1", "2", ...) is defined per recorded entry. Running a numbered task
# invokes the recorded task again with its recorded arguments. The `clear`
# task wipes the history.
class History < Thor

  history = Bio::Ngs::Record.new(Bio::Ngs::HISTORY_FILE)

  history.load.each_with_index do |task, index|
    recorded_args = task[:args]

    # Human readable summary of the recorded arguments, shown in the task
    # list. Interpolation is used instead of String#+ so that non-String
    # arguments (numbers, symbols, nil) do not raise a TypeError.
    description =
      if recorded_args.is_a?(Array)
        recorded_args.map do |arg|
          if arg.is_a?(Hash)
            arg.map { |key, value| "#{key} => #{value} " }.join
          else
            "#{arg} "
          end
        end.join
      else
        recorded_args.to_s
      end

    # Create the History task on the fly; `self` is already History here,
    # so no class_eval is needed.
    task_number = (index + 1).to_s
    desc task_number, "Task #{task[:name]} PARAMETERS: #{description}"
    define_method(task_number.to_sym) do
      if recorded_args.is_a?(Array) && recorded_args.size > 1
        # Recorded as [argument, options]: replay the first element as the
        # positional argument and the second as the options.
        invoke task[:name], [recorded_args[0]], recorded_args[1]
      else
        # Always hand Thor an argument list; a bare String or nil here would
        # be misread by `invoke` as something other than the arguments.
        replay_args = recorded_args.is_a?(Array) ? recorded_args : [recorded_args].compact
        invoke task[:name], replay_args
      end
    end
  end

  desc "clear", "Wipe out the tasks history"
  def clear
    Bio::Ngs::Record.new(Bio::Ngs::HISTORY_FILE).clear
  end

end
@@ -0,0 +1,121 @@
1
# Thor tasks for homology searches: running Blast, managing the homology
# database, loading results, reporting, converting and downloading data.
class Homology < Thor

  # Tasks that run the Blast wrappers.
  class Run < Homology
    # NOTE(review): `file` is not defined anywhere in the visible scope of
    # these blocks; presumably `thor_task` evaluates the block in a context
    # that provides it - confirm against Bio::Ngs wrapper code.
    desc "blastn", "Run BlastN"
    Bio::Ngs::Blast::BlastN.new.thor_task(self, :blastn) do |wrapper, task|
      wrapper.params = task.options
      puts wrapper.run :arguments => [file]
    end

    desc "blastx", "Run BlastX"
    Bio::Ngs::Blast::BlastX.new.thor_task(self, :blastx) do |wrapper, task|
      wrapper.params = task.options
      puts wrapper.run :arguments => [file]
    end

  end


  # Tasks that create and export the homology database.
  class Db < Homology

    desc "init", "Initialize Homology DB"
    def init
      # Dir.exist? replaces the deprecated Dir.exists? (removed in Ruby 3.2).
      unless Dir.exist?("db") && Dir.exist?("conf")
        puts "No db or conf directory found! Please run 'biongs project:update:annotation'"
        exit 1 # a failed precondition must not report success
      end
      db = Bio::Ngs::Db.new :homology
      db.create_tables
    end

    desc "export [TABLE]", "Export the data from a table to a tab-separated file"
    method_option :fileout, :type => :string, :desc => "file used to save the output", :required => true
    def export(table)
      # Same precondition as `init`, so the check and the message agree
      # (the original tested only "db" while reporting a missing "conf").
      unless Dir.exist?("db") && Dir.exist?("conf")
        puts "No db or conf directory found! Can't load database connection information"
        exit 1
      end
      db = Bio::Ngs::Db.new :homology
      db.export(table, options[:fileout])
    end

  end


  # Tasks that load external results into the homology database.
  class Load < Homology

    desc "blast [FILE]", "Parse Blast XML output and load the results into Homology DB"
    def blast(file)
      Bio::Ngs::Homology.blast_import file
      puts "Parsing completed. All the data are now stored into the db.\n"
    end

    desc "goa", "Import GO Annotation file"
    method_option :file, :type => :string, :default => "data/goa_uniprot"
    def goa
      Bio::Ngs::Homology.goa_import options[:file]
      puts "Import completed.\n"
    end

  end

  # Tasks that produce graphical reports.
  class Report < Homology

    desc "blast", "Output a graphical report on the Blast homology search"
    # NOTE(review): the :file option is declared but never read below; the
    # report is always built from the database.
    method_option :file, :type => :string, :desc => "Read the results from a file and not from the db"
    method_option :fileout, :type => :string, :desc => "File to write the SVG", :default => "blast_report.svg"
    def blast
      # The return value is unused; presumably instantiating Db sets up the
      # connection used by BlastOutput - confirm.
      Bio::Ngs::Db.new :homology
      total       = BlastOutput.count(:all)
      positive_70 = BlastOutput.count(:conditions => "positive >= 70")
      evalue_5    = BlastOutput.count(:conditions => "evalue <= 1e-5")
      Bio::Ngs::Graphics.bar_charts(
        ["Total mapped", "Positive (>=70)", "E-value (<=1e-5)"],
        [total, positive_70, evalue_5],
        options[:fileout]
      )
    end

  end

  # Tasks that convert Blast and GO data to other formats.
  class Convert < Homology

    # The usage string names the real command (`blast`, taken from the method
    # name); the original advertised a non-existent `blast2text` command.
    desc "blast [XML FILE]", "Convert Blast output to tab-separated file"
    method_option :file_out, :type => :string, :required => true, :desc => "File name for report"
    def blast(file)
      Bio::Ngs::Homology.blast2text(file, options[:file_out])
    end

    desc "go2json", "Convert the GO annotations from the db into a JSON file"
    method_option :file_out, :type => :string, :default => "data/go_annotations.json", :desc => "File name for JSON"
    def go2json
      # NOTE(review): no :library option is declared for this task, so
      # options[:library] is nil unless supplied by an invoker - confirm
      # this is what go_annotation_to_json expects.
      Bio::Ngs::Homology.go_annotation_to_json(options[:file_out], options[:library])
    end

  end

  # Tasks that download reference data into the data/ directory.
  class Download < Homology

    desc "uniprot", "Download the Uniprot-SwissProt file from UniprotKB"
    def uniprot
      Bio::Ngs::Utils.download_and_uncompress("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz", "data/uniprot_sprot.fasta.gz")
    end

    desc "goannotation", "Download the Uniprot GeneOntology Annotation file"
    def goannotation
      Bio::Ngs::Utils.download_and_uncompress("http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_uniprot_noiea.gz?rev=HEAD", "data/goa_uniprot.gz")
    end

    desc "all", "Download the Uniprot and GO Annotation file"
    def all
      invoke :uniprot
      invoke :goannotation
    end

  end

end
@@ -0,0 +1,93 @@
1
# Thor tasks for Gene Ontology data: database setup, loading, reporting and
# downloading the GO definition files.
class Ontology < Thor


  # Tasks that create and export the ontology database.
  class Db < Ontology

    desc "init", "Initialize Ontology DB"
    def init
      # Dir.exist? replaces the deprecated Dir.exists? (removed in Ruby 3.2).
      unless Dir.exist?("db") && Dir.exist?("conf")
        puts "No db or conf directory found! Please run 'biongs project:update:annotation'"
        exit 1 # a failed precondition must not report success
      end
      db = Bio::Ngs::Db.new :ontology
      db.create_tables
      invoke "ontology:download:go"
      # The comma matters: without it Ruby parses `"..."["..."]` as
      # String#[], which yields nil, and the load task never gets its file.
      invoke "ontology:load:go", ["data/gene_ontology.1_2.obo"]
    end

    desc "export [TABLE]", "Export the data from a table to a tab-separated file"
    method_option :fileout, :type => :string, :desc => "file used to save the output", :required => true
    def export(table)
      # Same precondition as `init`, so the check and the message agree
      # (the original tested only "db" while reporting a missing "conf").
      unless Dir.exist?("db") && Dir.exist?("conf")
        puts "No db or conf directory found! Can't load database connection information"
        exit 1
      end
      db = Bio::Ngs::Db.new :ontology
      db.export(table, options[:fileout])
    end

  end


  # Tasks that import GO data into the ontology database.
  class Load < Ontology

    desc "go [FILE]", "Import GO definition file"
    def go(file)
      Bio::Ngs::Ontology.go_import file
      puts "Import completed.\n"
    end

    desc "genego [FILE]", "Import Gene-GO file (JSON)"
    def genego(file)
      Bio::Ngs::Ontology.load_go_genes file
      puts "Import completed"
    end


  end

  # Tasks that produce graphical reports.
  class Report < Ontology

    desc "go", "Output a graphical report on the GO for the sequences annotated in the db"
    def go
      # The return value is unused; presumably instantiating Db sets up the
      # connection used by the Gene model - confirm.
      Bio::Ngs::Db.new :ontology

      # namespace => { term name => number of genes annotated with it }
      ontologies = {}
      Gene.find(:all).each do |gene|
        gene.go.each do |ontology|
          ontologies[ontology.namespace] ||= Hash.new(0)
          ontologies[ontology.namespace][ontology.name] += 1
        end
      end

      # One bubble chart per namespace, limited to the 20 most frequent
      # terms (the original took the first 40 items of the flattened
      # name/count pairs, i.e. the same 20 terms).
      ontologies.each_pair do |namespace, terms|
        top_terms = terms.sort { |a, b| b[1] <=> a[1] }.first(20)
        Bio::Ngs::Graphics.bubble_chart("#{namespace}_go.svg", Hash[top_terms])
      end
    end

  end

  # Tasks that download the GO definition files into the data/ directory.
  class Download < Ontology

    desc "go", "Download the GeneOntology file"
    def go
      Bio::Ngs::Utils.download_with_progress(:url => "http://www.geneontology.org/ontology/obo_format_1_2/gene_ontology.1_2.obo", :filename => "data/gene_ontology.1_2.obo")
    end

    desc "goslim", "Download the Uniprot GeneOntology Slim file"
    def goslim
      Bio::Ngs::Utils.download_with_progress(:url => "http://www.geneontology.org/GO_slims/goslim_goa.obo", :filename => "data/goslim_goa.obo")
    end

    desc "all", "Download the GO files"
    def all
      invoke :goslim
      invoke :go
    end


  end

end
@@ -0,0 +1,51 @@
1
+ #
2
+ #
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
# Thor tasks that scaffold an NGS project directory and upgrade an existing
# directory to a given project type.
class Project < Thor
  include Thor::Actions

  # Templates are resolved relative to the directory containing this file.
  def self.source_root
    File.expand_path(File.dirname(__FILE__))
  end

  # Values read by the templates. Declared inside no_tasks so Thor does not
  # try to register the generated accessors as undocumented tasks.
  no_tasks do
    attr_accessor :name
    attr_accessor :type
  end

  desc "new [NAME]", "Create a new NGS project directory"
  method_option :type, :type => :string, :desc => "The project type (e.g. annotation)"
  def new(name)
    empty_directory name
    empty_directory File.join(name, "data")
    empty_directory File.join(name, "tasks")
    empty_directory File.join(name, "scripts")
    self.name = name # for template to take the correct values
    template(File.join("..", "templates", "README.tt"), "#{name}/README.txt")

    if options[:type] == "annotation"
      # `update` is the only update task in this class and takes the project
      # type as its argument; the original invoked a
      # "project:update:annotation" task that is not defined here.
      invoke :update, ["annotation"], :dir => name
    else
      empty_directory File.join(name, "log")
      empty_directory File.join(name, "conf")
    end
  end

  desc "update [TYPE]", "Update the working dir to a new type of project"
  method_option :dir, :type => :string
  def update(type)
    self.type = type # for template to take the correct values
    # Optional target directory prefix; empty means the current directory.
    dir = options[:dir] ? "#{options[:dir]}/" : ""
    empty_directory "#{dir}log"
    empty_directory "#{dir}conf"
    empty_directory "#{dir}db"
    template(File.join("..", "templates", "db.tt"), "#{dir}conf/#{type}_db.yml")
  end


end
@@ -0,0 +1,142 @@
1
+ #
2
+ #
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
# Thor tasks for quality control of NGS reads: quality profiles, trimming,
# FASTQ statistics and the plots derived from them.
class Quality < Thor

  desc "reads FASTQ", "perform quality check for NGS reads"
  method_option :width, :type => :numeric, :default => 500
  method_option :height, :type => :numeric, :default => 500
  method_option :fileout, :type => :string, :default => "fastq_report.svg"
  def reads(fastq)
    fast_quality = Bio::Ngs::FastQuality.new(fastq)
    qual = fast_quality.quality_profile
    Bio::Ngs::Graphics.draw_area(qual, options[:width], options[:height], options[:fileout], "Nucleotide", "Quality Score")
  end

  desc "trim FASTQ", "trim all the sequences"
  #TODO: create a wrapper
  method_option :min_size, :type => :numeric, :default => 20, :aliases => "-l", :desc => "Minimum length - sequences shorter than this (after trimming) will be discarded."
  method_option :min_quality, :type => :numeric, :default => 10, :aliases => "-t", :desc => "Quality threshold - nucleotides with lower quality will be trimmed (from the end of the sequence)."
  method_option :output, :type => :string, :aliases => "-o", :desc => "Output file name"
  def trim(fastq)
    # File.exist? replaces the deprecated File.exists? (removed in Ruby 3.2).
    raise "Input file #{fastq} doesn't exist." unless File.exist?(fastq)

    # Default output name: insert "_trim" before the forward/reverse marker,
    # or append it when the input name has no such marker.
    output_file = options.output || fastq.gsub(/(.*)_(forward|reverse)(.*)/, '\1_trim_\2\3')
    output_file += "_trim" if output_file == fastq

    # Suppose there is a stat file for the input file; build it if missing.
    invoke :fastq_stats, [fastq] unless File.exist?("#{fastq}.txt")

    trim = Bio::Ngs::Fastx::Trim.new
    trim.params = {min_size: options.min_size, min_quality: options.min_quality, input: fastq, output: output_file}
    trim.run
    # NOTE(review): Thor runs a given invocation only once per run, so this
    # second invoke may be skipped when the one above already ran - confirm.
    invoke :fastq_stats, [output_file]
  end


  desc "fastq_stats FASTQ", "Reports quality of FASTQ file"
  method_option :output, :type => :string, :aliases => "-o", :desc => "Output file name. default is input file_name with .txt."
  def fastq_stats(fastq)
    output_file = options.output || "#{fastq}.txt"
    stats = Bio::Ngs::Fastx::FastqStats.new
    stats.params = {input: fastq, output: output_file}
    stats.run
    invoke :boxplot, [output_file]
    invoke :reads_coverage, [output_file]
  end

  desc "boxplot FASTQ_QUALITY_STATS", "plot reads quality as boxplot"
  method_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
  method_option :output, :type => :string, :aliases => "-o", :desc => "Output file name. default is input file_name with .png."
  def boxplot(fastq_quality_stats)
    output_file = options.output || "#{fastq_quality_stats}.png"
    boxplot = Bio::Ngs::Fastx::ReadsBoxPlot.new
    boxplot.params = {input: fastq_quality_stats, output: output_file}
    boxplot.run
  end

  desc "reads_coverage FASTQ_QUALITY_STATS", "plot reads coverage in bases"
  method_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
  method_option :output, :type => :string, :aliases => "-o", :desc => "Output file name. default is input file_name with _coverage.png."
  def reads_coverage(fastq_quality_stats)
    #TODO: port this script to biongs now is only on my server
    output_file = options.output || "#{fastq_quality_stats}_coverage.png"
    coverage = Bio::Ngs::Fastx::ReadsCoverage.new
    coverage.params = {input: fastq_quality_stats, output: output_file}
    coverage.run
  end


  # Helpers shared by the B-profile tasks; wrapped in no_tasks so Thor does
  # not expose them as commands.
  no_tasks do
    # Builds a per-position count array of length read_length (zero filled)
    # from a collection of items whose [0] is a position and [1] a count.
    def build_b_profile(b_items, read_length)
      profile = Array.new(read_length, 0) # default profile set to zero
      b_items.each do |b_item|
        b_index = b_item[0]
        b_count = b_item[1]
        # NOTE(review): an index equal to read_length is accepted and grows
        # the array by one slot; whether positions are 0- or 1-based is not
        # visible here - confirm before tightening to `<`.
        profile[b_index] = b_count if b_index <= read_length
      end
      profile
    end
  end

  desc "illumina_b_profile_raw FASTQ", "perform a profile for reads coming from Illumina 1.5+ and write the report in a txt file"
  method_option :read_length, :type => :numeric, :required => true
  method_option :width, :type => :numeric, :default => 500
  method_option :height, :type => :numeric, :default => 500
  method_option :fileout, :type => :string, :default => "fastq_report.txt"
  def illumina_b_profile_raw(fastq)
    fast_quality = Bio::Ngs::FastQuality.new(fastq, :fastq_illumina)
    quals = fast_quality.track_b_count
    profile = build_b_profile(quals.b_profile, options.read_length)
    File.open(options.fileout, 'w') do |f|
      f.puts "Total reads: #{quals.n_reads}"
      profile.each_with_index do |count, index|
        f.puts "#{index},#{count}"
      end
    end
  end

  desc "illumina_b_profile_svg FASTQ", "perform a profile for reads coming from Illumina 1.5+"
  method_option :read_length, :type => :numeric, :required => true
  method_option :width, :type => :numeric, :default => 500
  method_option :height, :type => :numeric, :default => 500
  method_option :fileout, :type => :string, :default => "fastq_report.svg"
  def illumina_b_profile_svg(fastq)
    fast_quality = Bio::Ngs::FastQuality.new(fastq, :fastq_illumina)
    profile = build_b_profile(fast_quality.track_b_count.b_profile, options.read_length)

    # The trailing 30 is passed positionally (the original wrote `n_ticks=30`,
    # which only created a stray local before passing the same value).
    Bio::Ngs::Graphics.draw_area(profile, options[:width], options[:height], options[:fileout], "B distribution", "Nucleotides", "Counts", 30)
  end

  desc "scatterplot EXPR1 EXPR2 OUTPUT", "plot quantification values as scatterplot in png format"
  method_option :title, :type => :string, :aliases => "-t", :desc => "Title plotted on the graph."
  def scatterplot(expr1, expr2, output)
    # Validate both input files before doing any work.
    [expr1, expr2].each do |file_name|
      unless File.exist?(file_name)
        raise "Input file #{file_name} doesn't exist, please insert a valid file name."
      end
    end

    tmp_files = %w[tmp_1 tmp_2 tmp_gnuplot]
    begin
      # Sort each input into a temporary file. Arguments are passed as a
      # list with an :out redirect, so file names never go through a shell.
      system("sort", expr1, :out => "tmp_1")
      system("sort", expr2, :out => "tmp_2")

      title = options.title || "Scatter plot NGS"
      File.open("tmp_gnuplot", 'w') do |f|
        f.puts "set title '#{title}'"
        f.puts "set terminal png"
        f.puts "set output '#{output}.png'"
        f.puts "plot '< join tmp_1 tmp_2 | head -n -1' using 6:14"
      end

      # Actually run gnuplot on the generated script; the original only
      # printed the command and the script, so no PNG was ever produced.
      system("gnuplot", "tmp_gnuplot")
    ensure
      # Remove the temporary files even when a step above fails.
      tmp_files.each { |tmp| File.delete(tmp) if File.exist?(tmp) }
    end
  end
end
141
+
142
+