bio-ngs 0.3.2.alpha.01

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
Binary file
Binary file
@@ -0,0 +1,73 @@
1
+ common:
2
+ libgtextutils:
3
+ version: 0.6
4
+ url: http://hannonlab.cshl.edu/fastx_toolkit/libgtextutils-0.6.tar.bz2
5
+ basename: libgtextutils-0.6
6
+ suffix: tar.bz2
7
+ desc: ""
8
+ type: source
9
+ fastx:
10
+ version: 0.0.13
11
+ url: http://hannonlab.cshl.edu/fastx_toolkit/fastx_toolkit-0.0.13.tar.bz2
12
+ basename: fastx_toolkit-0.0.13
13
+ suffix: tar.bz2
14
+ desc: "Fastx-toolkit version 0.0.13 requires libgtextutils-0.6 (available here for download). A recent g++ compiler (tested with GNU G++ 4.1.2 and later). The fasta_clipping_histogram tool requires two perl modules: PerlIO::gzip and GD::Graph::bars. The fastx_barcode_splitter tool requires GNU sed. The fastq_quality_boxplot tool requires gnuplot version 4.2 or newer."
15
+ type: source
16
+ linux:
17
+ cufflinks:
18
+ version: 1.1.0
19
+ url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.1.0.Linux_x86_64.tar.gz
20
+ basename: cufflinks-1.1.0.Linux_x86_64
21
+ suffix: tar.gz
22
+ desc: ""
23
+ type: binary
24
+ tophat:
25
+ version: 1.3.2
26
+ url: http://tophat.cbcb.umd.edu/downloads/tophat-1.3.2.Linux_x86_64.tar.gz
27
+ basename: tophat-1.3.2.Linux_x86_64
28
+ suffix: tar.gz
29
+ desc: ""
30
+ type: binary
31
+ bowtie:
32
+ version: 0.12.7
33
+ url: http://sourceforge.net/projects/bowtie-bio/files/bowtie/0.12.7/bowtie-0.12.7-linux-x86_64.zip/download
34
+ basename: bowtie-0.12.7-linux-x86_64
35
+ suffix: zip
36
+ desc: ""
37
+ type: binary
38
+ # sra:
39
+ # version:
40
+ # url: http://trace.ncbi.nlm.nih.gov/Traces/sra/static/sratoolkit.2.1.0-centos_linux64.tar.gz
41
+ # basename: sratoolkit.2.1.0-centos_linux64
42
+ # suffix: tar.gz
43
+ # desc: ""
44
+ # type: binary
45
+ osx:
46
+ cufflinks:
47
+ version: 1.1.0
48
+ url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.1.0.OSX_x86_64.tar.gz
49
+ basename: cufflinks-1.1.0.OSX_x86_64
50
+ suffix: tar.gz
51
+ desc: ""
52
+ type: binary
53
+ tophat:
54
+ version: 1.3.2
55
+ url: http://tophat.cbcb.umd.edu/downloads/tophat-1.3.2.OSX_x86_64.tar.gz
56
+ basename: tophat-1.3.2.OSX_x86_64
57
+ suffix: tar.gz
58
+ desc: ""
59
+ type: binary
60
+ bowtie:
61
+ version: 0.12.7
62
+ url: http://sourceforge.net/projects/bowtie-bio/files/bowtie/0.12.7/bowtie-0.12.7-macos-10.5-x86_64.zip/download
63
+ basename: bowtie-0.12.7-macos-10.5-x86_64
64
+ suffix: zip
65
+ desc: ""
66
+ type: binary
67
+ # sra:
68
+ # version:
69
+ # url: http://trace.ncbi.nlm.nih.gov/Traces/sra/static/sratoolkit.2.1.0-mac64.tar.gz
70
+ # basename: sratoolkit.2.1.0-mac64
71
+ # suffix: tar.gz
72
+ # desc: ""
73
+ # type: binary
@@ -0,0 +1,189 @@
1
+ #
2
+ #
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+ require 'rubyvis'
11
+
12
+ module Bio
13
+ module Ngs
14
+ class Graphics
15
+
16
# Draws an area chart of +data+ with Rubyvis and emits it as SVG.
#
# Params:
# data:: numeric values; the n-th value is plotted at x = n (starting from 1)
# width, height:: size given to the root panel and to both scales
# out:: optional output file name; when nil the SVG is printed to standard output
# xlabel, ylabel:: captions anchored at the bottom and at the left of the chart
# n_ticks:: argument handed to y.ticks for the Y axis (default 5, the same
#           value bar_charts passes to its own scale)
# x_padding:: amount subtracted from +width+ for the caption panel (default 0)
# title_label:: text anchored at the top of the chart (default empty)
#
# n_ticks, x_padding and title_label used to be referenced without ever being
# defined, so calling this method raised NameError. They are now optional
# keyword parameters; existing positional callers are unaffected.
def self.draw_area(data, width, height, out = nil, xlabel, ylabel, n_ticks: 5, x_padding: 0, title_label: "")
  # Leave a fixed headroom of 10 above the largest value on the Y scale.
  max = data.max + 10
  # Pair every value with its 1-based position so the scales can read x and y.
  data = data.each_with_index.map do |d, i|
    OpenStruct.new({:x => i + 1, :y => d})
  end
  x = pv.Scale.linear(data, lambda {|d| d.x}).range(0, width)
  y = pv.Scale.linear(0, max).range(0, height)

  # The root panel
  vis = pv.Panel.new() do
    width width
    height height
    bottom 20
    left 50
    right 10
    top 5

    # Y-axis and ticks
    rule do
      data y.ticks(n_ticks)
      bottom(y)
      stroke_style {|d| d != 0 ? "#eee" : "#000"}
      # A debugging "puts y.inspect" used to live here; it wrote into the same
      # standard output stream the SVG is printed to when no file is given.
      label(:anchor => "left") {
        text y.tick_format
      }
    end

    # X-axis and ticks
    rule do
      data x.ticks()
      visible {|d| d != 0}
      left(x)
      bottom(-5)
      height(5)
      label(:anchor => 'bottom') {
        text(x.tick_format)
      }
    end

    # The area with top line
    area do |a|
      a.data data
      a.bottom(1)
      a.left {|d| x.scale(d.x)}
      a.height {|d| y.scale(d.y)}
      a.fill_style("rgb(121,173,210)")
      a.line(:anchor => 'top') {
        line_width(3)
      }
    end
  end

  # Panel holding the title and the axis captions
  panel = vis.add(Rubyvis::Panel).
    width(width - x_padding).
    height(height)

  panel.anchor('top').add(Rubyvis::Label).
    font("20px sans-serif").
    text(title_label)

  panel.anchor('bottom').add(Rubyvis::Label).text(xlabel)
  panel.anchor('left').add(Rubyvis::Label).
    text_angle(1.5 * Math::PI).
    text(ylabel)

  vis.render()

  if out
    File.open(out, "w") {|f| f.write(vis.to_svg) }
  else
    puts vis.to_svg
  end
end
95
+
96
# Renders +dataset+ as a packed bubble chart and writes the SVG, followed by
# a newline, to +fileout+.
#
# Params:
# fileout:: name of the SVG file to write
# dataset:: hash; each key becomes a node name and each value the node value
#           that sizes the bubble
# panel_w, panel_h:: outer size; the panel itself is 10 smaller in each
#                    direction and gets a margin of 5 on every side
def self.bubble_chart(fileout,dataset = {}, panel_w = 600, panel_h = 800)
  palette = Rubyvis::Colors.category10()
  outline = Rubyvis::Colors.category10().by(lambda {|n| n.parent_node})

  vis = Rubyvis::Panel.new.
    width(panel_w - 10).
    height(panel_h - 10).
    bottom(5).
    left(5).
    right(5).
    top(5)

  # Build a one-level tree: a bare root with one child per dataset entry.
  tree = Rubyvis::Dom::Node.new
  dataset.each_pair do |label, amount|
    leaf = Rubyvis::Dom::Node.new(amount)
    leaf.node_name = label
    tree.append_child(leaf)
  end
  node_list = tree.nodes()

  pack = vis.add(pv.Layout.Pack).
    nodes(node_list).
    size(lambda {|n| n.node_value})

  # Only nodes that have a parent are drawn, which leaves the root out.
  pack.node.add(Rubyvis::Dot).
    visible(lambda {|n| n.parent_node}).
    fill_style(lambda {|n|
      palette.scale(n.parent_node).brighter((n.node_value) / 5.0)
    }).
    stroke_style(outline)

  pack.node_label.add(Rubyvis::Label).
    visible(lambda {|n| n.parent_node}).
    text(lambda {|n| n.node_name})

  vis.render()
  File.open(fileout, "w") {|svg| svg.write(vis.to_svg + "\n") }
end
134
+
135
+
136
# Renders +data+ as a bar chart, one bar per value laid out from top to
# bottom with its width scaled to the value, and writes the SVG followed by
# a newline to +fileout+.
#
# Params:
# labels:: captions, looked up by bar index and drawn at the left of each bar
# data:: numeric values; each one is also printed on its bar as "%0.1f"
# fileout:: name of the SVG file to write
# width, height:: size of the root panel
def self.bar_charts(labels, data, fileout, width = 500, height = 300)
  value_scale = pv.Scale.linear(0, data.max).range(0, width)
  band_scale = pv.Scale.ordinal(pv.range(data.size)).split_banded(0, height, 4/5.0)

  # The root panel
  vis = pv.Panel.new().
    width(width).
    height(height).
    bottom(20).
    left(100).
    right(10).
    top(5)

  # The bars
  bar = vis.add(pv.Bar).
    data(data).
    top(lambda { band_scale.scale(self.index) }).
    height(band_scale.range_band).
    left(0).
    width(value_scale)

  # The value label
  bar.anchor("right").add(pv.Label).
    text_style("white").
    text(lambda {|d| "%0.1f" % d})

  # The variable label
  bar.anchor("left").add(pv.Label).
    text_margin(5).
    text_align("right").
    text(lambda { labels[self.index] })

  # X-axis ticks
  vis.add(pv.Rule).
    data(value_scale.ticks(5)).
    left(value_scale).
    stroke_style(lambda {|d| d != 0 ? "rgba(255,255,255,.3)" : "#000"}).
    add(pv.Rule).
    bottom(0).
    height(5).
    stroke_style("#000").
    anchor("bottom").add(pv.Label).text(value_scale.tick_format)

  # X-axis caption
  vis.anchor("top").add(Rubyvis::Label).text("Number of sequences")

  vis.render()
  File.open(fileout, "w") {|svg| svg.write(vis.to_svg + "\n") }
end
186
+
187
+ end
188
+ end
189
+ end
@@ -0,0 +1,102 @@
1
+ #
2
+ #
3
+ # Copyright:: Copyright (C) 2011
4
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ #
8
+
9
+ module Bio
10
+ module Ngs
11
+ class Homology
12
+
13
+
14
+ # Method to import a Blast XML output file into a BlastOuput table created according to ActiveRecord model
15
+ # Params: XML Blast file, YAML file for db connection, optional ActiveRecord models file
16
# Loads every hit of a Blast XML report into the blast_outputs table.
#
# Params:
# file:: Blast XML file, read through Bio::Blast::XmlIterator
# yaml_file:: optional YAML file handed to Bio::Ngs::Db for the connection
#
# One row is stored per hit: query definition, the second "|"-separated field
# of the hit id, hit definition, and the values from summarize_blast_hit.
# Rows are flushed to the database in batches of 1000.
def self.blast_import(file, yaml_file = nil)
  db = Bio::Ngs::Db.new :homology, yaml_file
  sql = "INSERT INTO blast_outputs(query_id,target_id,target_description,evalue,identity,positive) VALUES(?,?,?,?,?,?)"
  inserts = []
  Bio::Blast::XmlIterator.new(file).to_enum.each do |iter|
    iter.each do |hit|
      evalue, identity, positive = summarize_blast_hit(hit)
      inserts << [iter.query_def, hit.hit_id.split('|')[1], hit.hit_def, evalue, identity, positive]
      if inserts.size == 1000
        db.insert_many(:blast_outputs, sql, inserts)
        inserts = []
      end
    end
  end
  # Store whatever is left after the last full batch.
  db.insert_many(:blast_outputs, sql, inserts) if inserts.size > 0
end

# Converts a Blast XML report into a tab-separated text file with a header
# line and one line per hit (the full hit id is written, unlike blast_import).
#
# Params:
# file_in:: Blast XML file, read through Bio::Blast::XmlIterator
# file_out:: name of the text file to write
#
# Returns nil. The output file is opened in block form so the handle is
# closed even when parsing raises half way through.
def self.blast2text(file_in, file_out)
  File.open(file_out, "w") do |out|
    out.write("Query ID\tTarget ID\tTarget Description\tE-value\tIdentity\tPositive\n")
    Bio::Blast::XmlIterator.new(file_in).to_enum.each do |iter|
      iter.each do |hit|
        evalue, identity, positive = summarize_blast_hit(hit)
        out.write([iter.query_def, hit.hit_id, hit.hit_def, evalue, identity, positive].join("\t") + "\n")
      end
    end
  end
  nil
end

# Internal helper shared by blast_import and blast2text.
# Walks the HSPs of +hit+ and returns [evalue, identity, positive] where
# evalue is the arithmetic mean of the HSP e-values and identity/positive are
# the summed HSP counts divided by the summed alignment length, times 100.
def self.summarize_blast_hit(hit)
  identity = 0.0
  positive = 0.0
  evalues = []
  length = 0
  hit.each do |hsp|
    identity += hsp.identity.to_f
    positive += hsp.positive.to_f
    evalues << hsp.evalue
    length += hsp.align_len
  end
  mean_evalue = evalues.inject { |sum, el| sum + el }.to_f / evalues.size
  [mean_evalue, (identity / length) * 100, (positive / length) * 100]
end
67
+
68
+
69
+ # Method to import a GO Annotation file into GoAnnotation table created according to ActiveRecord model
70
+ # Params: GOA file, YAML file for db connection (optional)
71
# Loads a GO annotation file into the go_annotations table.
#
# Params:
# file:: tab-separated annotation file; lines starting with "!" are skipped
# yaml_file:: optional YAML file handed to Bio::Ngs::Db for the connection
#
# Only the first 15 tab-separated fields of each line are stored. Rows are
# flushed to the database in batches of 1000. The file is read with
# File.foreach so the handle is closed once reading finishes or fails.
def self.goa_import(file, yaml_file = nil)
  db = Bio::Ngs::Db.new :homology, yaml_file
  sql = "INSERT INTO go_annotations(db,entry_id,symbol,qualifier,go_id,db_ref,evidence,additional_identifier,aspect,name,synonym,molecule_type,taxon_id,date,assigned_by) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
  inserts = []
  File.foreach(file) do |line|
    next if line.start_with? "!"
    inserts << line.chomp.split("\t")[0..14]
    if inserts.size == 1000
      db.insert_many(:go_annotations, sql, inserts)
      inserts = []
    end
  end
  # Store whatever is left after the last full batch.
  db.insert_many(:go_annotations, sql, inserts) if inserts.size > 0
end
85
+
86
+ # Method to export the associations among genes and GO and store them into a JSON file that can be imported into the Ontology db
87
+ # Params: file to write JSON data
88
# Dumps the gene/GO associations found in the homology db as JSON.
#
# Params:
# file_out:: name of the JSON file to write
# library:: optional value stored on every exported Ontology object
# yaml_file:: optional YAML file handed to Bio::Ngs::Db for the connection
#
# One Bio::Ngs::Ontology is built per BlastOutput record, keyed by the
# record's query_id and carrying the go_id of each of its go_annotations.
def self.go_annotation_to_json(file_out,library=nil,yaml_file=nil)
  Bio::Ngs::Db.new :homology, yaml_file
  ontologies = BlastOutput.find(:all).map do |record|
    entry = Bio::Ngs::Ontology.new record.query_id
    entry.go = record.go_annotations.map {|annotation| annotation.go_id}
    entry.library = library
    entry
  end
  File.open(file_out, "w") {|json| json.write ontologies.to_json}
end
99
+
100
+ end
101
+ end
102
+ end
@@ -0,0 +1,103 @@
1
+ #
2
+ #
3
+ # Copyright:: Copyright (C) 2011
4
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ #
8
+
9
+ module Bio
10
+ module Ngs
11
+ class Ontology
12
+
13
+ # Method to import a GO OBO file into Go table created according to ActiveRecord model
14
+ # Params: GO OBO file, YAML file for db connection
15
# Loads the [Term] stanzas of a GO OBO file into the go table.
#
# Params:
# file:: OBO file name
# yaml_file:: optional YAML file handed to Bio::Ngs::Db for the connection
#
# For every line starting with "[Term]" the text up to the next blank line is
# read as one stanza. Its id, name and namespace values are collected in the
# order they appear, followed by all is_a targets joined with a space. Rows
# are flushed to the database in batches of 1000.
#
# The file is opened in block form so the handle is always closed, and a
# "[Term]" header with nothing after it is skipped instead of raising.
def self.go_import(file, yaml_file = nil)
  db = Bio::Ngs::Db.new :ontology, yaml_file
  sql = "INSERT INTO go(go_id,name,namespace,is_a) VALUES(?,?,?,?)"
  inserts = []
  File.open(file) do |obo|
    obo.each do |line|
      next unless line.start_with? "[Term]"
      # Pull the whole stanza from the same handle; the outer iteration then
      # resumes after the blank line that terminates it.
      block = obo.gets("\n\n")
      next if block.nil?
      is_a = []
      data = []
      block.split("\n").each do |elem|
        if elem.start_with? "id: "
          data << elem.gsub("id: ", "")
        elsif elem.start_with? "name: "
          data << elem.gsub("name: ", "")
        elsif elem.start_with? "is_a"
          # Keep only the part before the "!" that introduces the term name.
          is_a << elem.gsub("is_a: ", "").split("!").first
        elsif elem.start_with? "namespace: "
          data << elem.gsub("namespace: ", "")
        end
      end
      data << is_a.join(" ")
      inserts << data
      if inserts.size == 1000
        db.insert_many(:go, sql, inserts)
        inserts = []
      end
    end
  end
  # Store whatever is left after the last full batch.
  db.insert_many(:go, sql, inserts) if inserts.size > 0
end
45
+
46
+ # Method to lood the Gene-GO associations from a JSON file into the Ontology db
47
+ # Params: JSON file name, YAML file for db connection (optional)
48
# Reads gene/GO associations from a JSON file and stores them in the
# ontology db.
#
# Params:
# file:: JSON file holding a list of entries with "gene_id", "go" and
#        "library" keys
# yaml_file:: optional YAML file handed to Bio::Ngs::Db and to the
#             collection's to_db
#
# Every entry becomes a Bio::Ngs::Ontology inside an OntologyCollection,
# whose to_db result is returned.
def self.load_go_genes(file,yaml_file=nil)
  Bio::Ngs::Db.new :ontology, yaml_file
  entries = JSON.load File.read(file)
  collection = Bio::Ngs::OntologyCollection.new
  entries.each do |entry|
    collection << Bio::Ngs::Ontology.new(entry["gene_id"], entry["go"], entry["library"])
  end
  collection.to_db(yaml_file)
end
57
+
58
+
59
+ attr_accessor :gene_id, :go, :library
60
+ # Constructor for Bio::Ngs::Ontology instances
61
# Stores the gene identifier, its list of GO ids (empty by default) and the
# optional library name on the new instance.
def initialize(gene_id,go=[],library=nil)
  @gene_id, @go, @library = gene_id, go, library
end
66
+
67
+ # Method to store a single Bio::Ngs::Ontology object into the Ontology db
68
# Stores this gene and its GO associations in the ontology db.
#
# Params:
# yaml_file:: optional YAML file handed to Bio::Ngs::Db for the connection
#
# Raises RuntimeError when the go table holds no records.
#
# The Db handle is now created before Go is queried, matching
# OntologyCollection#to_db.
# NOTE(review): this assumes Bio::Ngs::Db.new is what sets up the connection
# used by the Go and Gene models - confirm against db.rb.
def to_db(yaml_file = nil)
  db = Bio::Ngs::Db.new :ontology, yaml_file
  raise RuntimeError, "You must initialize the Ontology db with biongs ontology:db:init" if Go.count == 0
  g = Gene.create(:gene_id => @gene_id, :library => @library)
  # Link the gene to every Go record whose go_id is listed in @go.
  Go.where({:go_id => @go}).all.each do |go|
    g.gene_gos.create(:go_id => go.id)
  end
end
76
+
77
+
78
+ end
79
+
80
+ # Class to handle collection of Bio::Ngs::Ontology objects.
81
+ # It provides a method to store all the gene-GO associations into the Ontology db
82
+ class OntologyCollection < Array
83
+
84
+ def to_db(yaml_file=nil)
85
+ db = Bio::Ngs::Db.new :ontology, yaml_file
86
+ genes = []
87
+ ontologies = []
88
+ go = {}
89
+ Go.find_by_sql("SELECT id, go_id FROM go").each {|g| go[g.go_id] = g.id}
90
+ self.each_with_index do |gene,index|
91
+ raise ArgumentError "OntologyCollection can store only Bio::Ngs::Ontology objects!" if gene.class != Bio::Ngs::Ontology
92
+ genes << [index+1,gene.gene_id,gene.library]
93
+ gene.go.each {|o| ontologies << [index+1,go[o]] if go[o]}
94
+ end
95
+ db.insert_many(:genes,"INSERT INTO genes(id,gene_id,library) VALUES(?,?,?)",genes)
96
+ db.insert_many(:gene_gos,"INSERT INTO gene_gos(gene_id,go_id) VALUES(?,?)",ontologies)
97
+ end
98
+
99
+ end
100
+
101
+
102
+ end
103
+ end