bio-ngs 0.3.2.alpha.01

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,100 @@
1
+ #
2
+ # converter.rb - convert qseq format to fastq
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>,
6
+ # Ranzani Valeria <ranzani@ingm.it>
7
+ # License:: The Ruby License
8
+ #
9
+ #
10
+
11
+
12
module Bio
  module Ngs
    module Converter
      # Converts qseq records (one whitespace-separated record per line) into
      # FASTQ records and keeps counters about the reads and bases it has seen.
      class Qseq

        # Read layouts accepted by #type= and by the constructor.
        VALID_TYPES = [:pe, :se].freeze

        # Source buffer; any object that responds to each_line:
        #   String with \n as line separator
        #   File (reading)
        attr_accessor :buffer
        # :pe (paired-end), :se (single read) or nil when unset or invalid.
        attr_reader :type
        # Statistics about total reads, passed filter or not. Empty until
        # a #to_fastq run has completed.
        attr_reader :stats

        # default_type:: :pe or :se; any other value leaves the type unset.
        def initialize(default_type=nil)
          self.type = default_type
          @stats = {}
        end

        # Sets the read layout. Anything other than :pe or :se resets it to nil.
        def type=(data)
          @type = VALID_TYPES.include?(data) ? data : nil
        end

        # Yields every line of the buffer converted to FASTQ. A line that does
        # not pass the filter yields nil, so remember to remove the nil values
        # if you are building an array.
        #
        # Blank lines are skipped and are not counted. Without a block an
        # Enumerator over the same values is returned.
        #
        # stats:: accepted for backward compatibility; it is not used.
        #
        # Returns the statistics Hash, which is also stored in #stats.
        # Raises RuntimeError when no valid type has been set.
        def to_fastq(stats=false)
          raise "Type of qseq not specifed." if type.nil?
          return enum_for(:to_fastq, stats) unless block_given?

          converter = "qseq2fastq_#{type}"
          tally = { :passed => Hash.new(0), :rejected => Hash.new(0) }

          # each_line works for both String and IO; IO#lines no longer exists
          # in current Ruby and String#lines with a block is deprecated.
          @buffer.each_line do |line|
            fields = line.split
            next if fields.empty?

            read = send(converter, fields)
            # to_s keeps a truncated record from crashing the whole run; it is
            # simply counted as a rejected read with no bases.
            quality = fields[9].to_s
            bucket = tally[read ? :passed : :rejected]
            bucket[:reads] += 1
            bucket[:bases] += quality.size
            bucket[:b_quality] += quality.count("B")
            # NOTE(review): this counts the character "N" in the quality
            # string (field 9), not unknown bases in the sequence (field 8).
            # Kept as in the original; confirm which one is intended.
            bucket[:n] += quality.count("N")
            yield read
          end

          passed = tally[:passed]
          rejected = tally[:rejected]
          @stats = {:reads_total => passed[:reads] + rejected[:reads],
                    :reads_passed => passed[:reads],
                    :reads_rejected => rejected[:reads],
                    :bases_passed_total => passed[:bases],
                    :bases_rejected_total => rejected[:bases],
                    :bases_passed_with_b_quality => passed[:b_quality],
                    :bases_rejected_with_b_quality => rejected[:b_quality],
                    :bases_passed_with_n => passed[:n],
                    :bases_rejected_with_n => rejected[:n]}
        end

        # Returns the read in FASTQ from a paired-end read dataset, or nil
        # when field 10 (the filter flag) is not "1".
        # qseq is an Array of strings generated from a raw line of a qseq file;
        # field 7 is appended to the header after the "/".
        def qseq2fastq_pe(qseq)
          fastq_record(qseq, qseq[7]) if qseq[10] == "1"
        end

        # Returns the read in FASTQ from a single read dataset, or nil when
        # field 10 (the filter flag) is not "1".
        # qseq is an Array of strings generated from a raw line of a qseq file.
        def qseq2fastq_se(qseq)
          fastq_record(qseq, "") if qseq[10] == "1"
        end

        private

        # Builds the four-line FASTQ record (no trailing newline). Every "."
        # in the sequence (field 8) becomes "N"; suffix follows the "#0/".
        def fastq_record(qseq, suffix)
          "@#{qseq[0]}:#{qseq[2]}:#{qseq[3]}:#{qseq[4]}:#{qseq[5]}#0/#{suffix}\n#{qseq[8].tr('.', 'N')}\n+\n#{qseq[9]}"
        end

      end #Qseq
    end #Converter
  end #Ngs
end #Bio
@@ -0,0 +1,12 @@
1
class Thor
  module CoreExt #:nodoc:
    class HashWithIndifferentAccess
      # Returns a new plain Hash holding the same values, with every key
      # converted through to_sym. The receiver is not modified.
      def symbolize_keys
        each_with_object({}) do |(key, value), symbolized|
          symbolized[key.to_sym] = value
        end
      end
    end #HashWithIndifferentAccess
  end # CoreExt
end #Thor
data/lib/bio/ngs/db.rb ADDED
@@ -0,0 +1,66 @@
1
+ #
2
+ #
3
+ # Copyright:: Copyright (C) 2011
4
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ #
8
+
9
module Bio
  module Ngs
    # Helper around ActiveRecord::Base for the project databases: opens the
    # connection, runs the bundled migrations, exports tables and performs
    # bulk inserts.
    class Db

      require 'yaml'
      require 'active_support/inflector'
      DB_TYPES = [:ontology, :homology]

      # Open a connection to a database using ActiveRecord.
      #
      # args[0]:: database type, one of DB_TYPES.
      # args[1]:: optional path to the YAML connection settings; defaults to
      #           conf/<type>_db.yml under the current working directory.
      #
      # Raises ArgumentError for an unknown database type.
      def initialize(*args)
        @db_type = args[0]
        raise ArgumentError, "Invalid database type: #{@db_type}" unless DB_TYPES.include?(@db_type)

        yaml_file = args[1] || Dir.pwd+"/conf/#{@db_type}_db.yml"
        @db = ActiveRecord::Base
        @db.establish_connection YAML.load_file(yaml_file)
        # ONLY FOR DEBUG
        #require 'logger'
        #ActiveRecord::Base.logger = Logger.new 'log/db.log'
        # The models are loaded only after the connection is established.
        require File.expand_path(File.dirname(__FILE__)+"/db/models/#{@db_type}.rb")
      end

      # Runs AR migrations and create database tables.
      # verbose:: passed to ActiveRecord::Migration.verbose.
      def create_tables(verbose=false)
        ActiveRecord::Migration.verbose = verbose
        ActiveRecord::Migrator.migrate(File.expand_path(File.dirname(__FILE__)+"/db/migrate/#{@db_type}"),nil)
      end

      # Export a database table into a tab-separated file: a header line with
      # the column names, then one line per record.
      #
      # table::   table name (String or Symbol); it is singularized and
      #           camelized to find the model class.
      # fileout:: path of the file to write; an existing file is overwritten.
      def export(table,fileout)
        # to_s so that a Symbol works here as it does in insert_many.
        klass = @db.const_get(table.to_s.singularize.camelize)
        columns = klass.column_names
        # Block form closes the file even if a record fails to serialize.
        File.open(fileout,"w") do |out|
          out.write columns.join("\t")+"\n"
          # all replaces find(:all), which newer ActiveRecord no longer accepts.
          klass.all.each do |record|
            attributes = record.attributes
            values = columns.map {|c| attributes[c]}
            out.write values.join("\t")+"\n"
          end
        end
      end

      # Wrapper for DB transaction to execute many INSERT queries into a single transaction
      # This can speed up things particularly for SQLite databases.
      #
      # table::  table name (String or Symbol) used to find the model class
      #          whose transaction wraps the inserts.
      # query::  SQL statement with placeholders.
      # values:: Array of Arrays; each inner Array supplies the bind values
      #          for one execution of query.
      def insert_many(table,query,values=[])
        klass = @db.const_get(table.to_s.singularize.camelize)
        klass.transaction do
          values.each do |v|
            # sanitize_sql_array quotes the bind values into the statement.
            sql = @db.send(:sanitize_sql_array,[query]+v)
            @db.connection.execute(sql)
          end
        end
      end

    end
  end
end
@@ -0,0 +1,22 @@
1
+ class CreateBlastout < ActiveRecord::Migration # Creates and drops the blast_outputs table.
2
+
3
+ def self.up # Apply: create the table, then index it on query_id.
4
+ create_table :blast_outputs do |t|
5
+ t.string :query_id
6
+ t.string :target_id
7
+ t.string :target_description
8
+ t.float :evalue, :precision => :double # NOTE(review): :double is not a numeric precision; confirm the adapter honors it.
9
+ t.float :identity
10
+ t.float :positive
11
+ end
12
+
13
+ add_index :blast_outputs, :query_id
14
+
15
+ end
16
+
17
+ def self.down # Revert: drop the table.
18
+ drop_table :blast_outputs
19
+ end
20
+
21
+
22
+ end
@@ -0,0 +1,29 @@
1
+ class CreateGoannotation < ActiveRecord::Migration # Creates and drops the go_annotations table.
2
+
3
+ def self.up # Apply: create the table (every column is a string), then index it on entry_id.
4
+ create_table :go_annotations do |t|
5
+ t.string :db
6
+ t.string :entry_id
7
+ t.string :symbol
8
+ t.string :qualifier
9
+ t.string :go_id
10
+ t.string :db_ref
11
+ t.string :evidence
12
+ t.string :additional_identifier
13
+ t.string :aspect
14
+ t.string :name
15
+ t.string :synonym
16
+ t.string :molecule_type
17
+ t.string :taxon_id
18
+ t.string :date # Stored as a string, not as a date column.
19
+ t.string :assigned_by
20
+ end
21
+
22
+ add_index :go_annotations, :entry_id
23
+ end
24
+
25
+ def self.down # Revert: drop the table.
26
+ drop_table :go_annotations
27
+ end
28
+
29
+ end
@@ -0,0 +1,18 @@
1
+ class CreateGo < ActiveRecord::Migration # Creates and drops the go table.
2
+
3
+ def self.up # Apply: create the table, then index it on go_id.
4
+ create_table :go do |t| # Table name is the singular "go", not a pluralized name.
5
+ t.string :go_id
6
+ t.string :name
7
+ t.string :namespace
8
+ t.string :is_a
9
+ end
10
+
11
+ add_index :go, :go_id
12
+ end
13
+
14
+ def self.down # Revert: drop the table.
15
+ drop_table :go
16
+ end
17
+
18
+ end
@@ -0,0 +1,17 @@
1
+ class CreateGeneGo < ActiveRecord::Migration # Creates and drops the gene_gos table.
2
+
3
+ def self.up # Apply: create the table, then index both integer columns.
4
+ create_table :gene_gos do |t|
5
+ t.integer :gene_id # NOTE(review): presumably references genes.id rather than the string genes.gene_id; confirm.
6
+ t.integer :go_id # NOTE(review): presumably references go.id rather than the string go.go_id; confirm.
7
+ end
8
+
9
+ add_index :gene_gos, :gene_id
10
+ add_index :gene_gos, :go_id
11
+ end
12
+
13
+ def self.down # Revert: drop the table.
14
+ drop_table :gene_gos
15
+ end
16
+
17
+ end
@@ -0,0 +1,16 @@
1
+ class CreateGene < ActiveRecord::Migration # Creates and drops the genes table.
2
+
3
+ def self.up # Apply: create the table, then index it on gene_id.
4
+ create_table :genes do |t|
5
+ t.string :gene_id
6
+ t.string :library
7
+ end
8
+
9
+ add_index :genes, :gene_id
10
+ end
11
+
12
+ def self.down # Revert: drop the table.
13
+ drop_table :genes
14
+ end
15
+
16
+ end
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,8 @@
1
+
2
# Row of the go_annotations table.
class GoAnnotation < ActiveRecord::Base
  # go_annotations has no blast_output_id column, so the default foreign key
  # cannot resolve. Use the same entry_id <-> target_id pairing that
  # BlastOutput declares on its has_many side.
  belongs_to :blast_output, :foreign_key => "entry_id", :primary_key => "target_id"
end

# Row of the blast_outputs table.
class BlastOutput < ActiveRecord::Base
  # Annotations whose entry_id equals this row's target_id.
  has_many :go_annotations, :foreign_key => "entry_id", :primary_key => "target_id"
end
@@ -0,0 +1,16 @@
1
# Row of the go table.
class Go < ActiveRecord::Base
  # The table is named "go", not the pluralized default.
  self.table_name = "go"
  # The join association is declared first: newer ActiveRecord versions
  # reject a :through association declared before the one it goes through.
  has_many :gene_gos
  has_many :genes, :through => :gene_gos

end

# Row of the genes table.
class Gene < ActiveRecord::Base
  has_many :gene_gos
  # The association keeps its original singular name :go, so callers use gene.go.
  has_many :go, :through => :gene_gos
end

# Join row of the gene_gos table, linking one Gene to one Go.
class GeneGo < ActiveRecord::Base
  belongs_to :gene
  belongs_to :go
end
@@ -0,0 +1,161 @@
1
#!/usr/bin/env bash

# FASTX-toolkit - FASTA/FASTQ preprocessing tools.
# Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Reads a tabular statistics report (-i), builds a gnuplot script that plots
# column 2 against column 1 as a "Coverage" line, and writes the rendered
# image (PNG by default, PostScript with -p) to the -o file or to stdout.

# Print the help text and exit with status 0.
function usage()
{
	echo "Solexa-Quality BoxPlot plotter"
	echo "Generates a solexa quality score box-plot graph "
	echo
	echo "Usage: $0 [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]"
	echo
	echo " [-p] - Generate PostScript (.PS) file. Default is PNG image."
	echo " [-i INPUT.TXT] - Input file. Should be the output of \"solexa_quality_statistics\" program."
	echo " [-o OUTPUT] - Output file name. default is STDOUT."
	echo " [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph."
	echo
	exit
}

#
# Input Data columns: #pos cnt min max sum mean Q1 med Q3 IQR lW rW A_Count C_Count G_Count T_Count N_Count
# As produced by "solexa_quality_statistics" program

TITLE="" # default title is empty
FILENAME=""
OUTPUTTERM="set term png size 2048,768" # default output terminal is "PNG"
OUTPUTFILE="/dev/stdout" # Default output file is simply "stdout"
# The leading ':' puts getopts in silent mode; unknown options and missing
# arguments both fall through to the '*' branch below.
while getopts ":t:i:o:ph" Option
do
	case $Option in
	# w ) CMD=$OPTARG; FILENAME="PIMSLogList.txt"; TARGET="logfiles"; ;;
	t ) TITLE="for $OPTARG" ;;
	i ) FILENAME=$OPTARG ;;
	o ) OUTPUTFILE="$OPTARG" ;;
	p ) OUTPUTTERM="set term postscript enhanced color \"Helvetica\" 8" ;;
	h ) usage ;;
	# Exit statuses are limited to 0-255, so use 1 rather than -1.
	* ) echo "unrecognized argument. use '-h' for usage information."; exit 1 ;;
	esac
done
shift $((OPTIND - 1))


if [ "$FILENAME" == "" ]; then
	usage
fi

if [ ! -r "$FILENAME" ]; then
	# Report the -i value: after the shift above, $1 is no longer the input file.
	echo "Error: can't open input file ($FILENAME)." >&2
	exit 1
fi

##
## Input validation
## Too many users (in galaxy) try to plot a FASTQ file
## (without using the 'fastq statistics' tool first).
##
## gnuplot's error in that case is cryptic, and support emails are annoying.
##
## try to detect FASTA/FASTQ input, and give a detailed, easy-to-understand warning.
##
##
AWK_FASTX_DETECTION='
NR==1 && $0 ~ /^>/ { fasta_id = 1 }
NR==1 && $0 ~ /^@/ { fastq_id = 1 }
NR==2 && $0 ~ /^[ACGT][ACGT]*$/ { nucleotides = 1 }
NR>3 { exit }
END { if ( fasta_id && nucleotides ) { print "FASTA" }
if ( fastq_id && nucleotides ) { print "FASTQ" }
}'

INPUT_TYPE=$(awk "$AWK_FASTX_DETECTION" "$FILENAME")

if [ "x$INPUT_TYPE" = "xFASTA" ] ; then
	#this doesn't even make sense: FASTA files don't contain any quality scores
	cat>&2<<EOF
Error: It looks like your input file is a FASTA file.

FASTA files do not contain quality scores, and can not be used with this tool.
EOF
	exit 1
fi
if [ "x$INPUT_TYPE" = "xFASTQ" ] ; then
	cat>&2<<EOF
Error: It looks like your input file is a FASTQ file.

This tool (fastq-quality-plot) can't use FASTQ files directly - it requires a tabular text file conaining summary statistic about your FASTQ file.

In Galaxy,
Please use the "Compute Quality Statistics" tool (in the "NGS: QC and Manipulation" category) to compute the quality statistics report, and then use this tool with the new statistics report.

On the command line,
Please use the "fastx_quality_stats" program to create the statistics report.
EOF
	exit 1
fi

##
## Even if this is not a FASTA/FASTQ file,
## users can still use incompatible input files.
## Try to detect it and abort with a warning.
## The awk program exits with status 2 only when the first line starts with
## the columns "column", "count", "min".
AWK_VALID_STAT='NR==1 && $1=="column" && $2=="count" && $3=="min" { exit 2 } NR>1 { exit }'

awk "$AWK_VALID_STAT" "$FILENAME"
if [ $? -ne 2 ] ; then
	cat>&2<<EOF
Error: Input file is not a valid statistics report.

This tool (fastq-quality-plot) requires a tabular text file conaining summary statistic about your FASTQ file.

In Galaxy,
Please use the "Compute Quality Statistics" tool (in the "NGS: QC and Manipulation" category) to compute the quality statistics report, and then use this tool with the new statistics report.

On the command line,
Please use the "fastx_quality_stats" program to create the statistics report.
EOF
	exit 1
fi


#Read number of cycles from the stats file (each line is a cycle, minus the header line)
#But for the graph, I want xrange to reach (num_cycles+1), so I don't subtract 1 now.
NUM_CYCLES=$(wc -l < "$FILENAME")

GNUPLOTCMD="
$OUTPUTTERM
set boxwidth 0.8
set size 1,1
set key Left inside
set xlabel \"read position\"
set ylabel \"Coverage\"
set title \"Coverage $TITLE\"
#set auto y
#set bars 4.0
set xrange [ 0: $NUM_CYCLES ]
#set yrange [-15:45]
#set y2range [-15:45]
set xtics 1
set x2tics 1
#set ytics 2
#set y2tics 2
set tics out
set grid ytics
set style fill empty
plot '$FILENAME' using 1:2 with lines lt 1 lw 1 title 'Coverage'
"

echo "$GNUPLOTCMD" | gnuplot > "$OUTPUTFILE"