bio-ngs 0.3.2.alpha.01

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,100 @@
1
+ #
2
+ # converter.rb - convert qseq format to fastq
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>,
6
+ # Ranzani Valeria <ranzani@ingm.it>
7
+ # License:: The Ruby License
8
+ #
9
+ #
10
+
11
+
12
module Bio
  module Ngs
    module Converter
      # Converts qseq records (one whitespace-separated record per line)
      # into FASTQ text and keeps counters about the reads and bases that
      # passed or failed the filter flag stored in the last qseq column.
      class Qseq
        # Layouts accepted by #initialize and #type=.
        VALID_TYPES = [:pe, :se].freeze

        # Source buffer:
        #   String with \n as line separator
        #   File (reading)
        # Anything responding to #each_line works.
        attr_accessor :buffer
        attr_reader :type
        # Statistics about total reads and bases, passed filter or not.
        # Empty until #to_fastq has iterated the whole buffer.
        attr_reader :stats

        # default_type:: :pe (paired-end) or :se (single read); any other
        #                value leaves the type unset.
        def initialize(default_type=nil)
          @type = default_type if VALID_TYPES.include?(default_type)
          @stats = {}
        end

        # Sets the layout; values other than :pe / :se reset it to nil.
        def type=(data)
          @type = VALID_TYPES.include?(data) ? data : nil
        end

        # Yields each line converted to fastq. A line that did not pass the
        # filter yields nil, so remember to remove the nil values if you are
        # building an array. Blank lines are skipped.
        #
        # stats:: unused, kept only for backward compatibility.
        #
        # Returns the statistics Hash (also available through #stats), or an
        # Enumerator when no block is given.
        # Raises RuntimeError when the type has not been set.
        def to_fastq(stats=false)
          raise "Type of qseq not specifed." if type.nil?
          return enum_for(:to_fastq, stats) unless block_given?

          converter = "qseq2fastq_#{type}"
          reads     = Hash.new(0)
          bases     = Hash.new(0)
          b_quality = Hash.new(0)
          n_calls   = Hash.new(0)

          # each_line works for String, File and StringIO alike; IO#lines
          # was removed in Ruby 3.0.
          @buffer.each_line do |line|
            fields = line.split
            next if fields.empty?

            read    = send(converter, fields)
            outcome = read ? :passed : :rejected
            # to_s keeps truncated records from raising; they count as
            # rejected reads with zero bases.
            quality = fields[9].to_s

            reads[outcome]     += 1
            b_quality[outcome] += quality.count("B")
            # NOTE(review): "N" is counted in the quality column, exactly as
            # before; confirm whether the sequence column was intended.
            n_calls[outcome]   += quality.count("N")
            bases[outcome]     += quality.size

            yield read
          end

          @stats = {:reads_total => reads[:passed] + reads[:rejected],
                    :reads_passed => reads[:passed],
                    :reads_rejected => reads[:rejected],
                    :bases_passed_total => bases[:passed],
                    :bases_rejected_total => bases[:rejected],
                    :bases_passed_with_b_quality => b_quality[:passed],
                    :bases_rejected_with_b_quality => b_quality[:rejected],
                    :bases_passed_with_n => n_calls[:passed],
                    :bases_rejected_with_n => n_calls[:rejected]}
        end

        # Return the read in fastq from a paired-end read dataset, or nil
        # when the filter column is not "1".
        # qseq is an Array of strings generated from a raw line of a qseq file.
        def qseq2fastq_pe(qseq)
          return unless qseq[10] == "1"

          "@#{qseq[0]}:#{qseq[2]}:#{qseq[3]}:#{qseq[4]}:#{qseq[5]}#0/#{qseq[7]}\n#{qseq[8].gsub(/\./,'N')}\n+\n#{qseq[9]}"
        end

        # Return the read in fastq from a single read dataset, or nil when
        # the filter column is not "1".
        # qseq is an Array of strings generated from a raw line of a qseq file.
        def qseq2fastq_se(qseq)
          return unless qseq[10] == "1"

          "@#{qseq[0]}:#{qseq[2]}:#{qseq[3]}:#{qseq[4]}:#{qseq[5]}#0/\n#{qseq[8].gsub(/\./,'N')}\n+\n#{qseq[9]}"
        end

      end #Qseq
    end #Converter
  end #Ngs
end #Bio
@@ -0,0 +1,12 @@
1
class Thor
  module CoreExt #:nodoc:
    class HashWithIndifferentAccess
      # Builds and returns a new plain Hash holding the same values, with
      # every key converted through #to_sym. The receiver is left untouched.
      def symbolize_keys
        symbolized = {}
        each do |key, value|
          symbolized[key.to_sym] = value
        end
        symbolized
      end
    end #HashWithIndifferentAccess
  end # CoreExt
end #Thor
data/lib/bio/ngs/db.rb ADDED
@@ -0,0 +1,66 @@
1
+ #
2
+ #
3
+ # Copyright:: Copyright (C) 2011
4
+ # Francesco Strozzi <francesco.strozzi@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ #
8
+
9
module Bio
  module Ngs
    # Small helper around ActiveRecord: connects to one of the known
    # database types, runs its migrations, and offers bulk insert and
    # tab-separated export helpers.
    class Db

      require 'yaml'
      require 'active_support/inflector'
      DB_TYPES = [:ontology, :homology]

      # Open a connection to a database using ActiveRecord.
      #
      # args[0]:: database type, one of DB_TYPES
      # args[1]:: optional path to the YAML connection settings; defaults to
      #           conf/<type>_db.yml under the current working directory
      #
      # Raises ArgumentError for an unknown database type.
      def initialize(*args)
        @db_type = args[0]
        unless DB_TYPES.include? @db_type
          raise ArgumentError, "Invalid database type: #{@db_type}"
        end

        yaml_file = args[1] || Dir.pwd+"/conf/#{@db_type}_db.yml"
        @db = ActiveRecord::Base
        @db.establish_connection YAML.load_file(yaml_file)
        # ONLY FOR DEBUG
        #require 'logger'
        #ActiveRecord::Base.logger = Logger.new 'log/db.log'
        # Models are loaded only after the connection is configured.
        require File.expand_path(File.dirname(__FILE__)+"/db/models/#{@db_type}.rb")
      end

      # Runs AR migrations and creates the database tables for the
      # database type chosen at construction time.
      def create_tables(verbose=false)
        ActiveRecord::Migration.verbose = verbose
        ActiveRecord::Migrator.migrate(File.expand_path(File.dirname(__FILE__)+"/db/migrate/#{@db_type}"),nil)
      end

      # Export a database table into a tab-separated file: a header line
      # with the column names followed by one line per record.
      #
      # table::   table name (String or Symbol)
      # fileout:: path of the file to write (overwritten if it exists)
      def export(table,fileout)
        klass = @db.const_get(table.to_s.singularize.camelize)
        columns = klass.column_names
        # Block form guarantees the file is flushed and closed, even when
        # a record fails to serialize.
        File.open(fileout,"w") do |out|
          out.write columns.join("\t")+"\n"
          klass.find(:all).each do |record|
            attributes = record.attributes
            out.write columns.map {|c| attributes[c]}.join("\t")+"\n"
          end
        end
      end

      # Wrapper for DB transaction to execute many INSERT queries into a single transaction.
      # This can speed up things particularly for SQLite databases.
      #
      # table::  table name (String or Symbol) whose model opens the transaction
      # query::  SQL statement with bind placeholders
      # values:: Array of Arrays, one set of bind values per execution
      def insert_many(table,query,values=[])
        klass = @db.const_get(table.to_s.singularize.camelize)
        klass.transaction do
          values.each do |v|
            sql = @db.send(:sanitize_sql_array,[query]+v)
            @db.connection.execute(sql)
          end
        end
      end

    end
  end
end
@@ -0,0 +1,22 @@
1
# Migration for the blast_outputs table (one row per BLAST hit),
# indexed by query_id.
class CreateBlastout < ActiveRecord::Migration

  class << self
    # Creates the table and its query_id index.
    def up
      create_table(:blast_outputs) do |table|
        [:query_id, :target_id, :target_description].each do |column|
          table.string column
        end
        table.float :evalue, :precision => :double
        [:identity, :positive].each do |column|
          table.float column
        end
      end

      add_index(:blast_outputs, :query_id)
    end

    # Drops the table.
    def down
      drop_table(:blast_outputs)
    end
  end

end
@@ -0,0 +1,29 @@
1
# Migration for the go_annotations table; every column is a string and
# the table is indexed by entry_id.
class CreateGoannotation < ActiveRecord::Migration

  # Creates the table and its entry_id index.
  def self.up
    string_columns = [
      :db, :entry_id, :symbol, :qualifier, :go_id,
      :db_ref, :evidence, :additional_identifier, :aspect, :name,
      :synonym, :molecule_type, :taxon_id, :date, :assigned_by
    ]

    create_table(:go_annotations) do |table|
      string_columns.each { |column| table.string column }
    end

    add_index(:go_annotations, :entry_id)
  end

  # Drops the table.
  def self.down
    drop_table(:go_annotations)
  end

end
@@ -0,0 +1,18 @@
1
# Migration for the go table, indexed by go_id.
class CreateGo < ActiveRecord::Migration

  class << self
    # Creates the table and its go_id index.
    def up
      create_table(:go) do |table|
        [:go_id, :name, :namespace, :is_a].each do |column|
          table.string column
        end
      end

      add_index(:go, :go_id)
    end

    # Drops the table.
    def down
      drop_table(:go)
    end
  end

end
@@ -0,0 +1,17 @@
1
# Migration for the gene_gos join table linking genes and GO terms;
# both integer columns are indexed.
class CreateGeneGo < ActiveRecord::Migration

  # Creates the table and one index per column.
  def self.up
    key_columns = [:gene_id, :go_id]

    create_table(:gene_gos) do |table|
      key_columns.each { |column| table.integer column }
    end

    key_columns.each { |column| add_index(:gene_gos, column) }
  end

  # Drops the table.
  def self.down
    drop_table(:gene_gos)
  end

end
@@ -0,0 +1,16 @@
1
# Migration for the genes table, indexed by gene_id.
class CreateGene < ActiveRecord::Migration

  class << self
    # Creates the table and its gene_id index.
    def up
      create_table(:genes) do |table|
        [:gene_id, :library].each { |column| table.string column }
      end

      add_index(:genes, :gene_id)
    end

    # Drops the table.
    def down
      drop_table(:genes)
    end
  end

end
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,8 @@
1
+
2
# GO annotation row. It is tied to BLAST hits through its entry_id, which
# matches blast_outputs.target_id.
class GoAnnotation < ActiveRecord::Base
  # Mirror the keys used by BlastOutput.has_many; the default would look for
  # a blast_output_id column, which this model's has_many counterpart does
  # not use.
  belongs_to :blast_output, :foreign_key => "entry_id", :primary_key => "target_id"
end

# BLAST hit row; its annotations are the go_annotations whose entry_id
# equals this hit's target_id.
class BlastOutput < ActiveRecord::Base
  has_many :go_annotations, :foreign_key => "entry_id", :primary_key => "target_id"
end
@@ -0,0 +1,16 @@
1
# GO term, stored in the singular "go" table.
class Go < ActiveRecord::Base
  # table_name= works on both old and new ActiveRecord, whereas
  # set_table_name was deprecated and later removed.
  self.table_name = "go"
  # The join association is declared first: newer ActiveRecord versions
  # require it to exist before a :through association that uses it.
  has_many :gene_gos
  has_many :genes, :through => :gene_gos

end

# Gene, linked to GO terms through the gene_gos join model.
class Gene < ActiveRecord::Base
  has_many :gene_gos
  has_many :go, :through => :gene_gos
end

# Join model between Gene and Go.
class GeneGo < ActiveRecord::Base
  belongs_to :gene
  belongs_to :go
end
@@ -0,0 +1,161 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # FASTX-toolkit - FASTA/FASTQ preprocessing tools.
4
+ # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
5
+ #
6
+ # This program is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU Affero General Public License as
8
+ # published by the Free Software Foundation, either version 3 of the
9
+ # License, or (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU Affero General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Affero General Public License
17
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
18
+
19
# Print the help text to stdout and terminate the script.
function usage()
{
	printf '%s\n' \
		"Solexa-Quality BoxPlot plotter" \
		"Generates a solexa quality score box-plot graph " \
		"" \
		"Usage: $0 [-i INPUT.TXT] [-t TITLE] [-p] [-o OUTPUT]" \
		"" \
		" [-p] - Generate PostScript (.PS) file. Default is PNG image." \
		" [-i INPUT.TXT] - Input file. Should be the output of \"solexa_quality_statistics\" program." \
		" [-o OUTPUT] - Output file name. default is STDOUT." \
		" [-t TITLE] - Title (usually the solexa file name) - will be plotted on the graph." \
		""
	exit
}
33
+
34
#
# Input Data columns: #pos cnt min max sum mean Q1 med Q3 IQR lW rW A_Count C_Count G_Count T_Count N_Count
# As produced by "solexa_quality_statistics" program

TITLE=""                                   # default title is empty
FILENAME=""
OUTPUTTERM="set term png size 2048,768"    # default output terminal is "PNG"
OUTPUTFILE="/dev/stdout"                   # Default output file is simply "stdout"

# The leading ':' puts getopts in silent mode, so unknown options and missing
# option arguments both land in the catch-all branch below.
while getopts ":t:i:o:ph" Option
do
	case $Option in
		t ) TITLE="for $OPTARG" ;;
		i ) FILENAME=$OPTARG ;;
		o ) OUTPUTFILE="$OPTARG" ;;
		p ) OUTPUTTERM="set term postscript enhanced color \"Helvetica\" 8" ;;
		h ) usage ;;
		# 255 is the status the shell reports for 'exit -1'; spelled out
		# because exit statuses are limited to 0-255.
		* ) echo "unrecognized argument. use '-h' for usage information."; exit 255 ;;
	esac
done
shift $((OPTIND - 1))


# An input file is mandatory: show the help text (and exit) without one.
if [ -z "$FILENAME" ]; then
	usage
fi

if [ ! -r "$FILENAME" ]; then
	# Report the file given with -i; the positional parameters were shifted
	# away above, so $1 no longer names it.
	echo "Error: can't open input file ($FILENAME)." >&2
	exit 1
fi

##
## Input validation
## Too many users (in galaxy) try to plot a FASTQ file
## (without using the 'fastq statistics' tool first).
##
## gnuplot's error in that case is cryptic, and support emails are annoying.
##
## try to detect FASTA/FASTQ input, and give a detailed, easy-to-understand warning.
##
##
# Looks only at the first lines: an id line starting with '>' or '@'
# followed by a line made solely of A/C/G/T.
AWK_FASTX_DETECTION='
NR==1 && $0 ~ /^>/ { fasta_id = 1 }
NR==1 && $0 ~ /^@/ { fastq_id = 1 }
NR==2 && $0 ~ /^[ACGT][ACGT]*$/ { nucleotides = 1 }
NR>3 { exit }
END { if ( fasta_id && nucleotides ) { print "FASTA" }
      if ( fastq_id && nucleotides ) { print "FASTQ" }
}'

INPUT_TYPE=$(awk "$AWK_FASTX_DETECTION" "$FILENAME")

if [ "x$INPUT_TYPE" = "xFASTA" ] ; then
	#this doesn't even make sense: FASTA files don't contain any quality scores
	cat>&2<<EOF
Error: It looks like your input file is a FASTA file.

FASTA files do not contain quality scores, and can not be used with this tool.
EOF
	exit 1
fi
if [ "x$INPUT_TYPE" = "xFASTQ" ] ; then
	cat>&2<<EOF
Error: It looks like your input file is a FASTQ file.

This tool (fastq-quality-plot) can't use FASTQ files directly - it requires a tabular text file conaining summary statistic about your FASTQ file.

In Galaxy,
Please use the "Compute Quality Statistics" tool (in the "NGS: QC and Manipulation" category) to compute the quality statistics report, and then use this tool with the new statistics report.

On the command line,
Please use the "fastx_quality_stats" program to create the statistics report.
EOF
	exit 1
fi

##
## Even if this is not a FASTA/FASTQ file,
## users can still use incompatible input files.
## Try to detect it and abort with a warning.
# awk exits with status 2 only when the header line starts with the
# expected "column count min" fields; any other status means "not valid".
AWK_VALID_STAT='NR==1 && $1=="column" && $2=="count" && $3=="min" { exit 2 } NR>1 { exit }'

awk "$AWK_VALID_STAT" "$FILENAME"
if [ $? -ne 2 ] ; then
	cat>&2<<EOF
Error: Input file is not a valid statistics report.

This tool (fastq-quality-plot) requires a tabular text file conaining summary statistic about your FASTQ file.

In Galaxy,
Please use the "Compute Quality Statistics" tool (in the "NGS: QC and Manipulation" category) to compute the quality statistics report, and then use this tool with the new statistics report.

On the command line,
Please use the "fastx_quality_stats" program to create the statistics report.
EOF
	exit 1
fi


#Read number of cycles from the stats file (each line is a cycle, minus the header line)
#But for the graph, I want xrange to reach (num_cycles+1), so I don't subtract 1 now.
NUM_CYCLES=$(wc -l < "$FILENAME")

GNUPLOTCMD="
$OUTPUTTERM
set boxwidth 0.8
set size 1,1
set key Left inside
set xlabel \"read position\"
set ylabel \"Coverage\"
set title \"Coverage $TITLE\"
#set auto y
#set bars 4.0
set xrange [ 0: $NUM_CYCLES ]
#set yrange [-15:45]
#set y2range [-15:45]
set xtics 1
set x2tics 1
#set ytics 2
#set y2tics 2
set tics out
set grid ytics
set style fill empty
plot '$FILENAME' using 1:2 with lines lt 1 lw 1 title 'Coverage'
"

# Feed the generated commands to gnuplot and store the rendered graph.
echo "$GNUPLOTCMD" | gnuplot > "$OUTPUTFILE"