fastq-factory 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,16 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"
# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Runtime dependencies. The trollop constraint matches the version that
# bin/fastq-factory activates with `gem "trollop", "~> 2.0"`.
gem "trollop", "~> 2.0"
gem "nokogiri"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "bundler", "~> 1.1.5"
  gem "jeweler", "~> 1.8.4"
  gem "simplecov"
end
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 Anthony Underwood
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = fastq-factory
2
+
3
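+ A tool to process and QC fastq files from Illumina machines. It uses fastq_quality_trimmer and quake to trim and correct fastq files and then provides a quality assessment of the data.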
4
+
5
+ == Contributing to fastq-factory
6
+
7
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
8
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
9
+ * Fork the project.
10
+ * Start a feature/bugfix branch.
11
+ * Commit and push until you are happy with your contribution.
12
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2012 Anthony Underwood. See LICENSE.txt for
18
+ further details.
19
+
data/Rakefile ADDED
@@ -0,0 +1,54 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "fastq-factory"
18
+ gem.homepage = "http://github.com/hpa-bioinformatics/fastq-factory"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{A tool to process and QC fastq files from illumina machines}
21
+ gem.description = %Q{This tool can process fastq files, using fastq_quality_trimmer and quake to correct fastq files and then provide a quality asssessment of the data}
22
+ gem.email = "anthony.underwood@hpa.org.uk"
23
+ gem.authors = ["Anthony Underwood"]
24
+ gem.executables = ["fastq-factory"]
25
+ # dependencies defined in Gemfile
26
+ end
27
+ Jeweler::RubygemsDotOrgTasks.new
28
+
29
+ require 'rake/testtask'
30
+ Rake::TestTask.new(:test) do |test|
31
+ test.libs << 'lib' << 'test'
32
+ test.pattern = 'test/**/test_*.rb'
33
+ test.verbose = true
34
+ end
35
+
36
+ # require 'rcov/rcovtask'
37
+ # Rcov::RcovTask.new do |test|
38
+ # test.libs << 'test'
39
+ # test.pattern = 'test/**/test_*.rb'
40
+ # test.verbose = true
41
+ # test.rcov_opts << '--exclude "gems/*"'
42
+ # end
43
+
44
+ task :default => :test
45
+
46
+ require 'rdoc/task'
47
+ Rake::RDocTask.new do |rdoc|
48
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
49
+
50
+ rdoc.rdoc_dir = 'rdoc'
51
+ rdoc.title = "fastq-factory #{version}"
52
+ rdoc.rdoc_files.include('README*')
53
+ rdoc.rdoc_files.include('lib/**/*.rb')
54
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/bin/fastq-factory ADDED
@@ -0,0 +1,44 @@
1
#! /usr/bin/env ruby
# Command-line entry point for fastq-factory.
#
# Parses the command-line options, checks that every fastq file named via the
# sample map exists, then optionally trims and corrects the reads (--trim)
# and/or writes quality metrics (--metrics).
if ENV['fastq_factory_test_libpath']
  $LOAD_PATH.unshift(ENV['fastq_factory_test_libpath']) # read in lib path if testing
end
require 'fastq-factory'
gem "trollop", "~> 2.0"
require 'trollop'
options = Trollop::options do
  opt :trim, "Trim fastq files"
  opt :metrics, "Produce quality metrics"
  opt :directory, "The full path to the directory where the fastq and sample map files are stored", :type => :string, :required => true
  opt :sample_map_file, "A file where each line is a tab-delimited filename_prefix and sample name e.g H113880160-S2-E1_S3_L001\\tH113880160", :type => :string, :required => true
  opt :forward_reads_suffix, "The suffix for the forward read files such that the prefix listed in the sample map file and the suffix combined will make up the complete fastq filename", :type => :string, :required => true
  opt :reverse_reads_suffix, "The suffix for the reverse read files such that the prefix listed in the sample map file and the suffix combined will make up the complete fastq filename", :type => :string, :required => true
  opt :quality_scale, "The Phred quality scale, 33 or 64", :type => :integer, :required => true
  opt :fastq_quality_trimmer_dir, "The directory where fastq_quality_trimmer can be found", :short => "-F", :type => :string
  opt :quake_dir, "The directory where the quake executable can be found", :short => "-Q", :type => :string
end

# Only the two documented Phred offsets are meaningful to the external tools.
Trollop::die :quality_scale, "must be 33 or 64" unless [33, 64].include?(options[:quality_scale])

# Split a reads suffix such as "_R1_001.fastq" into ["_R1_001", "fastq"].
# Exits with a usage error (instead of a NoMethodError on nil) when the
# suffix has no file extension.
split_reads_suffix = lambda do |option_name|
  suffix_match = options[option_name].match(/(.+)\.(.+?)$/)
  Trollop::die option_name, "must include a file extension, e.g. _R1_001.fastq" unless suffix_match
  suffix_match.captures
end
forward_reads_suffix, forward_reads_file_extension = split_reads_suffix.call(:forward_reads_suffix)
reverse_reads_suffix, reverse_reads_file_extension = split_reads_suffix.call(:reverse_reads_suffix)

sample_map = extract_file_prefixes_and_sample_name(options[:sample_map_file], options[:directory])
# check sequence files exist
sample_map.keys.each do |sample_prefix|
  file_exists?("#{sample_prefix}#{options[:forward_reads_suffix]}", options[:directory])
  file_exists?("#{sample_prefix}#{options[:reverse_reads_suffix]}", options[:directory])
  # The trimmed and corrected files are inputs to --metrics. When --trim is
  # also given they are produced by this run, so they cannot be required to
  # exist up front.
  if options[:metrics] && !options[:trim]
    file_exists?("#{sample_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}", options[:directory])
    file_exists?("#{sample_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}", options[:directory])
  end
end

if options[:trim]
  fastq_quality_trimmer_path = find_executable("fastq_quality_trimmer", options[:fastq_quality_trimmer_dir])
  abort("Can not find fastq_quality_trimmer. You can specify the directory where this can be found using the -F option") unless fastq_quality_trimmer_path
  quake_path = find_executable("quake.py", options[:quake_dir])
  abort("Can not find quake.py. You can specify the directory where this can be found using the -Q option") unless quake_path
  write_out_fastq_trim_script
  trim_and_correct_fastqs(sample_map, options[:directory], forward_reads_suffix, forward_reads_file_extension, reverse_reads_suffix, reverse_reads_file_extension, options[:quality_scale], fastq_quality_trimmer_path, quake_path)
end

if options[:metrics]
  generate_quality_metrics(sample_map, options[:directory], options[:forward_reads_suffix], options[:reverse_reads_suffix], options[:quality_scale])
end
@@ -0,0 +1,52 @@
1
+ require 'rubygems'
2
+ require 'trim_and_correct'
3
+ require 'generate_quality_metrics'
4
# Read the sample map file and return a Hash of file prefix => sample name.
#
# Each non-blank line of the file is expected to hold a file name prefix and
# a sample name separated by a tab. Blank lines are skipped and Windows line
# endings are stripped, so neither produces a bogus entry.
#
# @param String sample_map_file name of the sample map file inside directory
# @param String directory the directory containing the sample map file
# @return Hash file prefix => sample name (sample name is nil when a line
#   has no tab-separated second field)
def extract_file_prefixes_and_sample_name(sample_map_file, directory)
  sample_map = {}
  File.foreach(File.join(directory, sample_map_file)) do |raw_line|
    sample_map_line = raw_line.chomp # removes "\n" as well as "\r\n"
    next if sample_map_line.strip.empty?

    file_prefix, sample_name = sample_map_line.split("\t")
    sample_map[file_prefix] = sample_name
  end
  sample_map
end
12
+
13
# Abort the program with an explanatory message unless the named file exists.
#
# Uses File.exist? because File.exists? was removed in Ruby 3.2.
#
# @param String filename the fastq file name (relative to directory)
# @param String directory the directory expected to contain the file
# @return nil when the file exists; otherwise the process is aborted
def file_exists?(filename, directory)
  return if File.exist?("#{directory}/#{filename}")

  abort("You specified a fastq file : #{filename}. This does not exist! Please check your sample map file")
end
16
+
17
# Locate an executable and return its path, or nil when it cannot be found.
#
# With an explicit directory only that directory is checked. Without one the
# PATH is searched (via which), followed by /usr/local/bin/<name> and
# /usr/local/<name>/<name>.
#
# A candidate must be a regular file as well as executable: File.executable?
# alone is also true for searchable directories.
#
# @param String executable_name the file name of the executable
# @param String directory optional directory to look in instead of searching
# @return String, nil the path to the executable or nil
def find_executable(executable_name, directory = nil)
  runnable = lambda { |candidate| File.file?(candidate) && File.executable?(candidate) }

  unless directory.nil?
    candidate = "#{directory}/#{executable_name}"
    return runnable.call(candidate) ? candidate : nil
  end

  # Search the PATH once and reuse the result.
  path_hit = which(executable_name)
  return path_hit if path_hit

  [
    "/usr/local/bin/#{executable_name}",
    "/usr/local/#{executable_name}/#{executable_name}"
  ].find { |candidate| runnable.call(candidate) }
end
36
+
37
# Method to return the path to a command if it is on the PATH.
# On Windows each extension listed in PATHEXT is tried as well.
#
# Returns nil when PATH is unset (rather than raising) and ignores
# directories, for which File.executable? is also true.
#
# @param String cmd the name of the command
# @return String, nil the first matching path or nil if none was found
def which(cmd)
  exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
  search_dirs = (ENV['PATH'] || '').split(File::PATH_SEPARATOR)
  search_dirs.each do |path|
    exts.each do |ext|
      exe = "#{path}/#{cmd}#{ext}"
      return exe if File.file?(exe) && File.executable?(exe)
    end
  end
  nil
end
49
+
50
# Copy the bundled fastq-remove-orphans.pl script (located next to this
# file) into /tmp/.
#
# The arguments are passed to cp as a list, so no shell is involved and an
# install path containing spaces or shell metacharacters is copied correctly.
#
# @return true, false, nil the result of Kernel#system for the cp command
def write_out_fastq_trim_script
  orphan_script = "#{File.dirname(__FILE__)}/fastq-remove-orphans.pl"
  system("cp", orphan_script, "/tmp/")
end
@@ -0,0 +1,130 @@
1
#! /usr/bin/perl
# Victor Amin 2009
#
# Split orphaned reads out of a pair of FASTQ files.
#
# Reads both FASTQ files into memory keyed by read identifier. Reads whose
# identifier occurs in both files are written to paired_<file>; the rest are
# written to orphaned_<file>. Counts are printed to STDOUT.
#
# The identifier is the header text after '@' up to the first space or '/',
# which covers both "@ident/1" and "@ident 1:N:0:x" style headers
# (AU 16/08/2012).

use strict;
use warnings;

use Getopt::Std;
$Getopt::Std::STANDARD_HELP_VERSION = 1;

# Parser states: collecting sequence lines or collecting quality lines.
use constant {
    SEQ_MODE  => 1,
    QUAL_MODE => 2,
};

my %options;
getopts('1:2:h', \%options);

if ($options{h} || !$options{1} || !$options{2}) {Getopt::Std->version_mess(); HELP_MESSAGE(\*STDERR)}
sub HELP_MESSAGE {
    my $fh = shift;
    print $fh "\nSplit orphaned reads out of a pair of FASTQ files. Counts to STDOUT.\n";
    print $fh "\tOPTIONS:\n";
    print $fh "\t-1 [FASTQ1] [required]\n";
    print $fh "\t-2 [FASTQ2] [required]\n";
    print $fh "\nProperly paired FASTQs are outputted to paired_*, orphans to orphaned_*\n\n";
    exit;
}

# Load one FASTQ stream.
# Returns (\%sequences, \%qualities, $read_count), both hashes keyed by the
# read identifier.
sub load_fastq {
    my ($fh) = @_;

    my (%sequences, %qualities);
    my $reads    = 0;
    my $mode     = SEQ_MODE;
    my $lines    = 0;    # sequence lines seen that still need a quality line
    my $ident;
    my $sequence = '';
    my $quality  = '';

    while (my $line = <$fh>) {
        chomp $line;
        if ($mode == SEQ_MODE && $line =~ /^\@/) {
            # Capture up to the first space or slash; also accepts headers
            # that contain neither, instead of silently reusing a stale $1.
            $line =~ /^\@([^ \/]+)/ or die "\nError reading file.\n";
            $ident = $1;
            $reads++;
        } elsif ($mode == SEQ_MODE && $line =~ /^\+/) {
            # Only a '+' line seen while reading sequence is the separator.
            # In quality mode a leading '+' is a valid quality character
            # (Q10 on the Phred+33 scale) and must be kept as data.
            $mode = QUAL_MODE;
        } elsif ($mode == SEQ_MODE) {
            $sequence .= $line;
            $lines++;
        } else {
            $quality .= $line;
            $lines--;
            if ($lines == 0) {
                # As many quality lines as sequence lines: record complete.
                $mode = SEQ_MODE;
                $sequences{$ident} = $sequence;
                $qualities{$ident} = $quality;
                $sequence = '';
                $quality  = '';
            }
        }
    }

    return (\%sequences, \%qualities, $reads);
}

open(my $fastq_1, '<', $options{1}) or die "\nThere was a problem opening the FASTQ file: $!\n";
open(my $fastq_2, '<', $options{2}) or die "\nThere was a problem opening the FASTQ file: $!\n";

open(my $paired_out_1, '>', "paired_$options{1}") or die "\nThere was a problem opening the output file: $!\n";
open(my $paired_out_2, '>', "paired_$options{2}") or die "\nThere was a problem opening the output file: $!\n";

open(my $orphaned_out_1, '>', "orphaned_$options{1}") or die "\nThere was a problem opening the output file: $!\n";
open(my $orphaned_out_2, '>', "orphaned_$options{2}") or die "\nThere was a problem opening the output file: $!\n";

print STDERR "\nLoading first FASTQ...\n";
my ($sequences_1, $qualities_1, $reads_1) = load_fastq($fastq_1);

print STDERR "\nLoading second FASTQ...\n";
my ($sequences_2, $qualities_2, $reads_2) = load_fastq($fastq_2);

close $fastq_1;
close $fastq_2;

# Counters start at 0 so the summary never prints an undefined value.
my $paired     = 0;
my $orphaned_1 = 0;
my $orphaned_2 = 0;

print STDERR "\nPrinting paired reads...\n";
for my $ident (keys %$sequences_1) {
    next unless exists $sequences_2->{$ident};
    print {$paired_out_1} "\@${ident} 1\n$sequences_1->{$ident}\n\+${ident}1\n$qualities_1->{$ident}\n";
    print {$paired_out_2} "\@${ident} 2\n$sequences_2->{$ident}\n\+${ident}2\n$qualities_2->{$ident}\n";
    # Whatever is left in the hashes afterwards is an orphan.
    delete $sequences_1->{$ident};
    delete $sequences_2->{$ident};
    $paired++;
}

print STDERR "\nPrinting orphaned reads...\n";
for my $ident (keys %$sequences_1) {
    print {$orphaned_out_1} "\@${ident} 1\n$sequences_1->{$ident}\n\+${ident}1\n$qualities_1->{$ident}\n";
    $orphaned_1++;
}

for my $ident (keys %$sequences_2) {
    print {$orphaned_out_2} "\@${ident} 2\n$sequences_2->{$ident}\n\+${ident}2\n$qualities_2->{$ident}\n";
    $orphaned_2++;
}

# Check close on the output files so a failed flush is not silently lost.
for my $out ($paired_out_1, $paired_out_2, $orphaned_out_1, $orphaned_out_2) {
    close $out or die "\nThere was a problem closing the output file: $!\n";
}

print "\nReads 1: $reads_1\nOrphans 1: $orphaned_1\nReads 2: $reads_2\nOrphaned 2: $orphaned_2\nPaired: $paired\n";
@@ -0,0 +1,49 @@
1
# Quality assessment of fastq files, built on the external
# fastx_quality_stats and wc commands.
module FastqAssessment
  require 'maths'
  # Per-file statistics. Note that position_where_quality_lt_20 holds the
  # position found using the quality_cutoff argument (default 30), despite
  # the "20" in its name.
  FastqStats = Struct.new("FastqStats", :read_position_stats, :mean_quality, :position_where_quality_lt_20, :percentage_compared_to_raw)
  # Quality statistics for one read position.
  ReadPositionStats = Struct.new("ReadPositionStats", :mean_quality, :median_quality, :first_quartile, :third_quartile)

  # Run fastx_quality_stats on a fastq file and summarise its output.
  #
  # @param String fastq_file path to the fastq file
  # @param Integer quality_scale 64 runs fastx_quality_stats with its default
  #   scale; any other value runs it with "-Q 33"
  # @param Numeric quality_cutoff mean quality a 5-position window must fall
  #   below for its position to be recorded
  # @return FastqStats with read_position_stats, mean_quality and (when such
  #   a window exists) position_where_quality_lt_20 filled in
  def generate_quality_stats_for_read(fastq_file, quality_scale, quality_cutoff = 30)
    fastq_stats = FastqStats.new
    fastq_stats.read_position_stats = []

    # Arguments are passed as a list so the file name is never interpreted
    # by a shell.
    command = ["fastx_quality_stats"]
    command += ["-Q", "33"] unless quality_scale == 64
    command += ["-i", fastq_file]
    qual_stats = IO.popen(command) { |io| io.read }

    qualities_at_read_positions = []
    qual_stats.split("\n").each do |read_position|
      columns = read_position.split(/\s+/)
      # Data rows start with the numeric read position. Anything else (the
      # column-title row) is skipped so it is not counted as a position with
      # quality 0.
      next unless columns.first =~ /\A\d+\z/

      mean_quality = columns[5].to_f
      median_quality = columns[7].to_f
      first_quartile = columns[6].to_f
      third_quartile = columns[8].to_f
      qualities_at_read_positions << mean_quality
      fastq_stats.read_position_stats << ReadPositionStats.new(
        mean_quality,
        median_quality,
        first_quartile,
        third_quartile
      )
    end

    # determine mean quality
    fastq_stats.mean_quality = qualities_at_read_positions.mean

    # Determine the first position (1-based start of a 5-position window,
    # ignoring the first 15 windows) where the window's mean quality drops
    # below quality_cutoff.
    qualities_at_read_positions.each_cons(5).with_index(1) do |window, position|
      if position > 15 && window.mean < quality_cutoff
        fastq_stats.position_where_quality_lt_20 = position
        break
      end
    end
    fastq_stats
  end

  # Size of a processed fastq file as a percentage of the raw file, based on
  # the line counts reported by wc -l.
  #
  # @param String processed_fastq_file path to the trimmed/corrected file
  # @param String raw_fastq_file path to the original file
  # @return Integer, nil the truncated percentage, or nil when the raw file
  #   has no lines (the ratio is undefined and could not be converted to an
  #   Integer)
  def percentage_compared_to_raw(processed_fastq_file, raw_fastq_file)
    line_count = lambda do |path|
      IO.popen(["wc", "-l", path]) { |io| io.read }.split(" ").first.to_f
    end
    file_lines_processed = line_count.call(processed_fastq_file)
    file_lines_raw = line_count.call(raw_fastq_file)
    return nil if file_lines_raw.zero?

    (file_lines_processed / file_lines_raw * 100).to_i
  end
end
@@ -0,0 +1,46 @@
1
+ require 'fastq_assessment'
2
+ include FastqAssessment
3
+ require 'miseq_run_stats'
4
+ include MiseqRunStats
5
+
6
# Collect run-level and per-sample quality statistics and write them to
# <directory>/summary_stats.txt as a tab-delimited table.
#
# Run statistics are taken from <directory>/ResequencingRunStatistics.xml
# when that file exists. Fastq statistics are generated for the raw forward
# and reverse files and for their ".trimmed.cor" counterparts.
#
# @param Hash sample_map file prefix => sample name
# @param String directory directory holding the fastq files; the summary is
#   written here too
# @param String forward_reads_suffix full suffix (including extension) of the
#   forward read files
# @param String reverse_reads_suffix full suffix (including extension) of the
#   reverse read files
# @param Integer quality_scale passed on to generate_quality_stats_for_read
def generate_quality_metrics(sample_map, directory, forward_reads_suffix, reverse_reads_suffix, quality_scale)
  run_stats_file = "#{directory}/ResequencingRunStatistics.xml"
  if File.exist?(run_stats_file)
    resequencing_run_stats = parse_resequencing_run_stats(run_stats_file, sample_map.values)
  else
    resequencing_run_stats = ResequencingRunStats.new
    resequencing_run_stats.sample_stats = {}
  end
  # Make sure every mapped sample has an entry, including samples that are
  # absent from the run statistics XML.
  sample_map.values.each do |sample_name|
    resequencing_run_stats.sample_stats[sample_name] ||= SampleStats.new
  end

  raw_suffixes = { "forward" => forward_reads_suffix, "reverse" => reverse_reads_suffix }

  sample_map.each do |read_file_prefix, sample_name|
    puts sample_name
    fastq_stats = {}
    raw_suffixes.each do |direction, raw_suffix|
      # e.g. "_R1_001.fastq" -> "_R1_001.trimmed.cor.fastq"
      trimmed_corrected_suffix = raw_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed.cor\2')
      raw_file = "#{directory}/#{read_file_prefix}#{raw_suffix}"
      processed_file = "#{directory}/#{read_file_prefix}#{trimmed_corrected_suffix}"

      fastq_stats[direction] = generate_quality_stats_for_read(raw_file, quality_scale)
      processed_stats = generate_quality_stats_for_read(processed_file, quality_scale)
      processed_stats.percentage_compared_to_raw = percentage_compared_to_raw(processed_file, raw_file)
      fastq_stats["#{direction}-trim_corrected"] = processed_stats
    end
    resequencing_run_stats.sample_stats[sample_name].fastq_stats = fastq_stats
  end

  # print out data; the block form closes the file even if writing fails
  File.open("#{directory}/summary_stats.txt", "w") do |output_file|
    # print headers
    output_file.puts "run name\tnumber of bases(Gb)\tnumber of clusters\tsample name\tdirection\tnumber of clusters\tnumber of forward reads aligned\tnumber of reverse reads aligned\tcoverage\tnumber of snps\tmean quality\tread base where qual falls below 30\tpercent reduction compared to raw"
    # File.basename also copes with relative paths and trailing slashes.
    output_file.puts "#{File.basename(directory)}\t#{resequencing_run_stats.number_of_bases}\t#{resequencing_run_stats.number_of_clusters}"
    # sort_by(&:to_s) orders String keys like sort but tolerates a nil key.
    resequencing_run_stats.sample_stats.keys.sort_by(&:to_s).each do |sample_name|
      sample_stats = resequencing_run_stats.sample_stats[sample_name]
      output_file.puts "\t\t\t#{sample_name}\t\t#{sample_stats.number_of_clusters}\t#{sample_stats.number_of_forward_reads_aligned}\t#{sample_stats.number_of_reverse_reads_aligned}\t#{sample_stats.coverage}\t#{sample_stats.number_of_snps}"
      # Samples present only in the XML have no fastq statistics.
      next if sample_stats.fastq_stats.nil?

      ["forward", "reverse", "forward-trim_corrected", "reverse-trim_corrected"].each do |direction|
        fastq_stats = sample_stats.fastq_stats[direction]
        output_file.puts "\t\t\t\t#{direction}\t\t\t\t\t\t#{fastq_stats.mean_quality}\t#{fastq_stats.position_where_quality_lt_20}\t#{fastq_stats.percentage_compared_to_raw}"
      end
    end
  end
end
data/lib/maths.rb ADDED
@@ -0,0 +1,25 @@
1
# Add methods to Enumerable, which makes them available to Array
module Enumerable
  # sum of a collection of numbers
  #
  # Only defined when the running Ruby does not already provide
  # Enumerable#sum, so the built-in (which also accepts an initial value and
  # a block) is never replaced by this narrower version.
  unless method_defined?(:sum)
    def sum
      inject(0) { |acc, i| acc + i }
    end
  end

  # mean of a collection of numbers
  #
  # Works on any Enumerable: the elements are materialised with to_a because
  # Enumerable itself does not define length.
  # @return Float the arithmetic mean (NaN for an empty collection)
  def mean
    values = to_a
    values.sum / values.length.to_f
  end

  # variance of a collection of numbers
  #
  # NOTE: despite the method name, the sum of squared deviations is divided
  # by n (not n - 1), i.e. this is the population form of the variance.
  # @return Float
  def sample_variance
    values = to_a
    mean = values.mean
    squared_deviations = values.inject(0) { |acc, i| acc + (i - mean)**2 }
    1 / values.length.to_f * squared_deviations
  end

  # standard deviation of a collection of numbers (square root of
  # sample_variance)
  # @return Float
  def standard_deviation
    Math.sqrt(sample_variance)
  end
end
@@ -0,0 +1,29 @@
1
# Parsing of the MiSeq ResequencingRunStatistics.xml report.
module MiseqRunStats
  require 'nokogiri'
  # Run-wide statistics plus a Hash of sample name => SampleStats.
  ResequencingRunStats = Struct.new("ResequencingRunStats", :number_of_bases, :number_of_clusters, :sample_stats)
  # Statistics for one sample; fastq_stats is filled in by other code.
  SampleStats = Struct.new("SampleStats", :sample_name, :number_of_clusters, :number_of_forward_reads_aligned, :number_of_reverse_reads_aligned, :coverage, :number_of_snps, :fastq_stats)

  # Parse a resequencing run statistics XML file.
  #
  # @param String xml_file path to the XML file
  # @param Array original_sample_names optional list of sample names; when
  #   given, each sample name from the XML is replaced by the first original
  #   name it contains. A sample that matches none keeps its XML name, so
  #   unmatched samples do not all collapse onto a single nil key.
  # @return ResequencingRunStats number_of_bases is the YieldInBasesPF value
  #   divided by 1,000,000,000; the per-sample values are kept as the text
  #   found in the XML
  def parse_resequencing_run_stats(xml_file, original_sample_names = nil)
    xml = Nokogiri::XML(File.read(xml_file))
    resequencing_run_stats = ResequencingRunStats.new

    xml.search('//RunStats').each do |run_stats|
      resequencing_run_stats.number_of_bases = run_stats.search('YieldInBasesPF').text.to_f/1000000000
      resequencing_run_stats.number_of_clusters = run_stats.search('NumberOfClustersPF').text.to_i
    end

    resequencing_run_stats.sample_stats = {}
    xml.search('//SummarizedSampleStatisics').each do |summarised_samples_stats|
      sample_name = summarised_samples_stats.search('SampleName').text
      unless original_sample_names.nil?
        # Regexp.escape makes names containing regex metacharacters (".",
        # "+", "(" ...) match literally.
        matching_name = original_sample_names.find do |original_sample_name|
          sample_name =~ /#{Regexp.escape(original_sample_name.to_s)}/
        end
        sample_name = matching_name unless matching_name.nil?
      end

      sample_stats = SampleStats.new
      sample_stats.sample_name = sample_name
      sample_stats.number_of_clusters = summarised_samples_stats.search('NumberOfClustersPF').text
      sample_stats.number_of_forward_reads_aligned = summarised_samples_stats.search('ClustersAlignedR1').text
      sample_stats.number_of_reverse_reads_aligned = summarised_samples_stats.search('ClustersAlignedR2').text
      sample_stats.coverage = summarised_samples_stats.search('WeightedCoverage').text
      sample_stats.number_of_snps = summarised_samples_stats.search('NumberHomozygousSNPs').text
      resequencing_run_stats.sample_stats[sample_name] = sample_stats
    end
    resequencing_run_stats
  end
end
@@ -0,0 +1,46 @@
1
# Trim, orphan-filter and quake-correct the paired fastq files of every
# sample, leaving <prefix><suffix>.trimmed.cor.<extension> files in directory.
#
# Steps, each run for every sample:
# 1. quality-trim both read files and remove orphaned reads
# 2. run quake on the paired, trimmed files
# 3. remove reads orphaned by the correction
# 4. delete intermediate files and rename the final paired files
#
# External commands are invoked with argument lists (no shell), so file
# names are passed through unchanged.
#
# @param Hash sample_map file prefix => sample name
# @param String directory directory containing the fastq files
# @param String forward_reads_suffix forward suffix without file extension
# @param String forward_reads_file_extension forward file extension
# @param String reverse_reads_suffix reverse suffix without file extension
# @param String reverse_reads_file_extension reverse file extension
# @param Integer quality_scale passed to the trimmer (-Q) and quake (-q)
# @param String fastq_quality_trimmer_path path to fastq_quality_trimmer
# @param String quake_path path to quake.py
def trim_and_correct_fastqs(sample_map, directory, forward_reads_suffix, forward_reads_file_extension, reverse_reads_suffix, reverse_reads_file_extension, quality_scale, fastq_quality_trimmer_path, quake_path)
  # Resolve before chdir so "#{directory}/..." paths stay valid even when a
  # relative directory was given.
  directory = File.expand_path(directory)
  Dir.chdir(directory)

  # Location the script is copied to by write_out_fastq_trim_script.
  orphan_script = "/tmp/fastq-remove-orphans.pl"

  # [file name stem, file extension] for the forward and reverse reads
  read_files_for = lambda do |sample_file_prefix|
    [
      ["#{sample_file_prefix}#{forward_reads_suffix}", forward_reads_file_extension],
      ["#{sample_file_prefix}#{reverse_reads_suffix}", reverse_reads_file_extension]
    ]
  end

  # trimming
  sample_map.each do |sample_file_prefix, sample_name|
    puts "Trimming files for #{sample_name}"

    (forward_stem, forward_ext), (reverse_stem, reverse_ext) = read_files_for.call(sample_file_prefix)
    [[forward_stem, forward_ext], [reverse_stem, reverse_ext]].each do |stem, ext|
      system(fastq_quality_trimmer_path,
             "-i", "#{directory}/#{stem}.#{ext}",
             "-o", "#{directory}/#{stem}.trimmed.#{ext}",
             "-t", "20", "-l", "90", "-Q", quality_scale.to_s, "-v")
    end
    system("perl", orphan_script,
           "-1", "#{forward_stem}.trimmed.#{forward_ext}",
           "-2", "#{reverse_stem}.trimmed.#{reverse_ext}")
  end

  # quake correction
  sample_map.each do |sample_file_prefix, _sample_name|
    (forward_stem, forward_ext), (reverse_stem, reverse_ext) = read_files_for.call(sample_file_prefix)
    # write file for quake
    File.open("quake_file_list.txt", "w") do |output_file|
      output_file.puts "paired_#{forward_stem}.trimmed.#{forward_ext} paired_#{reverse_stem}.trimmed.#{reverse_ext}"
    end
    # run quake
    system(quake_path, "-f", "quake_file_list.txt", "-k", "15", "-q", quality_scale.to_s)
  end

  # Remove reads orphaned by the correction, using the same copy of the
  # script as the trimming step rather than a machine-specific path.
  sample_map.each do |sample_file_prefix, _sample_name|
    (forward_stem, forward_ext), (reverse_stem, reverse_ext) = read_files_for.call(sample_file_prefix)
    system("perl", orphan_script,
           "-1", "paired_#{forward_stem}.trimmed.cor.#{forward_ext}",
           "-2", "paired_#{reverse_stem}.trimmed.cor.#{reverse_ext}")
  end

  # cleanup and rename files
  sample_map.each do |sample_file_prefix, _sample_name|
    read_files = read_files_for.call(sample_file_prefix)

    # Each intermediate file uses the extension of its own read direction.
    intermediate_files = read_files.flat_map do |stem, ext|
      [
        "#{stem}.trimmed.#{ext}",
        "paired_#{stem}.trimmed.#{ext}",
        "orphaned_#{stem}.trimmed.#{ext}",
        "error_model.paired_#{stem}.trimmed.txt",
        "paired_#{stem}.trimmed.stats.txt",
        "paired_#{stem}.trimmed.cor_single.#{ext}",
        "paired_#{stem}.trimmed.cor.#{ext}",
        "orphaned_paired_#{stem}.trimmed.cor.#{ext}"
      ]
    end
    # -f: some intermediates may be missing if an earlier step failed
    system("rm", "-f", "--", *intermediate_files)

    read_files.each do |stem, ext|
      system("mv", "paired_paired_#{stem}.trimmed.cor.#{ext}", "#{stem}.trimmed.cor.#{ext}")
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'fastq-factory'
16
+
17
+ class Test::Unit::TestCase
18
+ end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
+ class TestFastqFactory < Test::Unit::TestCase
4
+ should "probably rename this file and start testing for real" do
5
+ flunk "hey buddy, you should probably rename this file and start testing for real"
6
+ end
7
+ end
metadata ADDED
@@ -0,0 +1,180 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fastq-factory
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Anthony Underwood
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-08-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: trollop
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: nokogiri
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: shoulda
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rdoc
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: '3.12'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: '3.12'
78
+ - !ruby/object:Gem::Dependency
79
+ name: bundler
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ~>
84
+ - !ruby/object:Gem::Version
85
+ version: 1.1.5
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.1.5
94
+ - !ruby/object:Gem::Dependency
95
+ name: jeweler
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ~>
100
+ - !ruby/object:Gem::Version
101
+ version: 1.8.4
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: 1.8.4
110
+ - !ruby/object:Gem::Dependency
111
+ name: simplecov
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ description: This tool can process fastq files, using fastq_quality_trimmer and quake
127
+ to correct fastq files and then provide a quality asssessment of the data
128
+ email: anthony.underwood@hpa.org.uk
129
+ executables:
130
+ - fastq-factory
131
+ extensions: []
132
+ extra_rdoc_files:
133
+ - LICENSE.txt
134
+ - README.rdoc
135
+ files:
136
+ - .document
137
+ - Gemfile
138
+ - LICENSE.txt
139
+ - README.rdoc
140
+ - Rakefile
141
+ - VERSION
142
+ - bin/fastq-factory
143
+ - lib/fastq-factory.rb
144
+ - lib/fastq-remove-orphans.pl
145
+ - lib/fastq_assessment.rb
146
+ - lib/generate_quality_metrics.rb
147
+ - lib/maths.rb
148
+ - lib/miseq_run_stats.rb
149
+ - lib/trim_and_correct.rb
150
+ - test/helper.rb
151
+ - test/test_fastq-factory.rb
152
+ homepage: http://github.com/hpa-bioinformatics/fastq-factory
153
+ licenses:
154
+ - MIT
155
+ post_install_message:
156
+ rdoc_options: []
157
+ require_paths:
158
+ - lib
159
+ required_ruby_version: !ruby/object:Gem::Requirement
160
+ none: false
161
+ requirements:
162
+ - - ! '>='
163
+ - !ruby/object:Gem::Version
164
+ version: '0'
165
+ segments:
166
+ - 0
167
+ hash: 2485524748993201700
168
+ required_rubygems_version: !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ! '>='
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ requirements: []
175
+ rubyforge_project:
176
+ rubygems_version: 1.8.19
177
+ signing_key:
178
+ specification_version: 3
179
+ summary: A tool to process and QC fastq files from illumina machines
180
+ test_files: []