fastq-factory 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +16 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/bin/fastq-factory +44 -0
- data/lib/fastq-factory.rb +52 -0
- data/lib/fastq-remove-orphans.pl +130 -0
- data/lib/fastq_assessment.rb +49 -0
- data/lib/generate_quality_metrics.rb +46 -0
- data/lib/maths.rb +25 -0
- data/lib/miseq_run_stats.rb +29 -0
- data/lib/trim_and_correct.rb +46 -0
- data/test/helper.rb +18 -0
- data/test/test_fastq-factory.rb +7 -0
- metadata +180 -0
data/.document
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Runtime dependencies required to use the gem.
# Example:
#   gem "activesupport", ">= 2.3.5"
# trollop is pinned to the same series that bin/fastq-factory activates.
gem "trollop", "~> 2.0"
gem "nokogiri"

# Development dependencies.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "bundler", "~> 1.1.5"
  gem "jeweler", "~> 1.8.4"
  gem "simplecov"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Anthony Underwood
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
= fastq-factory
|
2
|
+
|
3
|
+
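fastq-factory processes and quality-checks FASTQ files from Illumina machines. It uses fastq_quality_trimmer and quake to trim and correct paired-end reads, and can then produce a quality assessment of the raw and corrected data.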
|
4
|
+
|
5
|
+
== Contributing to fastq-factory
|
6
|
+
|
7
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
8
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
|
9
|
+
* Fork the project.
|
10
|
+
* Start a feature/bugfix branch.
|
11
|
+
* Commit and push until you are happy with your contribution.
|
12
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
13
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if changing them is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
|
14
|
+
|
15
|
+
== Copyright
|
16
|
+
|
17
|
+
Copyright (c) 2012 Anthony Underwood. See LICENSE.txt for
|
18
|
+
further details.
|
19
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# encoding: utf-8

# Rake tasks for building, testing and documenting the fastq-factory gem.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "fastq-factory"
  gem.homepage = "http://github.com/hpa-bioinformatics/fastq-factory"
  gem.license = "MIT"
  gem.summary = %Q{A tool to process and QC fastq files from illumina machines}
  gem.description = %Q{This tool can process fastq files, using fastq_quality_trimmer and quake to correct fastq files and then provide a quality assessment of the data}
  gem.email = "anthony.underwood@hpa.org.uk"
  gem.authors = ["Anthony Underwood"]
  gem.executables = ["fastq-factory"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# require 'rcov/rcovtask'
# Rcov::RcovTask.new do |test|
#   test.libs << 'test'
#   test.pattern = 'test/**/test_*.rb'
#   test.verbose = true
#   test.rcov_opts << '--exclude "gems/*"'
# end

task :default => :test

require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "fastq-factory #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/bin/fastq-factory
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#! /usr/bin/env ruby
# Command-line entry point for fastq-factory.
#
# Parses the options, checks that the fastq files named by the sample map
# exist, then optionally trims/corrects the reads (--trim) and/or writes
# quality metrics (--metrics).
if ENV['fastq_factory_test_libpath']
  $LOAD_PATH.unshift(ENV['fastq_factory_test_libpath']) # read in lib path if testing
end
require 'fastq-factory'
gem "trollop", "~> 2.0"
require 'trollop'
options = Trollop::options do
  opt :trim, "Trim fastq files"
  opt :metrics, "Produce quality metrics"
  opt :directory, "The full path to the directory where the fastq and sample map files are stored", :type => :string, :required => true
  opt :sample_map_file, "A file where each line is a tab-delimited filename_prefix and sample name e.g H113880160-S2-E1_S3_L001\\tH113880160", :type => :string, :required => true
  opt :forward_reads_suffix, "The suffix for the forward read files such that the prefix listed in the sample map file and the suffix combined will make up the complete fastq filename", :type => :string, :required => true
  opt :reverse_reads_suffix, "The suffix for the reverse read files such that the prefix listed in the sample map file and the suffix combined will make up the complete fastq filename", :type => :string, :required => true
  opt :quality_scale, "The Phred quality scale, 33 or 64", :type => :integer, :required => true
  opt :fastq_quality_trimmer_dir, "The directory where fastq_quality_trimmer can be found", :short => "-F", :type => :string
  opt :quake_dir, "The directory where the quake executable can be found", :short => "-Q", :type => :string
end

abort("The quality scale must be 33 or 64") unless [33, 64].include?(options[:quality_scale])

# Splits a reads suffix such as "_R1_001.fastq" into ["_R1_001", "fastq"].
# Aborts with a readable message when the suffix has no file extension,
# instead of failing with NoMethodError on a nil match.
split_reads_suffix = lambda do |suffix, description|
  match = suffix.match(/(.+)\.(.+?)$/)
  abort("The #{description} reads suffix '#{suffix}' must end with a file extension, e.g. _R1_001.fastq") unless match
  match.captures
end

forward_reads_suffix, forward_reads_file_extension = split_reads_suffix.call(options[:forward_reads_suffix], "forward")
reverse_reads_suffix, reverse_reads_file_extension = split_reads_suffix.call(options[:reverse_reads_suffix], "reverse")

sample_map = extract_file_prefixes_and_sample_name(options[:sample_map_file], options[:directory])
# check sequence files exist
sample_map.keys.each do |sample_prefix|
  file_exists?("#{sample_prefix}#{options[:forward_reads_suffix]}", options[:directory])
  file_exists?("#{sample_prefix}#{options[:reverse_reads_suffix]}", options[:directory])
  # The trimmed/corrected files only have to pre-exist when metrics are
  # requested without --trim; with --trim they are produced below.
  if options[:metrics] && !options[:trim]
    file_exists?("#{sample_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}", options[:directory])
    file_exists?("#{sample_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}", options[:directory])
  end
end
if options[:trim]
  fastq_quality_trimmer_path = find_executable("fastq_quality_trimmer", options[:fastq_quality_trimmer_dir])
  abort("Can not find fastq_quality_trimmer. You can specifiy the directory where this can be found using the -F option") unless fastq_quality_trimmer_path
  quake_path = find_executable("quake.py", options[:quake_dir])
  abort("Can not find quake.py. You can specifiy the directory where this can be found using the -Q option") unless quake_path
  write_out_fastq_trim_script
  trim_and_correct_fastqs(sample_map, options[:directory], forward_reads_suffix, forward_reads_file_extension, reverse_reads_suffix, reverse_reads_file_extension, options[:quality_scale], fastq_quality_trimmer_path, quake_path)
end

if options[:metrics]
  generate_quality_metrics(sample_map, options[:directory], options[:forward_reads_suffix], options[:reverse_reads_suffix], options[:quality_scale])
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'trim_and_correct'
|
3
|
+
require 'generate_quality_metrics'
|
4
|
+
# Reads the sample map file and returns a Hash of file prefix => sample name.
#
# Each non-blank line is expected to hold a file prefix and a sample name
# separated by a tab. Blank lines are skipped and surrounding whitespace
# (including the "\r" of Windows line endings) is stripped from both fields.
# A line without a tab maps its prefix to nil.
#
# @param sample_map_file [String] name of the map file inside +directory+
# @param directory [String] directory containing the map file
# @return [Hash] file prefix => sample name
def extract_file_prefixes_and_sample_name(sample_map_file, directory)
  sample_map = Hash.new
  File.read(File.join(directory, sample_map_file)).each_line do |sample_map_line|
    next if sample_map_line.strip.empty?
    file_prefix, sample_name = sample_map_line.chomp.split("\t").map { |field| field.strip }
    sample_map[file_prefix] = sample_name
  end
  sample_map
end
|
12
|
+
|
13
|
+
# Aborts the program with an explanatory message unless +filename+ exists
# inside +directory+.
#
# Uses File.exist? because File.exists? was deprecated and later removed.
#
# @param filename [String] file name relative to +directory+
# @param directory [String] directory expected to contain the file
# @return [true] when the file exists (otherwise the process is aborted)
def file_exists?(filename, directory)
  unless File.exist?("#{directory}/#{filename}")
    abort("You specified a fastq file : #{filename}. This does not exist! Please check your sample map file")
  end
  true
end
|
16
|
+
|
17
|
+
# Locates an executable and returns its path, or nil when it cannot be found.
#
# With no +directory+ the search order is: the PATH (via +which+), then
# /usr/local/bin/<name>, then /usr/local/<name>/<name>. With a +directory+
# only <directory>/<name> is considered.
#
# A candidate must be a regular file as well as executable; a directory
# with the search bit set also satisfies File.executable? and is rejected.
#
# @param executable_name [String] name of the program
# @param directory [String, nil] directory to look in instead of the defaults
# @return [String, nil] path to the executable
def find_executable(executable_name, directory = nil)
  candidates =
    if directory.nil?
      [
        which(executable_name), # nil when not on the PATH; removed by compact below
        "/usr/local/bin/#{executable_name}",
        "/usr/local/#{executable_name}/#{executable_name}"
      ]
    else
      ["#{directory}/#{executable_name}"]
    end
  candidates.compact.find { |candidate| File.file?(candidate) && File.executable?(candidate) }
end
|
36
|
+
|
37
|
+
# Returns the path to a command if it is on the PATH (also works on Windows,
# where each extension listed in PATHEXT is tried).
#
# Returns nil when the command is not found or when PATH is not set.
# Only regular files are accepted: directories also satisfy File.executable?.
#
# @param [String] cmd the name of the command
# @return [String, nil] path of the first matching executable
def which(cmd)
  exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
  # to_s guards against PATH being unset (nil).
  ENV['PATH'].to_s.split(File::PATH_SEPARATOR).each do |path|
    exts.each do |ext|
      exe = "#{path}/#{cmd}#{ext}"
      return exe if File.file?(exe) && File.executable?(exe)
    end
  end
  nil
end
|
49
|
+
|
50
|
+
# Copies the bundled fastq-remove-orphans.pl script (located next to this
# file) into /tmp/.
#
# The arguments are passed to cp as a list rather than as one shell string,
# so an installation path containing spaces or shell metacharacters works.
#
# @return [true, false, nil] the result of Kernel#system for the cp command
def write_out_fastq_trim_script
  system("cp", "#{File.dirname(__FILE__)}/fastq-remove-orphans.pl", "/tmp/")
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
#! /usr/bin/perl
# Victor Amin 2009
#
# Splits a pair of FASTQ files (-1 and -2) into reads whose identifier occurs
# in both files (written to paired_<name>) and reads that occur in only one
# file (written to orphaned_<name>). Both files are held in memory.
# Progress messages go to STDERR, the final counts to STDOUT.

use strict;
use warnings;

use Getopt::Std;
$Getopt::Std::STANDARD_HELP_VERSION = 1;

my %options;
getopts('1:2:h', \%options);

if ($options{h} || !$options{1} || !$options{2}) {Getopt::Std->version_mess(); HELP_MESSAGE(\*STDERR)}

# Prints the usage text to the given filehandle and exits.
sub HELP_MESSAGE {
    my $fh = shift;
    print $fh "\nSplit ophaned reads out of a pair of FASTQ files. Counts to STDOUT.\n";
    print $fh "\tOPTIONS:\n";
    print $fh "\t-1 [FASTQ1] [required]\n";
    print $fh "\t-2 [FASTQ2] [required]\n";
    print $fh "\nProperly paired FASTQs are outputted to paired_*, orphans to orphaned_*\n\n";
    exit;
}

# Opens a FASTQ file for reading (three-argument open, so the file name is
# never interpreted as a mode or pipe).
sub open_input {
    my ($path) = @_;
    open(my $fh, '<', $path) or die "\nThere was a problem opening the FASTQ file: $!\n";
    return $fh;
}

# Opens an output file for writing.
sub open_output {
    my ($path) = @_;
    open(my $fh, '>', $path) or die "\nThere was a problem opening the output file: $!\n";
    return $fh;
}

# Reads every record from an open FASTQ handle.
# Returns (number of reads, \%sequences, \%qualities), both hashes keyed by
# the read identifier: the header text after '@' up to the first space or '/'.
# Multi-line records are supported by counting sequence lines and consuming
# the same number of quality lines.
sub load_fastq {
    my ($fh) = @_;
    my ($SEQ_MODE, $QUAL_MODE) = (1, 2);
    my $mode  = $SEQ_MODE;
    my $lines = 0;
    my $reads = 0;
    my ($ident, $sequence, $quality) = (undef, '', '');
    my (%sequences, %qualities);

    while (<$fh>) {
        chomp;
        if (/^\@/ && $mode == $SEQ_MODE) {
            # Capture rather than substitute (AU 16/08/2012). Headers look like
            # "@ident 1:N:0:x" or "@ident/1"; a header with neither a space nor
            # a slash is accepted whole instead of reusing a stale capture.
            /^\@([^ \/]+)/ or die "\nError reading file.\n";
            $ident = $1;
            $reads++;
        } elsif (/^\+/ && $mode == $SEQ_MODE) {
            # Only a '+' seen while reading sequence is the separator; inside
            # the quality block '+' is an ordinary quality character.
            $mode = $QUAL_MODE;
        } elsif ($mode == $SEQ_MODE) {
            $sequence .= $_;
            $lines++;
        } else {
            $quality .= $_;
            $lines--;
            if ($lines <= 0) {
                # Record complete (<= also covers a zero-length read).
                $mode = $SEQ_MODE;
                $lines = 0;
                $sequences{$ident} = $sequence;
                $qualities{$ident} = $quality;
                $sequence = '';
                $quality  = '';
            }
        }
    }
    return ($reads, \%sequences, \%qualities);
}

# Writes one FASTQ record for the given mate number (1 or 2).
sub print_read {
    my ($out, $ident, $mate, $sequence, $quality) = @_;
    print $out "\@${ident} ${mate}\n${sequence}\n\+${ident}${mate}\n${quality}\n";
}

my $fastq_1 = open_input($options{1});
my $fastq_2 = open_input($options{2});

my $paired_out_1 = open_output("paired_$options{1}");
my $paired_out_2 = open_output("paired_$options{2}");

my $orphaned_out_1 = open_output("orphaned_$options{1}");
my $orphaned_out_2 = open_output("orphaned_$options{2}");

print STDERR "\nLoading first FASTQ...\n";
my ($reads_1, $sequences_1, $qualities_1) = load_fastq($fastq_1);
close $fastq_1;

print STDERR "\nLoading second FASTQ...\n";
my ($reads_2, $sequences_2, $qualities_2) = load_fastq($fastq_2);
close $fastq_2;

# Counters start at 0 so the summary never prints an undefined value.
my ($paired, $orphaned_1, $orphaned_2) = (0, 0, 0);

print STDERR "\nPrinting paired reads...\n";
for my $ident (keys %$sequences_1) {
    if (exists $sequences_2->{$ident}) {
        print_read($paired_out_1, $ident, 1, $sequences_1->{$ident}, $qualities_1->{$ident});
        print_read($paired_out_2, $ident, 2, $sequences_2->{$ident}, $qualities_2->{$ident});
        # Whatever is left in the hashes afterwards is an orphan.
        delete $sequences_1->{$ident};
        delete $sequences_2->{$ident};
        $paired++;
    }
}

print STDERR "\nPrinting orphaned reads...\n";
for my $ident (keys %$sequences_1) {
    print_read($orphaned_out_1, $ident, 1, $sequences_1->{$ident}, $qualities_1->{$ident});
    $orphaned_1++;
}

for my $ident (keys %$sequences_2) {
    print_read($orphaned_out_2, $ident, 2, $sequences_2->{$ident}, $qualities_2->{$ident});
    $orphaned_2++;
}

for my $out ($paired_out_1, $paired_out_2, $orphaned_out_1, $orphaned_out_2) {
    close $out or die "\nThere was a problem writing an output file: $!\n";
}

print "\nReads 1: $reads_1\nOrphans 1: $orphaned_1\nReads 2: $reads_2\nOrphaned 2: $orphaned_2\nPaired: $paired\n";
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Quality assessment of fastq files based on the output of the external
# fastx_quality_stats and wc commands.
module FastqAssessment
  require 'maths'
  require 'shellwords'
  FastqStats = Struct.new("FastqStats", :read_position_stats, :mean_quality, :position_where_quality_lt_20, :percentage_compared_to_raw)
  ReadPositionStats = Struct.new("ReadPositionStats", :mean_quality, :median_quality, :first_quartile, :third_quartile)

  # Runs fastx_quality_stats on a fastq file and summarises its output.
  #
  # Every output line contributes one ReadPositionStats entry; columns 5, 7,
  # 6 and 8 (zero-based, whitespace separated) are read as mean, median,
  # first quartile and third quartile.
  #
  # NOTE(review): every line of the command output is treated as data. If
  # fastx_quality_stats emits a header row it is parsed as a position with
  # all-zero values - confirm against the tool's output format.
  #
  # @param fastq_file [String] path to the fastq file
  # @param quality_scale [Integer] 64 runs the tool with its default scale,
  #   any other value adds "-Q 33"
  # @param quality_cutoff [Numeric] threshold for the sliding-window check
  # @return [FastqStats] with read_position_stats, mean_quality and, when a
  #   window falls below the cutoff, position_where_quality_lt_20
  def generate_quality_stats_for_read(fastq_file, quality_scale, quality_cutoff = 30)
    fastq_stats = FastqStats.new
    fastq_stats.read_position_stats = Array.new
    # The file name is shell-escaped so paths with spaces or metacharacters
    # are passed through to the tool unchanged.
    escaped_fastq_file = Shellwords.escape(fastq_file)
    if quality_scale == 64
      raw_output = `fastx_quality_stats -i #{escaped_fastq_file}`
    else
      raw_output = `fastx_quality_stats -Q 33 -i #{escaped_fastq_file}`
    end

    qualities_at_read_positions = Array.new
    raw_output.split("\n").each do |read_position|
      fields = read_position.split(/\s+/)
      mean_quality = fields[5].to_f
      median_quality = fields[7].to_f
      first_quartile = fields[6].to_f
      third_quartile = fields[8].to_f
      qualities_at_read_positions << mean_quality
      fastq_stats.read_position_stats << ReadPositionStats.new(
        mean_quality,
        median_quality,
        first_quartile,
        third_quartile
      )
    end

    # determine mean quality
    fastq_stats.mean_quality = qualities_at_read_positions.mean

    # determine the first 5-position window (after the 15th window) whose
    # mean quality drops below quality_cutoff
    position = 0
    qualities_at_read_positions.each_cons(5) do |window|
      position += 1
      if window.mean < quality_cutoff && position > 15
        fastq_stats.position_where_quality_lt_20 = position
        break
      end
    end
    fastq_stats
  end

  # Number of lines in the processed file as a whole-number percentage of
  # the number of lines in the raw file (both counted with "wc -l").
  #
  # @param processed_fastq_file [String] path to the processed fastq file
  # @param raw_fastq_file [String] path to the raw fastq file
  # @return [Integer] truncated percentage; 0 when the raw file has no lines
  #   (avoids converting NaN/Infinity to an integer, which raises)
  def percentage_compared_to_raw(processed_fastq_file, raw_fastq_file)
    file_lines_processed = `wc -l #{Shellwords.escape(processed_fastq_file)}`.split(" ").first.to_f
    file_lines_raw = `wc -l #{Shellwords.escape(raw_fastq_file)}`.split(" ").first.to_f
    return 0 if file_lines_raw.zero?
    (file_lines_processed / file_lines_raw * 100).to_i
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'fastq_assessment'
|
2
|
+
include FastqAssessment
|
3
|
+
require 'miseq_run_stats'
|
4
|
+
include MiseqRunStats
|
5
|
+
|
6
|
+
# Collects quality statistics for every sample and writes them, tab
# separated, to <directory>/summary_stats.txt.
#
# Run-level and per-sample alignment figures are taken from
# <directory>/ResequencingRunStatistics.xml when that file exists; otherwise
# those columns stay empty. For every sample the raw and the
# ".trimmed.cor" forward/reverse fastq files are assessed.
#
# @param sample_map [Hash] file prefix => sample name
# @param directory [String] directory holding the fastq files
# @param forward_reads_suffix [String] forward suffix including the extension
# @param reverse_reads_suffix [String] reverse suffix including the extension
# @param quality_scale [Integer] Phred quality scale passed to the assessment
# @return [nil]
def generate_quality_metrics(sample_map, directory, forward_reads_suffix, reverse_reads_suffix, quality_scale)
  run_stats_file = "#{directory}/ResequencingRunStatistics.xml"
  if File.exist?(run_stats_file)
    resequencing_run_stats = parse_resequencing_run_stats(run_stats_file, sample_map.values)
  else
    resequencing_run_stats = ResequencingRunStats.new
    resequencing_run_stats.sample_stats = Hash.new
  end
  # Make sure every sample of the map has an entry, even when the run
  # statistics file does not mention it.
  sample_map.values.each do |sample_name|
    resequencing_run_stats.sample_stats[sample_name] ||= SampleStats.new
  end

  forward_reads_trimmed_corrected_suffix = forward_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed.cor\2')
  reverse_reads_trimmed_corrected_suffix = reverse_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed.cor\2')

  sample_map.each do |read_file_prefix, sample_name|
    puts sample_name
    raw_forward = "#{directory}/#{read_file_prefix}#{forward_reads_suffix}"
    raw_reverse = "#{directory}/#{read_file_prefix}#{reverse_reads_suffix}"
    corrected_forward = "#{directory}/#{read_file_prefix}#{forward_reads_trimmed_corrected_suffix}"
    corrected_reverse = "#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_corrected_suffix}"

    fastq_stats = Hash.new
    fastq_stats["forward"] = generate_quality_stats_for_read(raw_forward, quality_scale)
    fastq_stats["reverse"] = generate_quality_stats_for_read(raw_reverse, quality_scale)
    fastq_stats["forward-trim_corrected"] = generate_quality_stats_for_read(corrected_forward, quality_scale)
    fastq_stats["forward-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw(corrected_forward, raw_forward)
    fastq_stats["reverse-trim_corrected"] = generate_quality_stats_for_read(corrected_reverse, quality_scale)
    fastq_stats["reverse-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw(corrected_reverse, raw_reverse)
    resequencing_run_stats.sample_stats[sample_name].fastq_stats = fastq_stats
  end

  # print out data; the block form closes the file even if writing fails
  File.open("#{directory}/summary_stats.txt", "w") do |output_file|
    # print headers
    output_file.puts "run name\tnumber of bases(Gb)\tnumber of clusters\tsample name\tdirection\tnumber of clusters\tnumber of forward reads aligned\tnumber of reverse reads aligned\tcoverage\tnumber of snps\tmean quality\tread base where qual falls below 30\tpercent reduction compared to raw"
    # File.basename gives the run name for relative paths and paths with a
    # trailing slash as well.
    output_file.puts "#{File.basename(directory)}\t#{resequencing_run_stats.number_of_bases}\t#{resequencing_run_stats.number_of_clusters}"
    resequencing_run_stats.sample_stats.keys.sort.each do |sample_name|
      sample_stats = resequencing_run_stats.sample_stats[sample_name]
      output_file.puts "\t\t\t#{sample_name}\t\t#{sample_stats.number_of_clusters}\t#{sample_stats.number_of_forward_reads_aligned}\t#{sample_stats.number_of_reverse_reads_aligned}\t#{sample_stats.coverage}\t#{sample_stats.number_of_snps}"
      ["forward", "reverse", "forward-trim_corrected", "reverse-trim_corrected"].each do |direction|
        # Samples that only occur in the run statistics have no fastq stats.
        fastq_stats = (sample_stats.fastq_stats || {})[direction]
        next if fastq_stats.nil?
        output_file.puts "\t\t\t\t#{direction}\t\t\t\t\t\t#{fastq_stats.mean_quality}\t#{fastq_stats.position_where_quality_lt_20}\t#{fastq_stats.percentage_compared_to_raw}"
      end
    end
  end
  nil
end
|
data/lib/maths.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Add statistics methods to Enumerable, which makes them available to Array
# and every other class that includes Enumerable.
module Enumerable
  # Sum of a collection of numbers.
  #
  # Only defined when Ruby does not already provide Enumerable#sum, so the
  # built-in version (which also accepts a block) is never replaced.
  unless method_defined?(:sum)
    def sum
      inject(0) { |acc, i| acc + i }
    end
  end

  # Mean of a collection of numbers, as a Float (NaN for an empty collection).
  # Uses count rather than length so it works for any Enumerable.
  def mean
    sum / count.to_f
  end

  # Variance of a collection of numbers.
  #
  # NOTE(review): despite the name this divides by n, not n - 1, i.e. it
  # computes the population variance. Kept as is because results depend on it.
  def sample_variance
    mean = self.mean
    sum_of_squares = inject(0) { |acc, i| acc + (i - mean)**2 }
    1 / count.to_f * sum_of_squares
  end

  # Standard deviation of a collection of numbers (square root of
  # sample_variance).
  def standard_deviation
    Math.sqrt(sample_variance)
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Parsing of the ResequencingRunStatistics.xml file written for a MiSeq run.
module MiseqRunStats
  require 'nokogiri'
  ResequencingRunStats = Struct.new("ResequencingRunStats", :number_of_bases, :number_of_clusters, :sample_stats)
  SampleStats = Struct.new("SampleStats", :sample_name, :number_of_clusters, :number_of_forward_reads_aligned, :number_of_reverse_reads_aligned, :coverage, :number_of_snps, :fastq_stats)

  # Reads run-level and per-sample statistics from the XML file.
  #
  # When +original_sample_names+ is given, each sample name found in the XML
  # is replaced by the first original name it contains; XML samples that
  # match none of the names are skipped (they used to be stored under a nil
  # key). Names are compared as plain text, not as regular expressions, so
  # characters such as "." or "+" in a sample name are matched literally.
  #
  # @param xml_file [String] path to ResequencingRunStatistics.xml
  # @param original_sample_names [Array<String>, nil] sample names to map to
  # @return [ResequencingRunStats] sample_stats is a Hash of
  #   sample name => SampleStats
  def parse_resequencing_run_stats(xml_file, original_sample_names = nil)
    xml = Nokogiri::XML(File.read(xml_file))
    resequencing_run_stats = ResequencingRunStats.new

    xml.search('//RunStats').each do |run_stats|
      # yield is reported in bases; convert to gigabases
      resequencing_run_stats.number_of_bases = run_stats.search('YieldInBasesPF').text.to_f/1000000000
      resequencing_run_stats.number_of_clusters = run_stats.search('NumberOfClustersPF').text.to_i
    end

    resequencing_run_stats.sample_stats = Hash.new
    # NOTE(review): the element name is spelled "Statisics" here - presumably
    # this matches the spelling used in the XML file; confirm before changing.
    xml.search('//SummarizedSampleStatisics').each do |summarised_samples_stats|
      sample_name = summarised_samples_stats.search('SampleName').text
      unless original_sample_names.nil?
        # alter sample name to original sample name if supplied as an array
        sample_name = original_sample_names.find { |original_sample_name| sample_name.include?(original_sample_name.to_s) }
        next if sample_name.nil?
      end

      sample_stats = SampleStats.new
      sample_stats.sample_name = sample_name
      sample_stats.number_of_clusters = summarised_samples_stats.search('NumberOfClustersPF').text
      sample_stats.number_of_forward_reads_aligned = summarised_samples_stats.search('ClustersAlignedR1').text
      sample_stats.number_of_reverse_reads_aligned = summarised_samples_stats.search('ClustersAlignedR2').text
      sample_stats.coverage = summarised_samples_stats.search('WeightedCoverage').text
      sample_stats.number_of_snps = summarised_samples_stats.search('NumberHomozygousSNPs').text
      resequencing_run_stats.sample_stats[sample_name] = sample_stats
    end
    resequencing_run_stats
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Trims and error-corrects the paired fastq files of every sample.
#
# Steps, each run for all samples before the next one starts:
# 1. quality-trim both read files with fastq_quality_trimmer and split off
#    orphaned reads with /tmp/fastq-remove-orphans.pl,
# 2. correct the paired reads with quake,
# 3. split off orphans from the corrected files,
# 4. delete the intermediate files and rename the final files to
#    <prefix><suffix>.trimmed.cor.<extension>.
#
# The process changes its working directory to +directory+. External
# programs are invoked with argument lists (no shell), so file names with
# spaces or shell metacharacters are handled.
#
# @param sample_map [Hash] file prefix => sample name
# @param directory [String] directory containing the fastq files
# @param forward_reads_suffix [String] forward suffix without the extension
# @param forward_reads_file_extension [String] forward file extension
# @param reverse_reads_suffix [String] reverse suffix without the extension
# @param reverse_reads_file_extension [String] reverse file extension
# @param quality_scale [Integer] Phred quality scale (33 or 64)
# @param fastq_quality_trimmer_path [String] path to fastq_quality_trimmer
# @param quake_path [String] path to quake.py
# @return [Hash] the sample map
def trim_and_correct_fastqs(sample_map, directory, forward_reads_suffix, forward_reads_file_extension, reverse_reads_suffix, reverse_reads_file_extension, quality_scale, fastq_quality_trimmer_path, quake_path)
  # The script is copied here by write_out_fastq_trim_script; both orphan
  # passes use this copy (the second pass used to point at a path that only
  # existed on the original author's machine).
  orphan_script = "/tmp/fastq-remove-orphans.pl"
  Dir.chdir(directory)

  # [stem, extension] for the forward and the reverse file of a sample
  read_files_for = lambda do |sample_file_prefix|
    [
      ["#{sample_file_prefix}#{forward_reads_suffix}", forward_reads_file_extension],
      ["#{sample_file_prefix}#{reverse_reads_suffix}", reverse_reads_file_extension]
    ]
  end

  # trimming
  sample_map.each do |sample_file_prefix, sample_name|
    puts "Trimming files for #{sample_name}"

    read_files = read_files_for.call(sample_file_prefix)
    read_files.each do |stem, extension|
      system(fastq_quality_trimmer_path,
             "-i", "#{directory}/#{stem}.#{extension}",
             "-o", "#{directory}/#{stem}.trimmed.#{extension}",
             "-t", "20", "-l", "90", "-Q", quality_scale.to_s, "-v")
    end
    (forward_stem, forward_extension), (reverse_stem, reverse_extension) = read_files
    system("perl", orphan_script,
           "-1", "#{forward_stem}.trimmed.#{forward_extension}",
           "-2", "#{reverse_stem}.trimmed.#{reverse_extension}")
  end

  # quake correction
  sample_map.each do |sample_file_prefix, sample_name|
    (forward_stem, forward_extension), (reverse_stem, reverse_extension) = read_files_for.call(sample_file_prefix)
    # write file for quake
    File.open("quake_file_list.txt", "w") do |output_file|
      output_file.puts "paired_#{forward_stem}.trimmed.#{forward_extension} paired_#{reverse_stem}.trimmed.#{reverse_extension}"
    end
    # run quake
    system(quake_path, "-f", "quake_file_list.txt", "-k", "15", "-q", quality_scale.to_s)
  end

  sample_map.each do |sample_file_prefix, sample_name|
    (forward_stem, forward_extension), (reverse_stem, reverse_extension) = read_files_for.call(sample_file_prefix)
    system("perl", orphan_script,
           "-1", "paired_#{forward_stem}.trimmed.cor.#{forward_extension}",
           "-2", "paired_#{reverse_stem}.trimmed.cor.#{reverse_extension}")
  end

  # cleanup and rename files
  sample_map.each do |sample_file_prefix, sample_name|
    read_files_for.call(sample_file_prefix).each do |stem, extension|
      intermediate_files = [
        "#{stem}.trimmed.#{extension}",
        "paired_#{stem}.trimmed.#{extension}",
        "orphaned_#{stem}.trimmed.#{extension}",
        "error_model.paired_#{stem}.trimmed.txt",
        "paired_#{stem}.trimmed.stats.txt",
        # each read file uses its own extension here (the reverse file used
        # to be looked up with the forward extension)
        "paired_#{stem}.trimmed.cor_single.#{extension}",
        "paired_#{stem}.trimmed.cor.#{extension}",
        "orphaned_paired_#{stem}.trimmed.cor.#{extension}"
      ]
      intermediate_files.each do |intermediate_file|
        File.delete(intermediate_file) if File.exist?(intermediate_file)
      end

      final_source = "paired_paired_#{stem}.trimmed.cor.#{extension}"
      if File.exist?(final_source)
        File.rename(final_source, "#{stem}.trimmed.cor.#{extension}")
      else
        warn "Expected corrected file #{final_source} was not produced"
      end
    end
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Shared set-up for the test suite: activates the bundled gems, loads the
# test libraries and puts lib/ and test/ on the load path.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Report what is missing and exit with Bundler's own status code.
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'
require 'shoulda'

# lib/ is added first and the test directory second, so the test directory
# ends up at the front of the load path.
test_directory = File.dirname(__FILE__)
[File.join(test_directory, '..', 'lib'), test_directory].each do |load_directory|
  $LOAD_PATH.unshift(load_directory)
end
require 'fastq-factory'

# Reopened so project-wide test helpers can be added here.
class Test::Unit::TestCase
end
|
metadata
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fastq-factory
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Anthony Underwood
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-08-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: trollop
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: nokogiri
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: shoulda
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: rdoc
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '3.12'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '3.12'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: bundler
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: 1.1.5
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.1.5
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: jeweler
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ~>
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.8.4
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ~>
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.8.4
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: simplecov
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
description: This tool can process fastq files, using fastq_quality_trimmer and quake
|
127
|
+
to correct fastq files and then provide a quality asssessment of the data
|
128
|
+
email: anthony.underwood@hpa.org.uk
|
129
|
+
executables:
|
130
|
+
- fastq-factory
|
131
|
+
extensions: []
|
132
|
+
extra_rdoc_files:
|
133
|
+
- LICENSE.txt
|
134
|
+
- README.rdoc
|
135
|
+
files:
|
136
|
+
- .document
|
137
|
+
- Gemfile
|
138
|
+
- LICENSE.txt
|
139
|
+
- README.rdoc
|
140
|
+
- Rakefile
|
141
|
+
- VERSION
|
142
|
+
- bin/fastq-factory
|
143
|
+
- lib/fastq-factory.rb
|
144
|
+
- lib/fastq-remove-orphans.pl
|
145
|
+
- lib/fastq_assessment.rb
|
146
|
+
- lib/generate_quality_metrics.rb
|
147
|
+
- lib/maths.rb
|
148
|
+
- lib/miseq_run_stats.rb
|
149
|
+
- lib/trim_and_correct.rb
|
150
|
+
- test/helper.rb
|
151
|
+
- test/test_fastq-factory.rb
|
152
|
+
homepage: http://github.com/hpa-bioinformatics/fastq-factory
|
153
|
+
licenses:
|
154
|
+
- MIT
|
155
|
+
post_install_message:
|
156
|
+
rdoc_options: []
|
157
|
+
require_paths:
|
158
|
+
- lib
|
159
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
160
|
+
none: false
|
161
|
+
requirements:
|
162
|
+
- - ! '>='
|
163
|
+
- !ruby/object:Gem::Version
|
164
|
+
version: '0'
|
165
|
+
segments:
|
166
|
+
- 0
|
167
|
+
hash: 2485524748993201700
|
168
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
requirements: []
|
175
|
+
rubyforge_project:
|
176
|
+
rubygems_version: 1.8.19
|
177
|
+
signing_key:
|
178
|
+
specification_version: 3
|
179
|
+
summary: A tool to process and QC fastq files from illumina machines
|
180
|
+
test_files: []
|