fastq-factory 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/bin/fastq-factory +4 -4
- data/fastq-factory.gemspec +2 -2
- data/lib/fastq-factory.rb +6 -2
- data/lib/generate_quality_metrics.rb +35 -19
- data/lib/miseq_run_stats.rb +41 -2
- data/lib/trim_and_correct.rb +27 -19
- metadata +3 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.4
|
1
|
+
0.1.5
|
data/bin/fastq-factory
CHANGED
@@ -26,8 +26,8 @@ reverse_reads_suffix, reverse_reads_file_extension = options[:reverse_reads_suff
|
|
26
26
|
sample_map = extract_file_prefixes_and_sample_name(options[:sample_map_file], options[:directory])
|
27
27
|
# check sequence files exist
|
28
28
|
sample_map.keys.each do |sample_prefix|
|
29
|
-
file_exists?("#{sample_prefix}#{options[:forward_reads_suffix]}"
|
30
|
-
file_exists?("#{sample_prefix}#{options[:reverse_reads_suffix]}"
|
29
|
+
file_exists?(options[:directory], "#{sample_prefix}#{options[:forward_reads_suffix]}")
|
30
|
+
file_exists?(options[:directory], "#{sample_prefix}#{options[:reverse_reads_suffix]}")
|
31
31
|
end
|
32
32
|
if options[:trim]
|
33
33
|
fastq_quality_trimmer_path = find_executable("fastq_quality_trimmer", options[:fastq_quality_trimmer_dir])
|
@@ -40,8 +40,8 @@ end
|
|
40
40
|
|
41
41
|
if options[:metrics]
|
42
42
|
sample_map.keys.each do |sample_prefix|
|
43
|
-
file_exists?("#{sample_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}",
|
44
|
-
file_exists?("#{sample_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}",
|
43
|
+
file_exists?(options[:directory], "#{sample_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}", "#{sample_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
|
44
|
+
file_exists?(options[:directory], "#{sample_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}", "#{sample_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}")
|
45
45
|
end
|
46
46
|
generate_quality_metrics(sample_map, options[:directory], options[:forward_reads_suffix], options[:reverse_reads_suffix], options[:quality_scale], options[:metrics_quality_cutoff])
|
47
47
|
end
|
data/fastq-factory.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "fastq-factory"
|
8
|
-
s.version = "0.1.4"
|
8
|
+
s.version = "0.1.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Anthony Underwood"]
|
12
|
-
s.date = "2012-08-
|
12
|
+
s.date = "2012-08-29"
|
13
13
|
s.description = "This tool can process fastq files, using fastq_quality_trimmer and quake to correct fastq files and then provide a quality asssessment of the data"
|
14
14
|
s.email = "anthony.underwood@hpa.org.uk"
|
15
15
|
s.executables = ["fastq-factory"]
|
data/lib/fastq-factory.rb
CHANGED
@@ -10,8 +10,12 @@ def extract_file_prefixes_and_sample_name(sample_map_file, directory)
|
|
10
10
|
return sample_map
|
11
11
|
end
|
12
12
|
|
13
|
-
def file_exists?(
|
14
|
-
|
13
|
+
def file_exists?(directory, *filenames)
|
14
|
+
at_least_one_file_found = false
|
15
|
+
filenames.each do |filename|
|
16
|
+
at_least_one_file_found = true if File.exists?("#{directory}/#{filename}")
|
17
|
+
end
|
18
|
+
abort("You specified a file(s): #{filenames.join(", ")}. At least one of these must exist! Please check your sample map file") unless at_least_one_file_found
|
15
19
|
end
|
16
20
|
|
17
21
|
def find_executable(executable_name, directory = nil)
|
@@ -6,41 +6,57 @@ include MiseqRunStats
|
|
6
6
|
def generate_quality_metrics(sample_map, directory, forward_reads_suffix, reverse_reads_suffix, quality_scale, quality_cutoff)
|
7
7
|
if File.exists?("#{directory}/ResequencingRunStatistics.xml")
|
8
8
|
puts "Assessing quality from Miseq run stats file"
|
9
|
-
|
9
|
+
run_stats = parse_run_stats("#{directory}/ResequencingRunStatistics.xml", sample_map.values)
|
10
|
+
elsif File.exists?("#{directory}/AssemblyRunStatistics.xml")
|
11
|
+
puts "Assessing quality from Miseq run stats file"
|
12
|
+
run_stats = parse_assembly_run_stats("#{directory}/AssemblyRunStatistics.xml", sample_map.values)
|
10
13
|
else
|
11
|
-
|
12
|
-
|
14
|
+
run_stats = ResequencingRunStats.new
|
15
|
+
run_stats.sample_stats = Hash.new
|
13
16
|
sample_map.values.each do |sample_name|
|
14
|
-
|
17
|
+
run_stats.sample_stats[sample_name] = ResequencingSampleStats.new
|
15
18
|
end
|
16
19
|
end
|
17
20
|
|
21
|
+
forward_reads_trimmed_suffix = forward_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed\2')
|
22
|
+
reverse_reads_trimmed_suffix = reverse_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed\2')
|
18
23
|
|
19
24
|
forward_reads_trimmed_corrected_suffix = forward_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed.cor\2')
|
20
25
|
reverse_reads_trimmed_corrected_suffix = reverse_reads_suffix.sub(/(.+)(\..+?)$/, '\1.trimmed.cor\2')
|
21
26
|
|
22
|
-
|
23
27
|
sample_map.each do |read_file_prefix, sample_name|
|
24
28
|
puts "Assesing quality for #{sample_name}"
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
run_stats.sample_stats[sample_name].fastq_stats = Hash.new
|
30
|
+
run_stats.sample_stats[sample_name].fastq_stats["forward"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{forward_reads_suffix}",quality_scale, quality_cutoff)
|
31
|
+
run_stats.sample_stats[sample_name].fastq_stats["reverse"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{reverse_reads_suffix}",quality_scale, quality_cutoff)
|
32
|
+
if File.exists?("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_corrected_suffix}")
|
33
|
+
run_stats.sample_stats[sample_name].fastq_stats["forward-trim_corrected"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_corrected_suffix}",quality_scale, quality_cutoff)
|
34
|
+
run_stats.sample_stats[sample_name].fastq_stats["forward-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_corrected_suffix}", "#{directory}/#{read_file_prefix}#{forward_reads_suffix}")
|
35
|
+
run_stats.sample_stats[sample_name].fastq_stats["reverse-trim_corrected"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_corrected_suffix}",quality_scale, quality_cutoff)
|
36
|
+
run_stats.sample_stats[sample_name].fastq_stats["reverse-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw("#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_corrected_suffix}", "#{directory}/#{read_file_prefix}#{reverse_reads_suffix}")
|
37
|
+
else
|
38
|
+
run_stats.sample_stats[sample_name].fastq_stats["forward-trim_corrected"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_suffix}",quality_scale, quality_cutoff)
|
39
|
+
run_stats.sample_stats[sample_name].fastq_stats["forward-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw("#{directory}/#{read_file_prefix}#{forward_reads_trimmed_suffix}", "#{directory}/#{read_file_prefix}#{forward_reads_suffix}")
|
40
|
+
run_stats.sample_stats[sample_name].fastq_stats["reverse-trim_corrected"] = generate_quality_stats_for_read("#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_suffix}",quality_scale, quality_cutoff)
|
41
|
+
run_stats.sample_stats[sample_name].fastq_stats["reverse-trim_corrected"].percentage_compared_to_raw = percentage_compared_to_raw("#{directory}/#{read_file_prefix}#{reverse_reads_trimmed_suffix}", "#{directory}/#{read_file_prefix}#{reverse_reads_suffix}")
|
42
|
+
end
|
43
|
+
|
32
44
|
end
|
33
45
|
# print out data
|
34
46
|
output_file = File.open("#{directory}/summary_stats.txt", "w")
|
35
47
|
# print headers
|
36
|
-
output_file.puts "run name\tnumber of bases(Gb)\tnumber of clusters\tsample name\tdirection\tnumber of clusters\tnumber of forward reads aligned\tnumber of reverse reads aligned\tcoverage\tnumber of snps\tmean quality\tread base where qual falls below 30\tpercent reduction compared to raw"
|
37
|
-
output_file.puts "#{directory.match(/.*\/(.+?)$/).captures.first}\t#{
|
38
|
-
|
39
|
-
sample_stats =
|
40
|
-
|
48
|
+
output_file.puts "run name\tnumber of bases(Gb)\tnumber of clusters\tsample name\tdirection\tnumber of clusters\tnumber of forward reads aligned\tnumber of reverse reads aligned\tcoverage\tnumber of snps\tnumber of contigs\tmean contig size\tn50\tnumber of bases\tmean quality\tread base where qual falls below 30\tpercent reduction compared to raw"
|
49
|
+
output_file.puts "#{directory.match(/.*\/(.+?)$/).captures.first}\t#{run_stats.number_of_bases}\t#{run_stats.number_of_clusters}"
|
50
|
+
run_stats.sample_stats.keys.sort.each do |sample_name|
|
51
|
+
sample_stats = run_stats.sample_stats[sample_name]
|
52
|
+
if sample_stats.class == Struct::ResequencingSampleStats
|
53
|
+
output_file.puts "\t\t\t#{sample_name}\t\t#{sample_stats.number_of_clusters}\t#{sample_stats.number_of_forward_reads_aligned}\t#{sample_stats.number_of_reverse_reads_aligned}\t#{sample_stats.coverage}\t#{sample_stats.number_of_snps}"
|
54
|
+
elsif sample_stats.class == Struct::AssemblySampleStats
|
55
|
+
output_file.puts "\t\t\t#{sample_name}\t\t#{sample_stats.number_of_clusters}\t\t\t\t\t#{sample_stats.number_of_contigs}\t#{sample_stats.mean_contig_size}\t#{sample_stats.n50}\t#{sample_stats.number_of_bases}"
|
56
|
+
end
|
41
57
|
["forward", "reverse", "forward-trim_corrected", "reverse-trim_corrected"].each do |direction|
|
42
|
-
fastq_stats =
|
43
|
-
output_file.puts "\t\t\t\t#{direction}\t\t\t\t\t\t#{fastq_stats.mean_quality}\t#{fastq_stats.position_where_quality_lt_20}\t#{fastq_stats.percentage_compared_to_raw}"
|
58
|
+
fastq_stats = run_stats.sample_stats[sample_name].fastq_stats[direction]
|
59
|
+
output_file.puts "\t\t\t\t#{direction}\t\t\t\t\t\t\t\t\t\t#{fastq_stats.mean_quality}\t#{fastq_stats.position_where_quality_lt_20}\t#{fastq_stats.percentage_compared_to_raw}"
|
44
60
|
end
|
45
61
|
end
|
46
62
|
output_file.close
|
data/lib/miseq_run_stats.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
module MiseqRunStats
|
2
2
|
require 'nokogiri'
|
3
3
|
ResequencingRunStats = Struct.new("ResequencingRunStats", :number_of_bases, :number_of_clusters, :sample_stats)
|
4
|
-
|
4
|
+
ResequencingSampleStats = Struct.new("ResequencingSampleStats", :sample_name, :number_of_clusters, :number_of_forward_reads_aligned, :number_of_reverse_reads_aligned, :coverage, :number_of_snps, :fastq_stats)
|
5
|
+
AssemblyRunStats = Struct.new("AssemblyRunStats", :number_of_bases, :number_of_clusters, :sample_stats)
|
6
|
+
AssemblySampleStats = Struct.new("AssemblySampleStats", :sample_name, :number_of_clusters, :number_of_contigs, :mean_contig_size, :n50, :number_of_bases, :fastq_stats)
|
5
7
|
def parse_resequencing_run_stats(xml_file, original_sample_names = nil)
|
6
8
|
xml = Nokogiri::XML(File.read(xml_file))
|
7
9
|
resequencing_run_stats = ResequencingRunStats.new
|
@@ -16,7 +18,7 @@ module MiseqRunStats
|
|
16
18
|
sample_name = summarised_samples_stats.search('SampleName').text
|
17
19
|
sample_name = original_sample_names.select{|original_sample_name| sample_name =~ /#{original_sample_name}/}.first unless original_sample_names.nil? # alter sample name to original sample name if supplies as an array
|
18
20
|
|
19
|
-
resequencing_run_stats.sample_stats[sample_name] =
|
21
|
+
resequencing_run_stats.sample_stats[sample_name] = ResequencingSampleStats.new
|
20
22
|
resequencing_run_stats.sample_stats[sample_name].sample_name = sample_name
|
21
23
|
resequencing_run_stats.sample_stats[sample_name].number_of_clusters = summarised_samples_stats.search('NumberOfClustersPF').text
|
22
24
|
resequencing_run_stats.sample_stats[sample_name].number_of_forward_reads_aligned = summarised_samples_stats.search('ClustersAlignedR1').text
|
@@ -26,4 +28,41 @@ module MiseqRunStats
|
|
26
28
|
end
|
27
29
|
return resequencing_run_stats
|
28
30
|
end
|
31
|
+
def parse_assembly_run_stats(xml_file, original_sample_names = nil)
|
32
|
+
xml = Nokogiri::XML(File.read(xml_file))
|
33
|
+
assembly_run_stats = AssemblyRunStats.new
|
34
|
+
|
35
|
+
xml.search('//RunStats').each do |run_stats|
|
36
|
+
assembly_run_stats.number_of_bases = run_stats.search('YieldInBasesPF').text.to_f/1000000000
|
37
|
+
assembly_run_stats.number_of_clusters = run_stats.search('NumberOfClustersPF').text.to_i
|
38
|
+
end
|
39
|
+
|
40
|
+
# get un-named contig data
|
41
|
+
assembly_stats = Array.new
|
42
|
+
xml.search('//AssemblyStatistics').each do |assembly_sample_stats|
|
43
|
+
number_of_contigs = assembly_sample_stats.search('NumberOfContigs').text.to_i
|
44
|
+
mean_contig_size = assembly_sample_stats.search('MeanContigLength').text.to_f.to_i
|
45
|
+
n50 = assembly_sample_stats.search('N50').text.to_i
|
46
|
+
number_of_bases = assembly_sample_stats.search('BaseCount').text.to_i
|
47
|
+
assembly_stats << {:number_of_contigs => number_of_contigs, :mean_contig_size => mean_contig_size, :n50 => n50, :number_of_bases => number_of_bases}
|
48
|
+
end
|
49
|
+
|
50
|
+
assembly_run_stats.sample_stats = Hash.new
|
51
|
+
xml.search('//SampleStatistics').each do |sample_stats|
|
52
|
+
sample_name = sample_stats.search('SampleName').text
|
53
|
+
sample_name = original_sample_names.select{|original_sample_name| sample_name =~ /#{original_sample_name}/}.first unless original_sample_names.nil? # alter sample name to original sample name if supplies as an array
|
54
|
+
next if sample_name.nil?
|
55
|
+
|
56
|
+
assembly_run_stats.sample_stats[sample_name] = AssemblySampleStats.new
|
57
|
+
assembly_run_stats.sample_stats[sample_name].sample_name = sample_name
|
58
|
+
assembly_run_stats.sample_stats[sample_name].number_of_clusters = sample_stats.search('NumberOfClustersPF').text
|
59
|
+
assembly_sample_stats = assembly_stats.shift
|
60
|
+
assembly_run_stats.sample_stats[sample_name].number_of_contigs = assembly_sample_stats[:number_of_contigs]
|
61
|
+
assembly_run_stats.sample_stats[sample_name].mean_contig_size = assembly_sample_stats[:mean_contig_size]
|
62
|
+
assembly_run_stats.sample_stats[sample_name].n50 = assembly_sample_stats[:n50]
|
63
|
+
assembly_run_stats.sample_stats[sample_name].number_of_bases = assembly_sample_stats[:number_of_bases]
|
64
|
+
|
65
|
+
end
|
66
|
+
return assembly_run_stats
|
67
|
+
end
|
29
68
|
end
|
data/lib/trim_and_correct.rb
CHANGED
@@ -23,29 +23,37 @@ def trim_and_correct_fastqs(sample_map, directory, forward_reads_suffix, forward
|
|
23
23
|
`#{quake_path} -f quake_file_list.txt -k 15 -q #{quality_scale}`
|
24
24
|
end
|
25
25
|
sample_map.each do |sample_file_prefix, sample_name|
|
26
|
-
|
26
|
+
if File.exists?("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
|
27
|
+
`perl /Volumes/NGS2_DataRAID/projects/MRSA/scripts/fastq-remove-orphans.pl -1 paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension} -2 paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}`
|
28
|
+
end
|
27
29
|
end
|
28
30
|
|
29
31
|
# cleanup and rename files
|
32
|
+
|
30
33
|
sample_map.each do |sample_file_prefix, sample_name|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
34
|
+
File.delete("#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
|
35
|
+
File.delete("#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}")
|
36
|
+
File.delete("orphaned_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
|
37
|
+
File.delete("orphaned_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}")
|
38
|
+
if File.exists?("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
|
39
|
+
File.delete("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
|
40
|
+
File.delete("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}")
|
41
|
+
File.delete("error_model.paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.txt")
|
42
|
+
File.delete("error_model.paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.txt")
|
43
|
+
File.delete("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.stats.txt")
|
44
|
+
File.delete("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor_single.#{forward_reads_file_extension}")
|
45
|
+
File.delete("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.stats.txt")
|
46
|
+
File.delete("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor_single.#{forward_reads_file_extension}")
|
47
|
+
File.delete("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
|
48
|
+
File.delete("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}")
|
49
|
+
File.delete("orphaned_paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
|
50
|
+
File.delete("orphaned_paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}")
|
51
|
+
File.rename("paired_paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}", "#{sample_file_prefix}#{forward_reads_suffix}.trimmed.cor.#{forward_reads_file_extension}")
|
52
|
+
File.rename("paired_paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}", "#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.cor.#{reverse_reads_file_extension}")
|
53
|
+
else
|
54
|
+
File.rename("paired_#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}", "#{sample_file_prefix}#{forward_reads_suffix}.trimmed.#{forward_reads_file_extension}")
|
55
|
+
File.rename("paired_#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}", "#{sample_file_prefix}#{reverse_reads_suffix}.trimmed.#{reverse_reads_file_extension}")
|
56
|
+
end
|
49
57
|
end
|
50
58
|
end
|
51
59
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fastq-factory
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: trollop
|
@@ -165,7 +165,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
165
165
|
version: '0'
|
166
166
|
segments:
|
167
167
|
- 0
|
168
|
-
hash:
|
168
|
+
hash: 2435527591364603339
|
169
169
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
170
170
|
none: false
|
171
171
|
requirements:
|