bio-polyploid-tools 0.1.0 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +4 -3
- data/Gemfile.lock +8 -8
- data/README.md +45 -0
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -7
- data/bin/count_variations.rb +1 -1
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +1 -1
- data/bin/polymarker.rb +2 -2
- data/bin/snps_between_bams.rb +37 -7
- data/bio-polyploid-tools.gemspec +17 -13
- data/lib/bio/BFRTools.rb +27 -261
- data/lib/bio/BIOExtensions.rb +0 -124
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +1 -1
- data/lib/bio/PolyploidTools/ExonContainer.rb +6 -5
- data/lib/bio/PolyploidTools/Marker.rb +2 -2
- data/lib/bio/PolyploidTools/SNP.rb +5 -4
- data/lib/bio/db/exonerate.rb +1 -1
- data/test/test_bfr.rb +101 -9
- metadata +28 -12
- data/lib/bio/SAMToolsExtensions.rb +0 -284
- data/lib/bio/db/fastadb.rb +0 -164
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 019bf8dc15f35de0be9a55567b8041f9b98ac326
|
4
|
+
data.tar.gz: 3e0a76bbefead5c5284c64a01b36645748a70098
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98e2d6c023ee8d89014efe65da619f0a98808c1540c3773aaef901de9f5c2338a3cc4645bdee1a3cdc430d525587c27d576d1f19e2ac8e59d7724a6efaac5901
|
7
|
+
data.tar.gz: 3d09c9a1972b7538eb160ee89786f9bdd3f8c52fec554da110222241767e7a17f6efd7f0f42a219fb06cd3689037fbee2ec88eeae0cb94e40333a5c491259421
|
data/Gemfile
CHANGED
@@ -2,12 +2,13 @@ source "http://rubygems.org"
|
|
2
2
|
# Add dependencies required to use your gem here.
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
|
-
|
6
|
-
gem "bio
|
5
|
+
|
6
|
+
gem "bio", ">= 1.4.3"
|
7
|
+
gem "bio-samtools", ">= 2.0.3"
|
7
8
|
gem "rake"
|
8
9
|
gem "jeweler"
|
9
10
|
|
10
|
-
|
11
|
+
gem "systemu", ">=2.5.2"
|
11
12
|
|
12
13
|
group :development do
|
13
14
|
# gem "shoulda", ">= 0"
|
data/Gemfile.lock
CHANGED
@@ -3,17 +3,16 @@ GEM
|
|
3
3
|
specs:
|
4
4
|
addressable (2.3.6)
|
5
5
|
atomic (1.1.16)
|
6
|
-
bio (1.4.
|
7
|
-
bio-samtools (0.
|
6
|
+
bio (1.4.3.0001)
|
7
|
+
bio-samtools (2.0.3)
|
8
8
|
bio (>= 1.4.2)
|
9
|
-
|
10
|
-
|
9
|
+
bio-svgenes (>= 0.4.1)
|
10
|
+
bio-svgenes (0.4.1)
|
11
11
|
builder (3.2.2)
|
12
12
|
descendants_tracker (0.0.4)
|
13
13
|
thread_safe (~> 0.3, >= 0.3.1)
|
14
14
|
faraday (0.9.0)
|
15
15
|
multipart-post (>= 1.2, < 3)
|
16
|
-
ffi (1.9.3)
|
17
16
|
git (1.2.6)
|
18
17
|
github_api (0.11.3)
|
19
18
|
addressable (~> 2.3)
|
@@ -53,7 +52,7 @@ GEM
|
|
53
52
|
rake (10.2.2)
|
54
53
|
rdoc (4.1.1)
|
55
54
|
json (~> 1.4)
|
56
|
-
systemu (2.6.
|
55
|
+
systemu (2.6.4)
|
57
56
|
thread_safe (0.3.1)
|
58
57
|
atomic (>= 1.1.7, < 2)
|
59
58
|
|
@@ -61,7 +60,8 @@ PLATFORMS
|
|
61
60
|
ruby
|
62
61
|
|
63
62
|
DEPENDENCIES
|
64
|
-
bio (
|
65
|
-
bio-samtools (
|
63
|
+
bio (>= 1.4.3)
|
64
|
+
bio-samtools (>= 2.0.3)
|
66
65
|
jeweler
|
67
66
|
rake
|
67
|
+
systemu (>= 2.5.2)
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
bio-polyploid-tools
|
2
|
+
===================
|
3
|
+
|
4
|
+
Introduction
|
5
|
+
-------------
|
6
|
+
These tools are designed to deal with polyploid wheat. The first tool designs KASP primers,
|
7
|
+
making them as specific as possible.
|
8
|
+
|
9
|
+
|
10
|
+
Installation
|
11
|
+
------------
|
12
|
+
'gem install bio-polyploid-tools'
|
13
|
+
|
14
|
+
You need to have in your $PATH the following programs:
|
15
|
+
* [MAFFT](http://mafft.cbrc.jp/alignment/software/)
|
16
|
+
* [primer3](http://primer3.sourceforge.net/releases.php)
|
17
|
+
* [exonerate](http://www.ebi.ac.uk/~guy/exonerate/)
|
18
|
+
|
19
|
+
The code has been developed on ruby 2.1.0, but it should work on 1.9.3 and above.
|
20
|
+
|
21
|
+
|
22
|
+
Polymarker
|
23
|
+
----------
|
24
|
+
|
25
|
+
To run polymarker with the CSS wheat contigs, you need to unzip the
|
26
|
+
reference file [Triticum_aestivum.IWGSP1.22.dna_rm.genome.fa.gz](ftp://ftp.ensemblgenomes.org/pub/release-22/plants/fasta/triticum_aestivum/dna/).
|
27
|
+
|
28
|
+
polymarker.rb --contigs Triticum_aestivum.IWGSP1.22.dna_rm.genome.fa --marker_list snp_list.csv --output output_folder
|
29
|
+
|
30
|
+
The snp_list file must follow the convention
|
31
|
+
<ID>,<Chromosome>,<SEQUENCE>
|
32
|
+
with the SNP inside the sequence in the format [A/T]. As a reference, look at test/data/short_primer_design_test.csv
|
33
|
+
|
34
|
+
Notes
|
35
|
+
-----
|
36
|
+
|
37
|
+
* If the SNP is in a gap in the alignment to the chromosomes, it is ignored.
|
38
|
+
|
39
|
+
BUG: Blocks with NNNs are picked and treated as semi-specific.
|
40
|
+
BUG: If the name of the reference has a space, the ID is not chopped. ">gene_1 (G12A)" should be treated as ">gene_1".
|
41
|
+
TODO: If reading from a reference file, only get one reference to align when the region is queried several times
|
42
|
+
TODO: Add a parameter file to configure the alignments.
|
43
|
+
TODO: Produce primers for products of different sizes
|
44
|
+
|
45
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.3
|
data/bin/bfr.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
1
2
|
require 'rubygems'
|
2
3
|
#require 'extensions/all'
|
3
4
|
require 'bio-samtools'
|
@@ -70,18 +71,12 @@ chunk_size = options[:chunk_size]
|
|
70
71
|
output_filename = options[:output_filename]
|
71
72
|
stats_file = options[:stats_file]
|
72
73
|
|
73
|
-
|
74
|
-
#reference = ARGV[6]
|
75
|
-
|
76
|
-
|
77
74
|
|
78
75
|
min = chunk * chunk_size
|
79
76
|
max = min + chunk_size
|
80
77
|
|
81
78
|
|
82
|
-
#AvocetS
|
83
79
|
parental_1=options[:parent_1]
|
84
|
-
#AvocetS (Yr15)
|
85
80
|
parental_2=options[:parent_2]
|
86
81
|
|
87
82
|
|
@@ -89,7 +84,7 @@ bulk_1 = options[:bulk_1]
|
|
89
84
|
bulk_2 = options[:bulk_2]
|
90
85
|
|
91
86
|
|
92
|
-
fasta_db = Bio::DB::Fasta::FastaFile.new(reference)
|
87
|
+
fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
|
93
88
|
fasta_db.load_fai_entries
|
94
89
|
|
95
90
|
|
data/bin/count_variations.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
|
4
|
+
found_cointigs = Set.new
|
5
|
+
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model, :chunk=>chunk, :total_chunks=>}) do |aln|
|
6
|
+
if aln.identity > min_identity
|
7
|
+
exo_f.puts aln.line
|
8
|
+
unless found_cointigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
|
9
|
+
found_cointigs.add(aln.target_id)
|
10
|
+
entry = fasta_file.index.region_for_entry(aln.target_id)
|
11
|
+
raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
|
12
|
+
region = entry.get_full_region
|
13
|
+
seq = fasta_file.fetch_sequence(region)
|
14
|
+
contigs_f.puts(">#{aln.target_id}\n#{seq}")
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/bin/hexaploid_primers.rb
CHANGED
@@ -43,7 +43,7 @@ snps = Array.new
|
|
43
43
|
#0. Load the fasta index
|
44
44
|
fasta_reference_db = nil
|
45
45
|
if fasta_reference
|
46
|
-
fasta_reference_db = Bio::DB::Fasta::FastaFile.new(fasta_reference)
|
46
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
47
47
|
fasta_reference_db.load_fai_entries
|
48
48
|
p "Fasta reference: #{fasta_reference}"
|
49
49
|
end
|
@@ -99,7 +99,7 @@ Dir.foreach(path_to_contigs) do |filename |
|
|
99
99
|
puts filename
|
100
100
|
target="#{path_to_contigs}/#{filename}"
|
101
101
|
|
102
|
-
fasta_file = Bio::DB::Fasta::FastaFile.new(target)
|
102
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
|
103
103
|
fasta_file.load_fai_entries
|
104
104
|
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
|
105
105
|
if aln.identity > min_identity
|
data/bin/homokaryot_primers.rb
CHANGED
@@ -82,7 +82,7 @@ snps = Array.new
|
|
82
82
|
#0. Load the fasta index
|
83
83
|
fasta_reference_db = nil
|
84
84
|
if reference_file
|
85
|
-
fasta_reference_db = Bio::DB::Fasta::FastaFile.new(reference_file)
|
85
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference_file})
|
86
86
|
fasta_reference_db.load_fai_entries
|
87
87
|
p "Fasta reference: #{reference_file}"
|
88
88
|
end
|
data/bin/polymarker.rb
CHANGED
@@ -87,7 +87,7 @@ snps = Array.new
|
|
87
87
|
#0. Load the fasta index
|
88
88
|
fasta_reference_db = nil
|
89
89
|
if fasta_reference
|
90
|
-
fasta_reference_db = Bio::DB::Fasta::FastaFile.new(fasta_reference)
|
90
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
91
91
|
fasta_reference_db.load_fai_entries
|
92
92
|
p "Fasta reference: #{fasta_reference}"
|
93
93
|
end
|
@@ -141,7 +141,7 @@ filename=path_to_contigs
|
|
141
141
|
puts filename
|
142
142
|
target=filename
|
143
143
|
|
144
|
-
fasta_file = Bio::DB::Fasta::FastaFile.new(target)
|
144
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
|
145
145
|
fasta_file.load_fai_entries
|
146
146
|
|
147
147
|
found_cointigs = Set.new
|
data/bin/snps_between_bams.rb
CHANGED
@@ -15,7 +15,7 @@ require path
|
|
15
15
|
|
16
16
|
|
17
17
|
|
18
|
-
fasta_db = Bio::DB::Fasta::FastaFile.new(
|
18
|
+
fasta_db = Bio::DB::Fasta::FastaFile.new(:fasta=>ARGV[0])
|
19
19
|
fasta_db.load_fai_entries
|
20
20
|
bam1 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[1]})
|
21
21
|
bam2 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]})
|
@@ -23,7 +23,7 @@ bam2 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]})
|
|
23
23
|
|
24
24
|
output_prefix = ARGV[3]
|
25
25
|
|
26
|
-
block_size=
|
26
|
+
block_size=1000
|
27
27
|
|
28
28
|
min_cov = ARGV[4].to_i ? ARGV[4].to_i : 10
|
29
29
|
chunk = ARGV[5].to_i
|
@@ -54,6 +54,38 @@ fasta_db.index.entries.each do | r |
|
|
54
54
|
|
55
55
|
|
56
56
|
begin
|
57
|
+
<<<<<<< HEAD
|
58
|
+
reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
59
|
+
reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
60
|
+
cons_1 = reg_a.consensus
|
61
|
+
cons_2 = reg_b.consensus
|
62
|
+
|
63
|
+
|
64
|
+
snps_1 = cons_1.count_ambiguities
|
65
|
+
snps_2 = cons_2.count_ambiguities
|
66
|
+
|
67
|
+
called_1 = reg_a.called
|
68
|
+
called_2 = reg_b.called
|
69
|
+
|
70
|
+
snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
|
71
|
+
|
72
|
+
snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
|
73
|
+
snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
|
74
|
+
snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
|
75
|
+
|
76
|
+
hist_1[snps_per_1k_1.to_i] += 1
|
77
|
+
hist_2[snps_per_1k_2.to_i] += 1
|
78
|
+
|
79
|
+
table_file.print "#{r.id}\t#{region.size}\t"
|
80
|
+
table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
|
81
|
+
table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
|
82
|
+
table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
|
83
|
+
fasta_file.puts ">#{r.id}_1"
|
84
|
+
fasta_file.puts "#{cons_1}"
|
85
|
+
fasta_file.puts ">#{r.id}_2"
|
86
|
+
fasta_file.puts "#{cons_2}"
|
87
|
+
|
88
|
+
=======
|
57
89
|
|
58
90
|
cons_1 = bam1.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
|
59
91
|
cons_2 = bam2.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
|
@@ -62,13 +94,10 @@ fasta_db.index.entries.each do | r |
|
|
62
94
|
snps_1 = cons_1.count_ambiguities
|
63
95
|
snps_2 = cons_2.count_ambiguities
|
64
96
|
|
65
|
-
called_1 = cons_1.upper_case_count
|
66
|
-
called_2 = cons_2.upper_case_count
|
67
|
-
|
68
97
|
snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
|
69
98
|
|
70
|
-
snps_per_1k_1 = (block_size * snps_1.to_f ) /
|
71
|
-
snps_per_1k_2 = (block_size * snps_2.to_f ) /
|
99
|
+
snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
|
100
|
+
snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
|
72
101
|
snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
|
73
102
|
|
74
103
|
hist_1[snps_per_1k_1.to_i] += 1
|
@@ -83,6 +112,7 @@ fasta_db.index.entries.each do | r |
|
|
83
112
|
fasta_file.puts ">#{r.id}_2"
|
84
113
|
fasta_file.puts "#{cons_2}"
|
85
114
|
end
|
115
|
+
>>>>>>> 1b60bd09fdb1b087d6cb53c643ff36e536efe4a3
|
86
116
|
rescue Exception => e
|
87
117
|
$stderr.puts "Unable to process #{region}: #{e.to_s}"
|
88
118
|
end
|
data/bio-polyploid-tools.gemspec
CHANGED
@@ -2,32 +2,35 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: bio-polyploid-tools 0.
|
5
|
+
# stub: bio-polyploid-tools 0.2.3 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "bio-polyploid-tools"
|
9
|
-
s.version = "0.
|
9
|
+
s.version = "0.2.3"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
13
13
|
s.authors = ["Ricardo H. Ramirez-Gonzalez"]
|
14
|
-
s.date = "2014-
|
14
|
+
s.date = "2014-04-27"
|
15
15
|
s.description = "Repository of tools developed in TGAC and Crop Genetics in JIC to work with polyploid wheat"
|
16
16
|
s.email = "ricardo.ramirez-gonzalez@tgac.ac.uk"
|
17
|
-
s.executables = ["bfr.rb", "count_variations.rb", "filter_blat_by_target_coverage.rb", "find_best_blat_hit.rb", "hexaploid_primers.rb", "homokaryot_primers.rb", "map_markers_to_contigs.rb", "markers_in_region.rb", "polymarker.rb", "snps_between_bams.rb"]
|
17
|
+
s.executables = ["bfr.rb", "count_variations.rb", "filter_blat_by_target_coverage.rb", "find_best_blat_hit.rb", "find_best_exonerate.rb", "hexaploid_primers.rb", "homokaryot_primers.rb", "map_markers_to_contigs.rb", "markers_in_region.rb", "polymarker.rb", "snps_between_bams.rb"]
|
18
18
|
s.extra_rdoc_files = [
|
19
|
-
"README"
|
19
|
+
"README",
|
20
|
+
"README.md"
|
20
21
|
]
|
21
22
|
s.files = [
|
22
23
|
"Gemfile",
|
23
24
|
"Gemfile.lock",
|
24
25
|
"README",
|
26
|
+
"README.md",
|
25
27
|
"Rakefile",
|
26
28
|
"VERSION",
|
27
29
|
"bin/bfr.rb",
|
28
30
|
"bin/count_variations.rb",
|
29
31
|
"bin/filter_blat_by_target_coverage.rb",
|
30
32
|
"bin/find_best_blat_hit.rb",
|
33
|
+
"bin/find_best_exonerate.rb",
|
31
34
|
"bin/hexaploid_primers.rb",
|
32
35
|
"bin/homokaryot_primers.rb",
|
33
36
|
"bin/map_markers_to_contigs.rb",
|
@@ -78,9 +81,7 @@ Gem::Specification.new do |s|
|
|
78
81
|
"lib/bio/PolyploidTools/PrimerRegion.rb",
|
79
82
|
"lib/bio/PolyploidTools/SNP.rb",
|
80
83
|
"lib/bio/PolyploidTools/SNPSequence.rb",
|
81
|
-
"lib/bio/SAMToolsExtensions.rb",
|
82
84
|
"lib/bio/db/exonerate.rb",
|
83
|
-
"lib/bio/db/fastadb.rb",
|
84
85
|
"lib/bio/db/primer3.rb",
|
85
86
|
"lib/bioruby-polyploid-tools.rb",
|
86
87
|
"test/data/BS00068396_51.fa",
|
@@ -119,21 +120,24 @@ Gem::Specification.new do |s|
|
|
119
120
|
s.specification_version = 4
|
120
121
|
|
121
122
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
122
|
-
s.add_runtime_dependency(%q<bio>, ["
|
123
|
-
s.add_runtime_dependency(%q<bio-samtools>, ["
|
123
|
+
s.add_runtime_dependency(%q<bio>, [">= 1.4.3"])
|
124
|
+
s.add_runtime_dependency(%q<bio-samtools>, [">= 2.0.3"])
|
124
125
|
s.add_runtime_dependency(%q<rake>, [">= 0"])
|
125
126
|
s.add_runtime_dependency(%q<jeweler>, [">= 0"])
|
127
|
+
s.add_runtime_dependency(%q<systemu>, [">= 2.5.2"])
|
126
128
|
else
|
127
|
-
s.add_dependency(%q<bio>, ["
|
128
|
-
s.add_dependency(%q<bio-samtools>, ["
|
129
|
+
s.add_dependency(%q<bio>, [">= 1.4.3"])
|
130
|
+
s.add_dependency(%q<bio-samtools>, [">= 2.0.3"])
|
129
131
|
s.add_dependency(%q<rake>, [">= 0"])
|
130
132
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
133
|
+
s.add_dependency(%q<systemu>, [">= 2.5.2"])
|
131
134
|
end
|
132
135
|
else
|
133
|
-
s.add_dependency(%q<bio>, ["
|
134
|
-
s.add_dependency(%q<bio-samtools>, ["
|
136
|
+
s.add_dependency(%q<bio>, [">= 1.4.3"])
|
137
|
+
s.add_dependency(%q<bio-samtools>, [">= 2.0.3"])
|
135
138
|
s.add_dependency(%q<rake>, [">= 0"])
|
136
139
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
140
|
+
s.add_dependency(%q<systemu>, [">= 2.5.2"])
|
137
141
|
end
|
138
142
|
end
|
139
143
|
|
data/lib/bio/BFRTools.rb
CHANGED
@@ -5,252 +5,16 @@ require 'rubygems'
|
|
5
5
|
#require 'bio/db/vcf'
|
6
6
|
require 'pathname'
|
7
7
|
#require_relative 'BIOExtensions.rb'
|
8
|
-
|
8
|
+
|
9
9
|
|
10
10
|
require 'bio'
|
11
|
+
require 'bio-samtools'
|
12
|
+
|
11
13
|
require "set"
|
12
14
|
require 'systemu'
|
13
15
|
require 'json'
|
14
16
|
#require 'strmask'
|
15
17
|
|
16
|
-
=begin
|
17
|
-
|
18
|
-
Extends the methods to be able to calculate the BFR and a consensus from the pileup
|
19
|
-
|
20
|
-
=end
|
21
|
-
|
22
|
-
class Bio::DB::Pileup
|
23
|
-
|
24
|
-
#attr_accessor :minumum_ratio_for_iup_consensus
|
25
|
-
#@minumum_ratio_for_iup_consensus = 0.20
|
26
|
-
|
27
|
-
#Returns a hash with the count of bases
|
28
|
-
|
29
|
-
def bases
|
30
|
-
return @bases if @bases
|
31
|
-
@bases = self.non_refs
|
32
|
-
#puts self.ref_count
|
33
|
-
@bases[self.ref_base.upcase.to_sym] = self.ref_count
|
34
|
-
@bases
|
35
|
-
end
|
36
|
-
|
37
|
-
def base_coverage
|
38
|
-
total = 0
|
39
|
-
@bases.each do |k,v|
|
40
|
-
total += v
|
41
|
-
end
|
42
|
-
total
|
43
|
-
end
|
44
|
-
|
45
|
-
def base_ratios
|
46
|
-
return @base_ratios if @base_ratios
|
47
|
-
bases = self.bases
|
48
|
-
@base_ratios = Hash.new
|
49
|
-
bases.each do |k,v|
|
50
|
-
@base_ratios[k] = v.to_f/self.base_coverage.to_f
|
51
|
-
end
|
52
|
-
@base_ratios
|
53
|
-
end
|
54
|
-
|
55
|
-
# returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
|
56
|
-
def consensus_iuap(minumum_ratio_for_iup_consensus)
|
57
|
-
minumum_ratio_for_iup_consensus
|
58
|
-
if @consensus_iuap.nil?
|
59
|
-
@consensus_iuap = self.ref_base.downcase
|
60
|
-
bases = self.bases
|
61
|
-
tmp = String.new
|
62
|
-
bases.each do |k,v|
|
63
|
-
tmp << k[0].to_s if v/self.coverage > minumum_ratio_for_iup_consensus
|
64
|
-
end
|
65
|
-
if tmp.length > 0
|
66
|
-
@consensus_iuap = Bio::NucleicAcid.to_IUAPC(tmp)
|
67
|
-
end
|
68
|
-
end
|
69
|
-
@consensus_iuap
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
class Bio::DB::Fasta::Region
|
74
|
-
attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases
|
75
|
-
|
76
|
-
#TODO: Debug, as it hasnt been tested in the actual code.
|
77
|
-
def base_ratios_for_base(base)
|
78
|
-
@all_ratios = Hash.new unless @all_ratios
|
79
|
-
unless @all_ratios[base]
|
80
|
-
ratios = Array.new
|
81
|
-
for i in (0..region.size-1)
|
82
|
-
ratios << @base_ratios[i][base]
|
83
|
-
end
|
84
|
-
@all_ratios[base] = ratios
|
85
|
-
end
|
86
|
-
@all_ratios[base]
|
87
|
-
end
|
88
|
-
|
89
|
-
end
|
90
|
-
|
91
|
-
class Bio::DB::Sam::SAMException < RuntimeError
|
92
|
-
|
93
|
-
end
|
94
|
-
|
95
|
-
class Bio::DB::Sam
|
96
|
-
|
97
|
-
|
98
|
-
attr_accessor :minumum_ratio_for_iup_consensus
|
99
|
-
attr_reader :cached_regions
|
100
|
-
#attr_accessor :pileup_cache
|
101
|
-
@minumum_ratio_for_iup_consensus = 0.20
|
102
|
-
|
103
|
-
|
104
|
-
#Same as mpilup, but it caches the pileup, so if you want several operations on the same set of regions
|
105
|
-
#the pile for different operations, it won't execute the mpilup command several times
|
106
|
-
#Whenever you finish using a region, call mpileup_clear_cache to free the cache
|
107
|
-
#The argument Region is required, as it will be the key for the underlying hash.
|
108
|
-
#We asume that the options are constant. If they are not, the cache mechanism may not be consistent.
|
109
|
-
#
|
110
|
-
#TODO: It may be good to load partially the pileup
|
111
|
-
def mpileup_cached (opts={})
|
112
|
-
raise SAMException.new(), "A region must be provided" unless opts[:r] or opts[:region]
|
113
|
-
@pileup_cache = Hash.new unless @pileup_cache
|
114
|
-
@cached_regions = Hash.new unless @cached_regions
|
115
|
-
|
116
|
-
region = opts[:r] ? opts[:r] : opts[:region]
|
117
|
-
opts[:r] = "'#{region.to_s}'"
|
118
|
-
opts[:region] = "'#{region.to_s}'"
|
119
|
-
opts[:A] = true
|
120
|
-
#reg = region.class == Bio::DB::Fasta::Region ? region : Bio::DB::Fasta::Region.parse_region(region.to_s)
|
121
|
-
|
122
|
-
unless @cached_regions[region.to_s]
|
123
|
-
@cached_regions[region.to_s] = Bio::DB::Fasta::Region.parse_region(region.to_s)
|
124
|
-
tmp = Array.new
|
125
|
-
@cached_regions[region.to_s].pileup = tmp
|
126
|
-
#puts "Loading #{region.to_s}"
|
127
|
-
mpileup(opts) do | pile |
|
128
|
-
# puts pile
|
129
|
-
tmp << pile
|
130
|
-
yield pile
|
131
|
-
end
|
132
|
-
else
|
133
|
-
# puts "Loaded, reruning #{region.to_s}"
|
134
|
-
@cached_regions.pileup[region.to_s] .each do | pile |
|
135
|
-
yield pile
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
#Clears the pileup cache. If a region is passed as argument, just the specified region is removed
|
141
|
-
#If no region is passed, the hash is emptied
|
142
|
-
def mpileup_clear_cache (region)
|
143
|
-
return unless @cached_regions
|
144
|
-
if region
|
145
|
-
@cached_regions[region.to_s] = nil
|
146
|
-
else
|
147
|
-
@cached_regions.clear
|
148
|
-
end
|
149
|
-
end
|
150
|
-
|
151
|
-
#Gets the coverage of a region from a pileup.
|
152
|
-
def average_coverage_from_pileup(opts={})
|
153
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
154
|
-
region = opts[:region]
|
155
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
156
|
-
@cached_regions[region].average_coverage
|
157
|
-
end
|
158
|
-
|
159
|
-
#
|
160
|
-
def coverages_from_pileup(opts={})
|
161
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
162
|
-
region = opts[:region]
|
163
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
164
|
-
@cached_regions[region].coverages
|
165
|
-
end
|
166
|
-
|
167
|
-
def consensus_with_ambiguities(opts={})
|
168
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
169
|
-
region = opts[:region]
|
170
|
-
# p "consensus with ambiguities for: " << opts[:region]
|
171
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
172
|
-
@cached_regions[region].consensus
|
173
|
-
end
|
174
|
-
|
175
|
-
def calculate_stats_from_pile(opts={})
|
176
|
-
min_cov = opts[:min_cov] ? opts[:min_cov] : 20
|
177
|
-
|
178
|
-
|
179
|
-
opts[:region] = Bio::DB::Fasta::Region.parse_region( opts[:region] .to_s) unless opts[:region].class == Bio::DB::Fasta::Region
|
180
|
-
region = opts[:region]
|
181
|
-
reference = self.fetch_reference(region.entry, region.start, region.end).downcase
|
182
|
-
# p "calculationg from pile..." << region.to_s
|
183
|
-
base_ratios = Array.new(region.size, BASE_COUNT_ZERO)
|
184
|
-
bases = Array.new(region.size, BASE_COUNT_ZERO)
|
185
|
-
coverages = Array.new(region.size, 0)
|
186
|
-
total_cov = 0
|
187
|
-
|
188
|
-
self.mpileup_cached(:region=>"#{region.to_s}") do | pile |
|
189
|
-
#puts pile
|
190
|
-
#puts pile.coverage
|
191
|
-
if pile.coverage > min_cov
|
192
|
-
base_ratios[pile.pos - region.start ] = pile.base_ratios
|
193
|
-
reference[pile.pos - region.start ] = pile.consensus_iuap(0.20)
|
194
|
-
coverages[pile.pos - region.start ] = pile.coverage.to_i
|
195
|
-
bases[pile.pos - region.start ] = pile.bases
|
196
|
-
end
|
197
|
-
total_cov += pile.coverage
|
198
|
-
end
|
199
|
-
|
200
|
-
region = @cached_regions[region.to_s]
|
201
|
-
region.coverages = coverages
|
202
|
-
region.base_ratios = base_ratios
|
203
|
-
region.consensus = reference
|
204
|
-
|
205
|
-
region.average_coverage = total_cov.to_f/region.size.to_f
|
206
|
-
region.bases = bases
|
207
|
-
region
|
208
|
-
end
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
|
213
|
-
|
214
|
-
#Gets an array with the proportions of the bases in the region. If there is no coverage, a
|
215
|
-
def base_ratios_in_region(opts={})
|
216
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
217
|
-
region = opts[:region]
|
218
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
219
|
-
@cached_regions[region].base_ratios
|
220
|
-
end
|
221
|
-
|
222
|
-
#Gets an array with the bsaes count in the region. If there is no coverage, a
|
223
|
-
def bases_in_region(opts={})
|
224
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
225
|
-
region = opts[:region]
|
226
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
227
|
-
@cached_regions[region].bases
|
228
|
-
end
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
def extract_reads(opts={})
|
233
|
-
opts[:region] = Bio::DB::Fasta::Region.parse_region( opts[:region] .to_s) unless opts[:region].class == Bio::DB::Fasta::Region
|
234
|
-
fastq_filename = opts[:fastq]
|
235
|
-
fastq_file = opts[:fastq_file]
|
236
|
-
|
237
|
-
out = $stdout
|
238
|
-
|
239
|
-
print_fastq = Proc.new do |alignment|
|
240
|
-
out.puts "@#{alignment.qname}"
|
241
|
-
out.puts "#{alignment.seq}"
|
242
|
-
out.puts "+#{alignment.qname}"
|
243
|
-
out.puts "#{alignment.qual}"
|
244
|
-
end
|
245
|
-
|
246
|
-
fetch_with_function(chromosome, qstart, qstart+len, print_fastq)
|
247
|
-
|
248
|
-
|
249
|
-
end
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
end
|
254
18
|
|
255
19
|
module Bio::BFRTools
|
256
20
|
|
@@ -267,7 +31,7 @@ module Bio::BFRTools
|
|
267
31
|
BASES = [:A, :C, :G, :T]
|
268
32
|
#Sets the reference file
|
269
33
|
def reference(path)
|
270
|
-
@reference_db = Bio::DB::Fasta::FastaFile.new(path)
|
34
|
+
@reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
|
271
35
|
@reference_path = path
|
272
36
|
end
|
273
37
|
|
@@ -350,33 +114,35 @@ module Bio::BFRTools
|
|
350
114
|
self.entry = reg.entry
|
351
115
|
self.start = reg.start
|
352
116
|
self.end = reg.end
|
353
|
-
|
117
|
+
opts[:region] = reg
|
354
118
|
@container = opts[:container]
|
355
119
|
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
@parental_2_sequence = parental_2_sam.consensus_with_ambiguities(opts)
|
120
|
+
parental_1_reg = @container.parental_1_sam.fetch_region(opts)
|
121
|
+
parental_2_reg = @container.parental_2_sam.fetch_region(opts)
|
122
|
+
bulk_1_reg = @container.bulk_1_sam.fetch_region(opts)
|
123
|
+
bulk_2_reg = @container.bulk_2_sam.fetch_region(opts)
|
124
|
+
|
125
|
+
|
363
126
|
|
364
|
-
@
|
365
|
-
@
|
127
|
+
@parental_1_sequence = parental_1_reg.consensus
|
128
|
+
@parental_2_sequence = parental_2_reg.consensus
|
129
|
+
|
130
|
+
@bulk_1_sequence = bulk_1_reg.consensus
|
131
|
+
@bulk_2_sequence = bulk_2_reg.consensus
|
366
132
|
|
367
133
|
@snp_count = Container.snps_between( @parental_1_sequence , @parental_2_sequence )
|
368
134
|
|
369
|
-
@ratios_bulk_1 =
|
370
|
-
@ratios_bulk_2 =
|
135
|
+
@ratios_bulk_1 = bulk_1_reg.base_ratios
|
136
|
+
@ratios_bulk_2 = bulk_2_reg.base_ratios
|
371
137
|
|
372
|
-
@bases_bulk_1 =
|
373
|
-
@bases_bulk_2 =
|
138
|
+
@bases_bulk_1 = bulk_1_reg.bases
|
139
|
+
@bases_bulk_2 = bulk_2_reg.bases
|
374
140
|
|
375
|
-
@avg_cov_bulk_1 =
|
376
|
-
@avg_cov_bulk_2 =
|
141
|
+
@avg_cov_bulk_1 = bulk_1_reg.average_coverage
|
142
|
+
@avg_cov_bulk_2 = bulk_2_reg.average_coverage
|
377
143
|
|
378
|
-
@coverages_1 =
|
379
|
-
@coverages_2 =
|
144
|
+
@coverages_1 = bulk_1_reg.coverages
|
145
|
+
@coverages_2 = bulk_2_reg.coverages
|
380
146
|
|
381
147
|
end
|
382
148
|
|
@@ -472,7 +238,7 @@ module Bio::BFRTools
|
|
472
238
|
raise BFRToolsException.new ("The reference for the line should be :first or :second, but was " + reference.to_s )
|
473
239
|
end
|
474
240
|
|
475
|
-
relative_position = self.start + position
|
241
|
+
relative_position = self.start + position
|
476
242
|
|
477
243
|
bfr = bfrs[reference][base][position]
|
478
244
|
cov_1 = @coverages_1[position]
|
@@ -622,7 +388,7 @@ module Bio::BFRTools
|
|
622
388
|
end
|
623
389
|
|
624
390
|
def process_region(opts={})
|
625
|
-
opts = { :min_cov=>20, :max_snp_1kbp => 10 }.merge!(opts)
|
391
|
+
opts = { :min_cov=>20, :max_snp_1kbp => 10, :max_per=>0.20 }.merge!(opts)
|
626
392
|
|
627
393
|
@proccesed_regions += 1
|
628
394
|
output = opts[:output_file] ? opts[:output_file] : $stdout
|
@@ -675,7 +441,7 @@ module Bio::BFRTools
|
|
675
441
|
|
676
442
|
|
677
443
|
for informative in info
|
678
|
-
line = region.get_bfr_line(i
|
444
|
+
line = region.get_bfr_line(i, base, informative)
|
679
445
|
output.print line , "\n"
|
680
446
|
end
|
681
447
|
end
|