bio-polyploid-tools 0.1.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -3
- data/Gemfile.lock +8 -8
- data/README.md +45 -0
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -7
- data/bin/count_variations.rb +1 -1
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +1 -1
- data/bin/polymarker.rb +2 -2
- data/bin/snps_between_bams.rb +37 -7
- data/bio-polyploid-tools.gemspec +17 -13
- data/lib/bio/BFRTools.rb +27 -261
- data/lib/bio/BIOExtensions.rb +0 -124
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +1 -1
- data/lib/bio/PolyploidTools/ExonContainer.rb +6 -5
- data/lib/bio/PolyploidTools/Marker.rb +2 -2
- data/lib/bio/PolyploidTools/SNP.rb +5 -4
- data/lib/bio/db/exonerate.rb +1 -1
- data/test/test_bfr.rb +101 -9
- metadata +28 -12
- data/lib/bio/SAMToolsExtensions.rb +0 -284
- data/lib/bio/db/fastadb.rb +0 -164
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 019bf8dc15f35de0be9a55567b8041f9b98ac326
|
4
|
+
data.tar.gz: 3e0a76bbefead5c5284c64a01b36645748a70098
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98e2d6c023ee8d89014efe65da619f0a98808c1540c3773aaef901de9f5c2338a3cc4645bdee1a3cdc430d525587c27d576d1f19e2ac8e59d7724a6efaac5901
|
7
|
+
data.tar.gz: 3d09c9a1972b7538eb160ee89786f9bdd3f8c52fec554da110222241767e7a17f6efd7f0f42a219fb06cd3689037fbee2ec88eeae0cb94e40333a5c491259421
|
data/Gemfile
CHANGED
@@ -2,12 +2,13 @@ source "http://rubygems.org"
|
|
2
2
|
# Add dependencies required to use your gem here.
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
|
-
|
6
|
-
gem "bio
|
5
|
+
|
6
|
+
gem "bio", ">= 1.4.3"
|
7
|
+
gem "bio-samtools", ">= 2.0.3"
|
7
8
|
gem "rake"
|
8
9
|
gem "jeweler"
|
9
10
|
|
10
|
-
|
11
|
+
gem "systemu", ">=2.5.2"
|
11
12
|
|
12
13
|
group :development do
|
13
14
|
# gem "shoulda", ">= 0"
|
data/Gemfile.lock
CHANGED
@@ -3,17 +3,16 @@ GEM
|
|
3
3
|
specs:
|
4
4
|
addressable (2.3.6)
|
5
5
|
atomic (1.1.16)
|
6
|
-
bio (1.4.
|
7
|
-
bio-samtools (0.
|
6
|
+
bio (1.4.3.0001)
|
7
|
+
bio-samtools (2.0.3)
|
8
8
|
bio (>= 1.4.2)
|
9
|
-
|
10
|
-
|
9
|
+
bio-svgenes (>= 0.4.1)
|
10
|
+
bio-svgenes (0.4.1)
|
11
11
|
builder (3.2.2)
|
12
12
|
descendants_tracker (0.0.4)
|
13
13
|
thread_safe (~> 0.3, >= 0.3.1)
|
14
14
|
faraday (0.9.0)
|
15
15
|
multipart-post (>= 1.2, < 3)
|
16
|
-
ffi (1.9.3)
|
17
16
|
git (1.2.6)
|
18
17
|
github_api (0.11.3)
|
19
18
|
addressable (~> 2.3)
|
@@ -53,7 +52,7 @@ GEM
|
|
53
52
|
rake (10.2.2)
|
54
53
|
rdoc (4.1.1)
|
55
54
|
json (~> 1.4)
|
56
|
-
systemu (2.6.
|
55
|
+
systemu (2.6.4)
|
57
56
|
thread_safe (0.3.1)
|
58
57
|
atomic (>= 1.1.7, < 2)
|
59
58
|
|
@@ -61,7 +60,8 @@ PLATFORMS
|
|
61
60
|
ruby
|
62
61
|
|
63
62
|
DEPENDENCIES
|
64
|
-
bio (
|
65
|
-
bio-samtools (
|
63
|
+
bio (>= 1.4.3)
|
64
|
+
bio-samtools (>= 2.0.3)
|
66
65
|
jeweler
|
67
66
|
rake
|
67
|
+
systemu (>= 2.5.2)
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
bio-polyploid-tools
|
2
|
+
===================
|
3
|
+
|
4
|
+
Introduction
|
5
|
+
-------------
|
6
|
+
These tools are designed to deal with polyploid wheat. The first tool is to design KASP primers,
|
7
|
+
making them as specific as possible.
|
8
|
+
|
9
|
+
|
10
|
+
Installation
|
11
|
+
------------
|
12
|
+
'gem install bio-polyploid-tools'
|
13
|
+
|
14
|
+
You need to have in your $PATH the following programs:
|
15
|
+
* [MAFFT](http://mafft.cbrc.jp/alignment/software/)
|
16
|
+
* [primer3](http://primer3.sourceforge.net/releases.php)
|
17
|
+
* [exonerate](http://www.ebi.ac.uk/~guy/exonerate/)
|
18
|
+
|
19
|
+
The code has been developed on ruby 2.1.0, but it should work on 1.9.3 and above.
|
20
|
+
|
21
|
+
|
22
|
+
Polymarker
|
23
|
+
----------
|
24
|
+
|
25
|
+
To run polymarker with the CSS wheat contigs, you need to unzip the
|
26
|
+
reference file [Triticum_aestivum.IWGSP1.22.dna_rm.genome.fa.gz](ftp://ftp.ensemblgenomes.org/pub/release-22/plants/fasta/triticum_aestivum/dna/).
|
27
|
+
|
28
|
+
polymarker.rb --contigs Triticum_aestivum.IWGSP1.22.dna_rm.genome.fa --marker_list snp_list.csv --output output_folder
|
29
|
+
|
30
|
+
The snp_list file must follow the convention
|
31
|
+
`<ID>,<Chromosome>,<SEQUENCE>`
|
32
|
+
with the SNP inside the sequence in the format [A/T]. As a reference, look at test/data/short_primer_design_test.csv
|
33
|
+
|
34
|
+
Notes
|
35
|
+
-----
|
36
|
+
|
37
|
+
* If the SNP is in a gap in the alignment to the chromosomes, it is ignored.
|
38
|
+
|
39
|
+
BUG: Blocks with NNNs are picked and treated as semi-specific.
|
40
|
+
BUG: If the name of the reference has a space, the ID is not chopped. ">gene_1 (G12A)" should be treated as ">gene_1".
|
41
|
+
TODO: If reading from a reference file, only get one reference to align when the region is queried several times
|
42
|
+
TODO: Add a parameter file to configure the alignments.
|
43
|
+
TODO: Produce primers for products of different sizes
|
44
|
+
|
45
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.3
|
data/bin/bfr.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
1
2
|
require 'rubygems'
|
2
3
|
#require 'extensions/all'
|
3
4
|
require 'bio-samtools'
|
@@ -70,18 +71,12 @@ chunk_size = options[:chunk_size]
|
|
70
71
|
output_filename = options[:output_filename]
|
71
72
|
stats_file = options[:stats_file]
|
72
73
|
|
73
|
-
|
74
|
-
#reference = ARGV[6]
|
75
|
-
|
76
|
-
|
77
74
|
|
78
75
|
min = chunk * chunk_size
|
79
76
|
max = min + chunk_size
|
80
77
|
|
81
78
|
|
82
|
-
#AvocetS
|
83
79
|
parental_1=options[:parent_1]
|
84
|
-
#AvocetS (Yr15)
|
85
80
|
parental_2=options[:parent_2]
|
86
81
|
|
87
82
|
|
@@ -89,7 +84,7 @@ bulk_1 = options[:bulk_1]
|
|
89
84
|
bulk_2 = options[:bulk_2]
|
90
85
|
|
91
86
|
|
92
|
-
fasta_db = Bio::DB::Fasta::FastaFile.new(reference)
|
87
|
+
fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
|
93
88
|
fasta_db.load_fai_entries
|
94
89
|
|
95
90
|
|
data/bin/count_variations.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
|
4
|
+
found_cointigs = Set.new
|
5
|
+
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model, :chunk=>chunk, :total_chunks=>}) do |aln|
|
6
|
+
if aln.identity > min_identity
|
7
|
+
exo_f.puts aln.line
|
8
|
+
unless found_cointigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
|
9
|
+
found_cointigs.add(aln.target_id)
|
10
|
+
entry = fasta_file.index.region_for_entry(aln.target_id)
|
11
|
+
raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
|
12
|
+
region = entry.get_full_region
|
13
|
+
seq = fasta_file.fetch_sequence(region)
|
14
|
+
contigs_f.puts(">#{aln.target_id}\n#{seq}")
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/bin/hexaploid_primers.rb
CHANGED
@@ -43,7 +43,7 @@ snps = Array.new
|
|
43
43
|
#0. Load the fasta index
|
44
44
|
fasta_reference_db = nil
|
45
45
|
if fasta_reference
|
46
|
-
fasta_reference_db = Bio::DB::Fasta::FastaFile.new(fasta_reference)
|
46
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
47
47
|
fasta_reference_db.load_fai_entries
|
48
48
|
p "Fasta reference: #{fasta_reference}"
|
49
49
|
end
|
@@ -99,7 +99,7 @@ Dir.foreach(path_to_contigs) do |filename |
|
|
99
99
|
puts filename
|
100
100
|
target="#{path_to_contigs}/#{filename}"
|
101
101
|
|
102
|
-
fasta_file = Bio::DB::Fasta::FastaFile.new(target)
|
102
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
|
103
103
|
fasta_file.load_fai_entries
|
104
104
|
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
|
105
105
|
if aln.identity > min_identity
|
data/bin/homokaryot_primers.rb
CHANGED
@@ -82,7 +82,7 @@ snps = Array.new
|
|
82
82
|
#0. Load the fasta index
|
83
83
|
fasta_reference_db = nil
|
84
84
|
if reference_file
|
85
|
-
fasta_reference_db = Bio::DB::Fasta::FastaFile.new(reference_file)
|
85
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference_file})
|
86
86
|
fasta_reference_db.load_fai_entries
|
87
87
|
p "Fasta reference: #{reference_file}"
|
88
88
|
end
|
data/bin/polymarker.rb
CHANGED
@@ -87,7 +87,7 @@ snps = Array.new
|
|
87
87
|
#0. Load the fasta index
|
88
88
|
fasta_reference_db = nil
|
89
89
|
if fasta_reference
|
90
|
-
fasta_reference_db = Bio::DB::Fasta::FastaFile.new(fasta_reference)
|
90
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
91
91
|
fasta_reference_db.load_fai_entries
|
92
92
|
p "Fasta reference: #{fasta_reference}"
|
93
93
|
end
|
@@ -141,7 +141,7 @@ filename=path_to_contigs
|
|
141
141
|
puts filename
|
142
142
|
target=filename
|
143
143
|
|
144
|
-
fasta_file = Bio::DB::Fasta::FastaFile.new(target)
|
144
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
|
145
145
|
fasta_file.load_fai_entries
|
146
146
|
|
147
147
|
found_cointigs = Set.new
|
data/bin/snps_between_bams.rb
CHANGED
@@ -15,7 +15,7 @@ require path
|
|
15
15
|
|
16
16
|
|
17
17
|
|
18
|
-
fasta_db = Bio::DB::Fasta::FastaFile.new(
|
18
|
+
fasta_db = Bio::DB::Fasta::FastaFile.new(:fasta=>ARGV[0])
|
19
19
|
fasta_db.load_fai_entries
|
20
20
|
bam1 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[1]})
|
21
21
|
bam2 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]})
|
@@ -23,7 +23,7 @@ bam2 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]})
|
|
23
23
|
|
24
24
|
output_prefix = ARGV[3]
|
25
25
|
|
26
|
-
block_size=
|
26
|
+
block_size=1000
|
27
27
|
|
28
28
|
min_cov = ARGV[4].to_i ? ARGV[4].to_i : 10
|
29
29
|
chunk = ARGV[5].to_i
|
@@ -54,6 +54,38 @@ fasta_db.index.entries.each do | r |
|
|
54
54
|
|
55
55
|
|
56
56
|
begin
|
57
|
+
<<<<<<< HEAD
|
58
|
+
reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
59
|
+
reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
60
|
+
cons_1 = reg_a.consensus
|
61
|
+
cons_2 = reg_b.consensus
|
62
|
+
|
63
|
+
|
64
|
+
snps_1 = cons_1.count_ambiguities
|
65
|
+
snps_2 = cons_2.count_ambiguities
|
66
|
+
|
67
|
+
called_1 = reg_a.called
|
68
|
+
called_2 = reg_b.called
|
69
|
+
|
70
|
+
snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
|
71
|
+
|
72
|
+
snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
|
73
|
+
snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
|
74
|
+
snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
|
75
|
+
|
76
|
+
hist_1[snps_per_1k_1.to_i] += 1
|
77
|
+
hist_2[snps_per_1k_2.to_i] += 1
|
78
|
+
|
79
|
+
table_file.print "#{r.id}\t#{region.size}\t"
|
80
|
+
table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
|
81
|
+
table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
|
82
|
+
table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
|
83
|
+
fasta_file.puts ">#{r.id}_1"
|
84
|
+
fasta_file.puts "#{cons_1}"
|
85
|
+
fasta_file.puts ">#{r.id}_2"
|
86
|
+
fasta_file.puts "#{cons_2}"
|
87
|
+
|
88
|
+
=======
|
57
89
|
|
58
90
|
cons_1 = bam1.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
|
59
91
|
cons_2 = bam2.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
|
@@ -62,13 +94,10 @@ fasta_db.index.entries.each do | r |
|
|
62
94
|
snps_1 = cons_1.count_ambiguities
|
63
95
|
snps_2 = cons_2.count_ambiguities
|
64
96
|
|
65
|
-
called_1 = cons_1.upper_case_count
|
66
|
-
called_2 = cons_2.upper_case_count
|
67
|
-
|
68
97
|
snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
|
69
98
|
|
70
|
-
snps_per_1k_1 = (block_size * snps_1.to_f ) /
|
71
|
-
snps_per_1k_2 = (block_size * snps_2.to_f ) /
|
99
|
+
snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
|
100
|
+
snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
|
72
101
|
snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
|
73
102
|
|
74
103
|
hist_1[snps_per_1k_1.to_i] += 1
|
@@ -83,6 +112,7 @@ fasta_db.index.entries.each do | r |
|
|
83
112
|
fasta_file.puts ">#{r.id}_2"
|
84
113
|
fasta_file.puts "#{cons_2}"
|
85
114
|
end
|
115
|
+
>>>>>>> 1b60bd09fdb1b087d6cb53c643ff36e536efe4a3
|
86
116
|
rescue Exception => e
|
87
117
|
$stderr.puts "Unable to process #{region}: #{e.to_s}"
|
88
118
|
end
|
data/bio-polyploid-tools.gemspec
CHANGED
@@ -2,32 +2,35 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: bio-polyploid-tools 0.
|
5
|
+
# stub: bio-polyploid-tools 0.2.3 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "bio-polyploid-tools"
|
9
|
-
s.version = "0.
|
9
|
+
s.version = "0.2.3"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
13
13
|
s.authors = ["Ricardo H. Ramirez-Gonzalez"]
|
14
|
-
s.date = "2014-
|
14
|
+
s.date = "2014-04-27"
|
15
15
|
s.description = "Repository of tools developed in TGAC and Crop Genetics in JIC to work with polyploid wheat"
|
16
16
|
s.email = "ricardo.ramirez-gonzalez@tgac.ac.uk"
|
17
|
-
s.executables = ["bfr.rb", "count_variations.rb", "filter_blat_by_target_coverage.rb", "find_best_blat_hit.rb", "hexaploid_primers.rb", "homokaryot_primers.rb", "map_markers_to_contigs.rb", "markers_in_region.rb", "polymarker.rb", "snps_between_bams.rb"]
|
17
|
+
s.executables = ["bfr.rb", "count_variations.rb", "filter_blat_by_target_coverage.rb", "find_best_blat_hit.rb", "find_best_exonerate.rb", "hexaploid_primers.rb", "homokaryot_primers.rb", "map_markers_to_contigs.rb", "markers_in_region.rb", "polymarker.rb", "snps_between_bams.rb"]
|
18
18
|
s.extra_rdoc_files = [
|
19
|
-
"README"
|
19
|
+
"README",
|
20
|
+
"README.md"
|
20
21
|
]
|
21
22
|
s.files = [
|
22
23
|
"Gemfile",
|
23
24
|
"Gemfile.lock",
|
24
25
|
"README",
|
26
|
+
"README.md",
|
25
27
|
"Rakefile",
|
26
28
|
"VERSION",
|
27
29
|
"bin/bfr.rb",
|
28
30
|
"bin/count_variations.rb",
|
29
31
|
"bin/filter_blat_by_target_coverage.rb",
|
30
32
|
"bin/find_best_blat_hit.rb",
|
33
|
+
"bin/find_best_exonerate.rb",
|
31
34
|
"bin/hexaploid_primers.rb",
|
32
35
|
"bin/homokaryot_primers.rb",
|
33
36
|
"bin/map_markers_to_contigs.rb",
|
@@ -78,9 +81,7 @@ Gem::Specification.new do |s|
|
|
78
81
|
"lib/bio/PolyploidTools/PrimerRegion.rb",
|
79
82
|
"lib/bio/PolyploidTools/SNP.rb",
|
80
83
|
"lib/bio/PolyploidTools/SNPSequence.rb",
|
81
|
-
"lib/bio/SAMToolsExtensions.rb",
|
82
84
|
"lib/bio/db/exonerate.rb",
|
83
|
-
"lib/bio/db/fastadb.rb",
|
84
85
|
"lib/bio/db/primer3.rb",
|
85
86
|
"lib/bioruby-polyploid-tools.rb",
|
86
87
|
"test/data/BS00068396_51.fa",
|
@@ -119,21 +120,24 @@ Gem::Specification.new do |s|
|
|
119
120
|
s.specification_version = 4
|
120
121
|
|
121
122
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
122
|
-
s.add_runtime_dependency(%q<bio>, ["
|
123
|
-
s.add_runtime_dependency(%q<bio-samtools>, ["
|
123
|
+
s.add_runtime_dependency(%q<bio>, [">= 1.4.3"])
|
124
|
+
s.add_runtime_dependency(%q<bio-samtools>, [">= 2.0.3"])
|
124
125
|
s.add_runtime_dependency(%q<rake>, [">= 0"])
|
125
126
|
s.add_runtime_dependency(%q<jeweler>, [">= 0"])
|
127
|
+
s.add_runtime_dependency(%q<systemu>, [">= 2.5.2"])
|
126
128
|
else
|
127
|
-
s.add_dependency(%q<bio>, ["
|
128
|
-
s.add_dependency(%q<bio-samtools>, ["
|
129
|
+
s.add_dependency(%q<bio>, [">= 1.4.3"])
|
130
|
+
s.add_dependency(%q<bio-samtools>, [">= 2.0.3"])
|
129
131
|
s.add_dependency(%q<rake>, [">= 0"])
|
130
132
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
133
|
+
s.add_dependency(%q<systemu>, [">= 2.5.2"])
|
131
134
|
end
|
132
135
|
else
|
133
|
-
s.add_dependency(%q<bio>, ["
|
134
|
-
s.add_dependency(%q<bio-samtools>, ["
|
136
|
+
s.add_dependency(%q<bio>, [">= 1.4.3"])
|
137
|
+
s.add_dependency(%q<bio-samtools>, [">= 2.0.3"])
|
135
138
|
s.add_dependency(%q<rake>, [">= 0"])
|
136
139
|
s.add_dependency(%q<jeweler>, [">= 0"])
|
140
|
+
s.add_dependency(%q<systemu>, [">= 2.5.2"])
|
137
141
|
end
|
138
142
|
end
|
139
143
|
|
data/lib/bio/BFRTools.rb
CHANGED
@@ -5,252 +5,16 @@ require 'rubygems'
|
|
5
5
|
#require 'bio/db/vcf'
|
6
6
|
require 'pathname'
|
7
7
|
#require_relative 'BIOExtensions.rb'
|
8
|
-
|
8
|
+
|
9
9
|
|
10
10
|
require 'bio'
|
11
|
+
require 'bio-samtools'
|
12
|
+
|
11
13
|
require "set"
|
12
14
|
require 'systemu'
|
13
15
|
require 'json'
|
14
16
|
#require 'strmask'
|
15
17
|
|
16
|
-
=begin
|
17
|
-
|
18
|
-
Extends the methods to be able to calculate the BFR and a consensus from the pileup
|
19
|
-
|
20
|
-
=end
|
21
|
-
|
22
|
-
class Bio::DB::Pileup
|
23
|
-
|
24
|
-
#attr_accessor :minumum_ratio_for_iup_consensus
|
25
|
-
#@minumum_ratio_for_iup_consensus = 0.20
|
26
|
-
|
27
|
-
#Returns a hash with the count of bases
|
28
|
-
|
29
|
-
def bases
|
30
|
-
return @bases if @bases
|
31
|
-
@bases = self.non_refs
|
32
|
-
#puts self.ref_count
|
33
|
-
@bases[self.ref_base.upcase.to_sym] = self.ref_count
|
34
|
-
@bases
|
35
|
-
end
|
36
|
-
|
37
|
-
def base_coverage
|
38
|
-
total = 0
|
39
|
-
@bases.each do |k,v|
|
40
|
-
total += v
|
41
|
-
end
|
42
|
-
total
|
43
|
-
end
|
44
|
-
|
45
|
-
def base_ratios
|
46
|
-
return @base_ratios if @base_ratios
|
47
|
-
bases = self.bases
|
48
|
-
@base_ratios = Hash.new
|
49
|
-
bases.each do |k,v|
|
50
|
-
@base_ratios[k] = v.to_f/self.base_coverage.to_f
|
51
|
-
end
|
52
|
-
@base_ratios
|
53
|
-
end
|
54
|
-
|
55
|
-
# returns the consensus (most frequent) base from the pileup, if there are equally represented bases returns a string of all equally represented bases in alphabetical order
|
56
|
-
def consensus_iuap(minumum_ratio_for_iup_consensus)
|
57
|
-
minumum_ratio_for_iup_consensus
|
58
|
-
if @consensus_iuap.nil?
|
59
|
-
@consensus_iuap = self.ref_base.downcase
|
60
|
-
bases = self.bases
|
61
|
-
tmp = String.new
|
62
|
-
bases.each do |k,v|
|
63
|
-
tmp << k[0].to_s if v/self.coverage > minumum_ratio_for_iup_consensus
|
64
|
-
end
|
65
|
-
if tmp.length > 0
|
66
|
-
@consensus_iuap = Bio::NucleicAcid.to_IUAPC(tmp)
|
67
|
-
end
|
68
|
-
end
|
69
|
-
@consensus_iuap
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
class Bio::DB::Fasta::Region
|
74
|
-
attr_accessor :pileup, :average_coverage, :snps, :reference, :base_ratios, :consensus, :coverages, :bases
|
75
|
-
|
76
|
-
#TODO: Debug, as it hasnt been tested in the actual code.
|
77
|
-
def base_ratios_for_base(base)
|
78
|
-
@all_ratios = Hash.new unless @all_ratios
|
79
|
-
unless @all_ratios[base]
|
80
|
-
ratios = Array.new
|
81
|
-
for i in (0..region.size-1)
|
82
|
-
ratios << @base_ratios[i][base]
|
83
|
-
end
|
84
|
-
@all_ratios[base] = ratios
|
85
|
-
end
|
86
|
-
@all_ratios[base]
|
87
|
-
end
|
88
|
-
|
89
|
-
end
|
90
|
-
|
91
|
-
class Bio::DB::Sam::SAMException < RuntimeError
|
92
|
-
|
93
|
-
end
|
94
|
-
|
95
|
-
class Bio::DB::Sam
|
96
|
-
|
97
|
-
|
98
|
-
attr_accessor :minumum_ratio_for_iup_consensus
|
99
|
-
attr_reader :cached_regions
|
100
|
-
#attr_accessor :pileup_cache
|
101
|
-
@minumum_ratio_for_iup_consensus = 0.20
|
102
|
-
|
103
|
-
|
104
|
-
#Same as mpilup, but it caches the pileup, so if you want several operations on the same set of regions
|
105
|
-
#the pile for different operations, it won't execute the mpilup command several times
|
106
|
-
#Whenever you finish using a region, call mpileup_clear_cache to free the cache
|
107
|
-
#The argument Region is required, as it will be the key for the underlying hash.
|
108
|
-
#We asume that the options are constant. If they are not, the cache mechanism may not be consistent.
|
109
|
-
#
|
110
|
-
#TODO: It may be good to load partially the pileup
|
111
|
-
def mpileup_cached (opts={})
|
112
|
-
raise SAMException.new(), "A region must be provided" unless opts[:r] or opts[:region]
|
113
|
-
@pileup_cache = Hash.new unless @pileup_cache
|
114
|
-
@cached_regions = Hash.new unless @cached_regions
|
115
|
-
|
116
|
-
region = opts[:r] ? opts[:r] : opts[:region]
|
117
|
-
opts[:r] = "'#{region.to_s}'"
|
118
|
-
opts[:region] = "'#{region.to_s}'"
|
119
|
-
opts[:A] = true
|
120
|
-
#reg = region.class == Bio::DB::Fasta::Region ? region : Bio::DB::Fasta::Region.parse_region(region.to_s)
|
121
|
-
|
122
|
-
unless @cached_regions[region.to_s]
|
123
|
-
@cached_regions[region.to_s] = Bio::DB::Fasta::Region.parse_region(region.to_s)
|
124
|
-
tmp = Array.new
|
125
|
-
@cached_regions[region.to_s].pileup = tmp
|
126
|
-
#puts "Loading #{region.to_s}"
|
127
|
-
mpileup(opts) do | pile |
|
128
|
-
# puts pile
|
129
|
-
tmp << pile
|
130
|
-
yield pile
|
131
|
-
end
|
132
|
-
else
|
133
|
-
# puts "Loaded, reruning #{region.to_s}"
|
134
|
-
@cached_regions.pileup[region.to_s] .each do | pile |
|
135
|
-
yield pile
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
#Clears the pileup cache. If a region is passed as argument, just the specified region is removed
|
141
|
-
#If no region is passed, the hash is emptied
|
142
|
-
def mpileup_clear_cache (region)
|
143
|
-
return unless @cached_regions
|
144
|
-
if region
|
145
|
-
@cached_regions[region.to_s] = nil
|
146
|
-
else
|
147
|
-
@cached_regions.clear
|
148
|
-
end
|
149
|
-
end
|
150
|
-
|
151
|
-
#Gets the coverage of a region from a pileup.
|
152
|
-
def average_coverage_from_pileup(opts={})
|
153
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
154
|
-
region = opts[:region]
|
155
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
156
|
-
@cached_regions[region].average_coverage
|
157
|
-
end
|
158
|
-
|
159
|
-
#
|
160
|
-
def coverages_from_pileup(opts={})
|
161
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
162
|
-
region = opts[:region]
|
163
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
164
|
-
@cached_regions[region].coverages
|
165
|
-
end
|
166
|
-
|
167
|
-
def consensus_with_ambiguities(opts={})
|
168
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
169
|
-
region = opts[:region]
|
170
|
-
# p "consensus with ambiguities for: " << opts[:region]
|
171
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
172
|
-
@cached_regions[region].consensus
|
173
|
-
end
|
174
|
-
|
175
|
-
def calculate_stats_from_pile(opts={})
|
176
|
-
min_cov = opts[:min_cov] ? opts[:min_cov] : 20
|
177
|
-
|
178
|
-
|
179
|
-
opts[:region] = Bio::DB::Fasta::Region.parse_region( opts[:region] .to_s) unless opts[:region].class == Bio::DB::Fasta::Region
|
180
|
-
region = opts[:region]
|
181
|
-
reference = self.fetch_reference(region.entry, region.start, region.end).downcase
|
182
|
-
# p "calculationg from pile..." << region.to_s
|
183
|
-
base_ratios = Array.new(region.size, BASE_COUNT_ZERO)
|
184
|
-
bases = Array.new(region.size, BASE_COUNT_ZERO)
|
185
|
-
coverages = Array.new(region.size, 0)
|
186
|
-
total_cov = 0
|
187
|
-
|
188
|
-
self.mpileup_cached(:region=>"#{region.to_s}") do | pile |
|
189
|
-
#puts pile
|
190
|
-
#puts pile.coverage
|
191
|
-
if pile.coverage > min_cov
|
192
|
-
base_ratios[pile.pos - region.start ] = pile.base_ratios
|
193
|
-
reference[pile.pos - region.start ] = pile.consensus_iuap(0.20)
|
194
|
-
coverages[pile.pos - region.start ] = pile.coverage.to_i
|
195
|
-
bases[pile.pos - region.start ] = pile.bases
|
196
|
-
end
|
197
|
-
total_cov += pile.coverage
|
198
|
-
end
|
199
|
-
|
200
|
-
region = @cached_regions[region.to_s]
|
201
|
-
region.coverages = coverages
|
202
|
-
region.base_ratios = base_ratios
|
203
|
-
region.consensus = reference
|
204
|
-
|
205
|
-
region.average_coverage = total_cov.to_f/region.size.to_f
|
206
|
-
region.bases = bases
|
207
|
-
region
|
208
|
-
end
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
BASE_COUNT_ZERO = {:A => 0, :C => 0, :G => 0, :T => 0}
|
213
|
-
|
214
|
-
#Gets an array with the proportions of the bases in the region. If there is no coverage, a
|
215
|
-
def base_ratios_in_region(opts={})
|
216
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
217
|
-
region = opts[:region]
|
218
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
219
|
-
@cached_regions[region].base_ratios
|
220
|
-
end
|
221
|
-
|
222
|
-
#Gets an array with the bsaes count in the region. If there is no coverage, a
|
223
|
-
def bases_in_region(opts={})
|
224
|
-
opts[:region] = opts[:region].to_s if opts[:region] .class == Bio::DB::Fasta::Region
|
225
|
-
region = opts[:region]
|
226
|
-
calculate_stats_from_pile(opts) if @cached_regions == nil or @cached_regions[region] == nil
|
227
|
-
@cached_regions[region].bases
|
228
|
-
end
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
def extract_reads(opts={})
|
233
|
-
opts[:region] = Bio::DB::Fasta::Region.parse_region( opts[:region] .to_s) unless opts[:region].class == Bio::DB::Fasta::Region
|
234
|
-
fastq_filename = opts[:fastq]
|
235
|
-
fastq_file = opts[:fastq_file]
|
236
|
-
|
237
|
-
out = $stdout
|
238
|
-
|
239
|
-
print_fastq = Proc.new do |alignment|
|
240
|
-
out.puts "@#{alignment.qname}"
|
241
|
-
out.puts "#{alignment.seq}"
|
242
|
-
out.puts "+#{alignment.qname}"
|
243
|
-
out.puts "#{alignment.qual}"
|
244
|
-
end
|
245
|
-
|
246
|
-
fetch_with_function(chromosome, qstart, qstart+len, print_fastq)
|
247
|
-
|
248
|
-
|
249
|
-
end
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
end
|
254
18
|
|
255
19
|
module Bio::BFRTools
|
256
20
|
|
@@ -267,7 +31,7 @@ module Bio::BFRTools
|
|
267
31
|
BASES = [:A, :C, :G, :T]
|
268
32
|
#Sets the reference file
|
269
33
|
def reference(path)
|
270
|
-
@reference_db = Bio::DB::Fasta::FastaFile.new(path)
|
34
|
+
@reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
|
271
35
|
@reference_path = path
|
272
36
|
end
|
273
37
|
|
@@ -350,33 +114,35 @@ module Bio::BFRTools
|
|
350
114
|
self.entry = reg.entry
|
351
115
|
self.start = reg.start
|
352
116
|
self.end = reg.end
|
353
|
-
|
117
|
+
opts[:region] = reg
|
354
118
|
@container = opts[:container]
|
355
119
|
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
@parental_2_sequence = parental_2_sam.consensus_with_ambiguities(opts)
|
120
|
+
parental_1_reg = @container.parental_1_sam.fetch_region(opts)
|
121
|
+
parental_2_reg = @container.parental_2_sam.fetch_region(opts)
|
122
|
+
bulk_1_reg = @container.bulk_1_sam.fetch_region(opts)
|
123
|
+
bulk_2_reg = @container.bulk_2_sam.fetch_region(opts)
|
124
|
+
|
125
|
+
|
363
126
|
|
364
|
-
@
|
365
|
-
@
|
127
|
+
@parental_1_sequence = parental_1_reg.consensus
|
128
|
+
@parental_2_sequence = parental_2_reg.consensus
|
129
|
+
|
130
|
+
@bulk_1_sequence = bulk_1_reg.consensus
|
131
|
+
@bulk_2_sequence = bulk_2_reg.consensus
|
366
132
|
|
367
133
|
@snp_count = Container.snps_between( @parental_1_sequence , @parental_2_sequence )
|
368
134
|
|
369
|
-
@ratios_bulk_1 =
|
370
|
-
@ratios_bulk_2 =
|
135
|
+
@ratios_bulk_1 = bulk_1_reg.base_ratios
|
136
|
+
@ratios_bulk_2 = bulk_2_reg.base_ratios
|
371
137
|
|
372
|
-
@bases_bulk_1 =
|
373
|
-
@bases_bulk_2 =
|
138
|
+
@bases_bulk_1 = bulk_1_reg.bases
|
139
|
+
@bases_bulk_2 = bulk_2_reg.bases
|
374
140
|
|
375
|
-
@avg_cov_bulk_1 =
|
376
|
-
@avg_cov_bulk_2 =
|
141
|
+
@avg_cov_bulk_1 = bulk_1_reg.average_coverage
|
142
|
+
@avg_cov_bulk_2 = bulk_2_reg.average_coverage
|
377
143
|
|
378
|
-
@coverages_1 =
|
379
|
-
@coverages_2 =
|
144
|
+
@coverages_1 = bulk_1_reg.coverages
|
145
|
+
@coverages_2 = bulk_2_reg.coverages
|
380
146
|
|
381
147
|
end
|
382
148
|
|
@@ -472,7 +238,7 @@ module Bio::BFRTools
|
|
472
238
|
raise BFRToolsException.new ("The reference for the line should be :first or :second, but was " + reference.to_s )
|
473
239
|
end
|
474
240
|
|
475
|
-
relative_position = self.start + position
|
241
|
+
relative_position = self.start + position
|
476
242
|
|
477
243
|
bfr = bfrs[reference][base][position]
|
478
244
|
cov_1 = @coverages_1[position]
|
@@ -622,7 +388,7 @@ module Bio::BFRTools
|
|
622
388
|
end
|
623
389
|
|
624
390
|
def process_region(opts={})
|
625
|
-
opts = { :min_cov=>20, :max_snp_1kbp => 10 }.merge!(opts)
|
391
|
+
opts = { :min_cov=>20, :max_snp_1kbp => 10, :max_per=>0.20 }.merge!(opts)
|
626
392
|
|
627
393
|
@proccesed_regions += 1
|
628
394
|
output = opts[:output_file] ? opts[:output_file] : $stdout
|
@@ -675,7 +441,7 @@ module Bio::BFRTools
|
|
675
441
|
|
676
442
|
|
677
443
|
for informative in info
|
678
|
-
line = region.get_bfr_line(i
|
444
|
+
line = region.get_bfr_line(i, base, informative)
|
679
445
|
output.print line , "\n"
|
680
446
|
end
|
681
447
|
end
|