bio-polyploid-tools 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +17 -0
- data/Gemfile +10 -7
- data/README.md +44 -0
- data/Rakefile +14 -14
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -2
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/find_homoeologue_variations.rb +385 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +2 -2
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/polymarker.rb +73 -17
- data/bin/polymarker_capillary.rb +416 -0
- data/bin/snp_position_to_polymarker.rb +5 -3
- data/bin/snps_between_bams.rb +0 -29
- data/bin/vcfLineToTable.rb +56 -0
- data/bio-polyploid-tools.gemspec +74 -32
- data/lib/bio/BFRTools.rb +1 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
- data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
- data/lib/bio/PolyploidTools/SNP.rb +58 -18
- data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
- data/lib/bio/db/blast.rb +112 -0
- data/lib/bio/db/exonerate.rb +4 -5
- data/lib/bio/db/primer3.rb +83 -14
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_for_polymarker.fa +1 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/bfr_out_test.csv +5 -5
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/test_bfr.rb +26 -34
- data/test/test_blast.rb +47 -0
- data/test/test_exonearate.rb +4 -9
- data/test/test_snp_parsing.rb +42 -22
- metadata +81 -20
- data/Gemfile.lock +0 -67
@@ -42,7 +42,8 @@ OptionParser.new do |opts|
|
|
42
42
|
opts.on("-f", "--flanking_size INT", "Flanking size around the SNP") do |o|
|
43
43
|
options[:flanking_size] = o.to_i
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
|
+
opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line. Example: IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T\n\
|
46
47
|
requires --reference to get the sequence using a position") do |o|
|
47
48
|
options[:mutant_list] = o
|
48
49
|
test_file = o
|
@@ -76,9 +77,10 @@ File.open(test_file) do | f |
|
|
76
77
|
if region != lastRegion
|
77
78
|
lastTemplate = fasta_reference_db.fetch_sequence(region)
|
78
79
|
end
|
79
|
-
snp.
|
80
|
+
snp.full_sequence = lastTemplate
|
80
81
|
lastRegion = region
|
81
|
-
|
82
|
+
|
83
|
+
out.puts "#{snp.gene}_#{snp.snp_id_in_seq},#{snp.chromosome},#{snp.sequence_original}"
|
82
84
|
else
|
83
85
|
$stderr.puts "ERROR: Unable to find entry for #{snp.gene}"
|
84
86
|
end
|
data/bin/snps_between_bams.rb
CHANGED
@@ -54,7 +54,6 @@ fasta_db.index.entries.each do | r |
|
|
54
54
|
|
55
55
|
|
56
56
|
begin
|
57
|
-
<<<<<<< HEAD
|
58
57
|
reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
59
58
|
reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
60
59
|
cons_1 = reg_a.consensus
|
@@ -85,34 +84,6 @@ fasta_db.index.entries.each do | r |
|
|
85
84
|
fasta_file.puts ">#{r.id}_2"
|
86
85
|
fasta_file.puts "#{cons_2}"
|
87
86
|
|
88
|
-
=======
|
89
|
-
|
90
|
-
cons_1 = bam1.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
|
91
|
-
cons_2 = bam2.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
|
92
|
-
if cons_1 != cons_2
|
93
|
-
|
94
|
-
snps_1 = cons_1.count_ambiguities
|
95
|
-
snps_2 = cons_2.count_ambiguities
|
96
|
-
|
97
|
-
snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
|
98
|
-
|
99
|
-
snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
|
100
|
-
snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
|
101
|
-
snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
|
102
|
-
|
103
|
-
hist_1[snps_per_1k_1.to_i] += 1
|
104
|
-
hist_2[snps_per_1k_2.to_i] += 1
|
105
|
-
|
106
|
-
table_file.print "#{r.id}\t#{region.size}\t"
|
107
|
-
table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
|
108
|
-
table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
|
109
|
-
table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
|
110
|
-
fasta_file.puts ">#{r.id}_1"
|
111
|
-
fasta_file.puts "#{cons_1}"
|
112
|
-
fasta_file.puts ">#{r.id}_2"
|
113
|
-
fasta_file.puts "#{cons_2}"
|
114
|
-
end
|
115
|
-
>>>>>>> 1b60bd09fdb1b087d6cb53c643ff36e536efe4a3
|
116
87
|
rescue Exception => e
|
117
88
|
$stderr.puts "Unable to process #{region}: #{e.to_s}"
|
118
89
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'bio-samtools'
|
2
|
+
require 'optparse'
|
3
|
+
|
4
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
5
|
+
$: << File.expand_path('.')
|
6
|
+
path=File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
# Parses one VCF "##INFO" meta-information line, for example:
#   ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
#
# head_line - the raw header line (defaults to an empty string).
#
# Returns a Hash with the keys :id, :number, :type and :desc, all Strings, or
# nil when the line does not have the ID/Number/Type/Description layout.
def parseVCFheader(head_line="")
  m = /##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(head_line)
  # A line that does not match leaves m as nil; report that instead of
  # failing on m[1].
  return nil unless m

  {:id=>m[1], :number=>m[2], :type=>m[3], :desc=>m[4]}
end
|
18
|
+
|
19
|
+
|
20
|
+
# Maps each INFO ID declared in the VCF header to its description.
header_info = Hash.new
# Sample column names taken from the "#CHROM" line. It is declared outside the
# block on purpose: a variable first assigned inside the block is local to one
# iteration, so the names would be nil again by the time a record line is read.
sample_names = nil

ARGF.each_line do |line|
  if line.start_with? "##"
    h = parseVCFheader(line) if line.start_with? "##INFO"
    header_info[h[:id]] = h[:desc] if h
    next
  end

  if line.start_with? "#CHROM"
    # Everything after the first nine columns of the column-header line is a
    # sample name.
    sample_names = line.split.drop(9)
    next
  end

  line.chomp!
  vcf = Bio::DB::Vcf.new(line, sample_names)

  # One line per INFO field: name, value and the description from the header.
  vcf.info.each_pair { |name, val| puts "#{name}\t#{val}\t#{header_info[name]}" }

  puts "____"
  # One row per sample, preceded by a single row with the field names.
  vcf.samples.each_with_index do |sample, i|
    puts sample[1].keys.join("\t") if i == 0
    puts sample[1].values.join("\t")
  end
end
|
data/bio-polyploid-tools.gemspec
CHANGED
@@ -1,44 +1,52 @@
|
|
1
|
-
# Generated by
|
1
|
+
# Generated by juwelier
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit
|
3
|
+
# Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: bio-polyploid-tools 0.
|
5
|
+
# stub: bio-polyploid-tools 0.8.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
|
-
s.name = "bio-polyploid-tools"
|
9
|
-
s.version = "0.
|
8
|
+
s.name = "bio-polyploid-tools".freeze
|
9
|
+
s.version = "0.8.0"
|
10
10
|
|
11
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
-
s.require_paths = ["lib"]
|
13
|
-
s.authors = ["Ricardo H. Ramirez-Gonzalez"]
|
14
|
-
s.date = "
|
15
|
-
s.description = "Repository of tools developed
|
16
|
-
s.email = "ricardo.ramirez-gonzalez@
|
17
|
-
s.executables = ["bfr.rb", "count_variations.rb", "filter_blat_by_target_coverage.rb", "filter_exonerate_by_identity.rb", "find_best_blat_hit.rb", "find_best_exonerate.rb", "hexaploid_primers.rb", "homokaryot_primers.rb", "map_markers_to_contigs.rb", "markers_in_region.rb", "polymarker.rb", "snp_position_to_polymarker.rb", "snps_between_bams.rb"]
|
11
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
12
|
+
s.require_paths = ["lib".freeze]
|
13
|
+
s.authors = ["Ricardo H. Ramirez-Gonzalez".freeze]
|
14
|
+
s.date = "2018-01-18"
|
15
|
+
s.description = "Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat".freeze
|
16
|
+
s.email = "ricardo.ramirez-gonzalez@jic.ac.uk".freeze
|
17
|
+
s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "find_homoeologue_variations.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "markers_in_region.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "vcfLineToTable.rb".freeze]
|
18
18
|
s.extra_rdoc_files = [
|
19
19
|
"README",
|
20
20
|
"README.md"
|
21
21
|
]
|
22
22
|
s.files = [
|
23
|
+
".travis.yml",
|
23
24
|
"Gemfile",
|
24
|
-
"Gemfile.lock",
|
25
25
|
"README",
|
26
26
|
"README.md",
|
27
27
|
"Rakefile",
|
28
28
|
"VERSION",
|
29
29
|
"bin/bfr.rb",
|
30
|
+
"bin/blast_triads.rb",
|
31
|
+
"bin/blast_triads_promoters.rb",
|
30
32
|
"bin/count_variations.rb",
|
31
33
|
"bin/filter_blat_by_target_coverage.rb",
|
32
34
|
"bin/filter_exonerate_by_identity.rb",
|
33
35
|
"bin/find_best_blat_hit.rb",
|
34
36
|
"bin/find_best_exonerate.rb",
|
37
|
+
"bin/find_homoeologue_variations.rb",
|
38
|
+
"bin/get_longest_hsp_blastx_triads.rb",
|
35
39
|
"bin/hexaploid_primers.rb",
|
36
40
|
"bin/homokaryot_primers.rb",
|
41
|
+
"bin/mafft_triads.rb",
|
42
|
+
"bin/mafft_triads_promoters.rb",
|
37
43
|
"bin/map_markers_to_contigs.rb",
|
38
44
|
"bin/markers_in_region.rb",
|
39
45
|
"bin/polymarker.rb",
|
46
|
+
"bin/polymarker_capillary.rb",
|
40
47
|
"bin/snp_position_to_polymarker.rb",
|
41
48
|
"bin/snps_between_bams.rb",
|
49
|
+
"bin/vcfLineToTable.rb",
|
42
50
|
"bio-polyploid-tools.gemspec",
|
43
51
|
"conf/defaults.rb",
|
44
52
|
"conf/primer3_config/dangle.dh",
|
@@ -80,21 +88,29 @@ Gem::Specification.new do |s|
|
|
80
88
|
"lib/bio/PolyploidTools/ChromosomeArm.rb",
|
81
89
|
"lib/bio/PolyploidTools/ExonContainer.rb",
|
82
90
|
"lib/bio/PolyploidTools/Marker.rb",
|
91
|
+
"lib/bio/PolyploidTools/NoSNPSequence.rb",
|
83
92
|
"lib/bio/PolyploidTools/PrimerRegion.rb",
|
84
93
|
"lib/bio/PolyploidTools/SNP.rb",
|
85
94
|
"lib/bio/PolyploidTools/SNPMutant.rb",
|
86
95
|
"lib/bio/PolyploidTools/SNPSequence.rb",
|
96
|
+
"lib/bio/db/blast.rb",
|
87
97
|
"lib/bio/db/exonerate.rb",
|
88
98
|
"lib/bio/db/primer3.rb",
|
89
99
|
"lib/bioruby-polyploid-tools.rb",
|
90
100
|
"test/data/BS00068396_51.fa",
|
101
|
+
"test/data/BS00068396_51_blast.tab",
|
91
102
|
"test/data/BS00068396_51_contigs.aln",
|
92
103
|
"test/data/BS00068396_51_contigs.dnd",
|
93
104
|
"test/data/BS00068396_51_contigs.fa",
|
105
|
+
"test/data/BS00068396_51_contigs.nhr",
|
106
|
+
"test/data/BS00068396_51_contigs.nin",
|
107
|
+
"test/data/BS00068396_51_contigs.nsq",
|
94
108
|
"test/data/BS00068396_51_exonerate.tab",
|
109
|
+
"test/data/BS00068396_51_for_polymarker.fa",
|
95
110
|
"test/data/BS00068396_51_genes.txt",
|
96
111
|
"test/data/IWGSC_CSS_1AL_scaff_1455974.fa",
|
97
112
|
"test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa",
|
113
|
+
"test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai",
|
98
114
|
"test/data/LIB1716.bam",
|
99
115
|
"test/data/LIB1716.bam.bai",
|
100
116
|
"test/data/LIB1719.bam",
|
@@ -109,9 +125,31 @@ Gem::Specification.new do |s|
|
|
109
125
|
"test/data/PST130_reverse_primer.csv",
|
110
126
|
"test/data/S22380157.fa",
|
111
127
|
"test/data/S22380157.fa.fai",
|
128
|
+
"test/data/S22380157.vcf",
|
129
|
+
"test/data/S58861868/LIB1716.bam",
|
130
|
+
"test/data/S58861868/LIB1716.sam",
|
131
|
+
"test/data/S58861868/LIB1719.bam",
|
132
|
+
"test/data/S58861868/LIB1719.sam",
|
133
|
+
"test/data/S58861868/LIB1721.bam",
|
134
|
+
"test/data/S58861868/LIB1721.sam",
|
135
|
+
"test/data/S58861868/LIB1722.bam",
|
136
|
+
"test/data/S58861868/LIB1722.sam",
|
137
|
+
"test/data/S58861868/S58861868.fa",
|
138
|
+
"test/data/S58861868/S58861868.fa.fai",
|
139
|
+
"test/data/S58861868/S58861868.vcf",
|
140
|
+
"test/data/S58861868/header.txt",
|
141
|
+
"test/data/S58861868/merged.bam",
|
142
|
+
"test/data/S58861868/merged_reheader.bam",
|
143
|
+
"test/data/S58861868/merged_reheader.bam.bai",
|
112
144
|
"test/data/Test3Aspecific.csv",
|
113
145
|
"test/data/Test3Aspecific_contigs.fa",
|
114
146
|
"test/data/bfr_out_test.csv",
|
147
|
+
"test/data/headerMergeed.txt",
|
148
|
+
"test/data/headerS2238015",
|
149
|
+
"test/data/mergedLibs.bam",
|
150
|
+
"test/data/mergedLibsReheader.bam",
|
151
|
+
"test/data/mergedLibsSorted.bam",
|
152
|
+
"test/data/mergedLibsSorted.bam.bai",
|
115
153
|
"test/data/patological_cases5D.csv",
|
116
154
|
"test/data/primer_3_input_header_test",
|
117
155
|
"test/data/short_primer_design_test.csv",
|
@@ -122,38 +160,42 @@ Gem::Specification.new do |s|
|
|
122
160
|
"test/data/test_primer3_error.csv",
|
123
161
|
"test/data/test_primer3_error_contigs.fa",
|
124
162
|
"test/test_bfr.rb",
|
163
|
+
"test/test_blast.rb",
|
125
164
|
"test/test_exon_container.rb",
|
126
165
|
"test/test_exonearate.rb",
|
127
166
|
"test/test_snp_parsing.rb",
|
128
167
|
"test/test_wrong_selection.sh"
|
129
168
|
]
|
130
|
-
s.homepage = "http://github.com/tgac/bioruby-polyploid-tools"
|
131
|
-
s.licenses = ["MIT"]
|
132
|
-
s.rubygems_version = "2.4.
|
133
|
-
s.summary = "Tool to work with polyploids, NGS and molecular biology"
|
169
|
+
s.homepage = "http://github.com/tgac/bioruby-polyploid-tools".freeze
|
170
|
+
s.licenses = ["MIT".freeze]
|
171
|
+
s.rubygems_version = "2.7.4".freeze
|
172
|
+
s.summary = "Tool to work with polyploids, NGS and molecular biology".freeze
|
134
173
|
|
135
174
|
if s.respond_to? :specification_version then
|
136
175
|
s.specification_version = 4
|
137
176
|
|
138
177
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
139
|
-
s.add_runtime_dependency(%q<bio
|
140
|
-
s.add_runtime_dependency(%q<bio-samtools
|
141
|
-
s.add_runtime_dependency(%q<
|
142
|
-
s.
|
143
|
-
s.
|
178
|
+
s.add_runtime_dependency(%q<bio>.freeze, [">= 1.5.1"])
|
179
|
+
s.add_runtime_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
|
180
|
+
s.add_runtime_dependency(%q<systemu>.freeze, [">= 2.5.2"])
|
181
|
+
s.add_development_dependency(%q<shoulda>.freeze, [">= 2.10"])
|
182
|
+
s.add_development_dependency(%q<test-unit>.freeze, [">= 0"])
|
183
|
+
s.add_development_dependency(%q<juwelier>.freeze, [">= 0"])
|
144
184
|
else
|
145
|
-
s.add_dependency(%q<bio
|
146
|
-
s.add_dependency(%q<bio-samtools
|
147
|
-
s.add_dependency(%q<
|
148
|
-
s.add_dependency(%q<
|
149
|
-
s.add_dependency(%q<
|
185
|
+
s.add_dependency(%q<bio>.freeze, [">= 1.5.1"])
|
186
|
+
s.add_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
|
187
|
+
s.add_dependency(%q<systemu>.freeze, [">= 2.5.2"])
|
188
|
+
s.add_dependency(%q<shoulda>.freeze, [">= 2.10"])
|
189
|
+
s.add_dependency(%q<test-unit>.freeze, [">= 0"])
|
190
|
+
s.add_dependency(%q<juwelier>.freeze, [">= 0"])
|
150
191
|
end
|
151
192
|
else
|
152
|
-
s.add_dependency(%q<bio
|
153
|
-
s.add_dependency(%q<bio-samtools
|
154
|
-
s.add_dependency(%q<
|
155
|
-
s.add_dependency(%q<
|
156
|
-
s.add_dependency(%q<
|
193
|
+
s.add_dependency(%q<bio>.freeze, [">= 1.5.1"])
|
194
|
+
s.add_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
|
195
|
+
s.add_dependency(%q<systemu>.freeze, [">= 2.5.2"])
|
196
|
+
s.add_dependency(%q<shoulda>.freeze, [">= 2.10"])
|
197
|
+
s.add_dependency(%q<test-unit>.freeze, [">= 0"])
|
198
|
+
s.add_dependency(%q<juwelier>.freeze, [">= 0"])
|
157
199
|
end
|
158
200
|
end
|
159
201
|
|
data/lib/bio/BFRTools.rb
CHANGED
@@ -22,6 +22,7 @@ module Bio::PolyploidTools
|
|
22
22
|
# puts entry
|
23
23
|
@fasta_db.fetch_sequence(entry.get_full_region)
|
24
24
|
end
|
25
|
+
|
25
26
|
#Loads all the chromosome arms in a folder.
|
26
27
|
#The current version requires that all the references end with .fa, and start with XXX_*.fa
|
27
28
|
#Where XXX is the chromosome name
|
@@ -29,16 +30,11 @@ module Bio::PolyploidTools
|
|
29
30
|
chromosomeArms = Hash.new
|
30
31
|
|
31
32
|
Dir.foreach(path_to_contigs) do |filename |
|
32
|
-
|
33
33
|
if File.fnmatch("*.fa", filename)
|
34
34
|
|
35
35
|
parsed = /^(?<arm>\d\w+)/.match(filename)
|
36
|
-
|
37
36
|
target="#{path_to_contigs}/#{filename}"
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
# fasta_file = Bio::DB::Fasta::FastaFile.new(target)
|
37
|
+
#fasta_file = Bio::DB::Fasta::FastaFile.new(target)
|
42
38
|
#fasta_file.load_fai_entries
|
43
39
|
arm = ChromosomeArm.new(parsed[:arm], target)
|
44
40
|
chromosomeArms[arm.name] = arm
|
@@ -19,15 +19,31 @@ module Bio::PolyploidTools
|
|
19
19
|
|
20
20
|
# Opens the FASTA file with the gene models and remembers where it came from.
#
# path - location of the gene models FASTA file.
#
# The index of the FASTA file is requested right after opening it. The path
# is stored in @gene_models_path and is also the value of the method.
def gene_models(path)
  fasta = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
  @gene_models_db = fasta
  fasta.index
  @gene_models_path = path
end
|
24
25
|
|
25
26
|
#Returns the sequence for a region in the gene models (exon)
#
# region - object that answers to entry and end and accepts end=. Its end is
#          clipped in place when it lies beyond the length of the entry.
#
# Returns the fetched sequence, cut just before the first '>' if one is found.
def gene_model_sequence(region)
  target_reg = @gene_models_db.index.region_for_entry(region.entry)
  # Never ask for more than the indexed entry holds.
  region.end = target_reg.length if region.end > target_reg.length

  seq = @gene_models_db.fetch_sequence(region)
  #This is a patch that we need to fix in biosamtools:
  # presumably the fetched text can run into the next FASTA header, so
  # everything from the first '>' onwards is dropped.
  header_at = seq.index('>')
  # seq[0, n] keeps exactly the n characters in front of the '>'. A range of
  # 0..n-1 would become 0..-1 when the '>' is the very first character and
  # hand back the whole string instead of nothing.
  seq = seq[0, header_at] if header_at
  seq
end
|
32
48
|
|
33
49
|
#Sets the reference file for the gene models
|
@@ -40,10 +56,10 @@ module Bio::PolyploidTools
|
|
40
56
|
def chromosome_sequence(region)
|
41
57
|
left_pad = 0
|
42
58
|
#TODO: Pad if it goes to the right
|
43
|
-
if(region.start <
|
59
|
+
if(region.start < 1)
|
44
60
|
left_pad = region.start * -1
|
45
61
|
left_pad += 1
|
46
|
-
region.start =
|
62
|
+
region.start = 1
|
47
63
|
end
|
48
64
|
str = "-" * left_pad << @chromosomes_db.fetch_sequence(region)
|
49
65
|
#str << "n" * (region.size - str.size + 1) if region.size > str.size
|
@@ -116,12 +132,17 @@ module Bio::PolyploidTools
|
|
116
132
|
@snp_map.each do | gene, snp_array|
|
117
133
|
snp_array.each do |snp|
|
118
134
|
#file.puts snp.primer_fasta_string
|
119
|
-
|
135
|
+
#puts "In print_fast_np_exones"
|
136
|
+
#puts snp.inspect
|
137
|
+
|
120
138
|
begin
|
121
139
|
file.puts snp.aligned_sequences_fasta
|
122
140
|
rescue Exception=>e
|
123
141
|
@missing_exons << snp.to_s
|
124
|
-
$stderr.puts e.to_s
|
142
|
+
$stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
|
143
|
+
$stderr.puts "Local position: #{snp.local_position}"
|
144
|
+
$stderr.puts "Local position: #{snp.parental_sequences.to_s}"
|
145
|
+
$stderr.puts e.backtrace
|
125
146
|
end
|
126
147
|
end
|
127
148
|
end
|
@@ -143,8 +164,10 @@ module Bio::PolyploidTools
|
|
143
164
|
end
|
144
165
|
rescue Exception=>e
|
145
166
|
@missing_exons << snp.to_s
|
167
|
+
# $stderr.puts ""
|
146
168
|
|
147
|
-
$stderr.puts e.to_s
|
169
|
+
$stderr.puts "print_primer_3_exons: #{e.to_s} : snp.to_s"
|
170
|
+
$stderr.puts e.backtrace
|
148
171
|
end
|
149
172
|
end
|
150
173
|
end
|
@@ -0,0 +1,286 @@
|
|
1
|
+
|
2
|
+
require_relative "SNP"
|
3
|
+
require 'bio-samtools'
|
4
|
+
module Bio::PolyploidTools
|
5
|
+
# Error type for marker lines that cannot be parsed; a plain RuntimeError
# subclass with no behaviour of its own.
class SNPSequenceException < RuntimeError; end
|
7
|
+
|
8
|
+
class NoSNPSequence < SNP
|
9
|
+
|
10
|
+
attr_accessor :sequence_original
|
11
|
+
#Format:
#snp name,chromosome from contig,microarray sequence
#BS00068396_51,2AS,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
#
# The chromosome field may be left out, giving two fields: name,sequence.
#
# reg_str - one comma separated line; it is not modified.
#
# Returns the populated NoSNPSequence.
# Raises SNPSequenceException when the line has neither two nor three fields.
def self.parse(reg_str)
  # Work on a chomped copy so the caller's string (possibly frozen) is untouched.
  line = reg_str.chomp
  fields = line.split(",")
  snp = NoSNPSequence.new

  case fields.size
  when 3
    snp.gene, snp.chromosome, snp.sequence_original = fields
  when 2
    snp.gene, snp.sequence_original = fields
  else
    # raise, not throw: throw is for catch/throw control flow and would not
    # surface this exception class.
    raise SNPSequenceException.new("Need two or three fields to parse, and got #{fields.size} in #{line}")
  end
  #snp.position = snp.position.to_i
  #snp.original.upcase!
  #snp.snp.upcase!
  # No chromosome is set by the two-field form, so only strip when there is one.
  snp.chromosome.strip! if snp.chromosome
  snp.snp_in = snp.chromosome
  snp.parse_sequence_snp
  snp.exon_list = Hash.new()
  snp
end
|
36
|
+
|
37
|
+
# No-op: the body is empty, so nothing is parsed and the result is nil.
def parse_snp; end
|
40
|
+
|
41
|
+
# Uses the middle of sequence_original as the position of interest:
# @position is half the sequence length (integer division), and @original and
# @snp are both set to the base found at that index.
def parse_sequence_snp
  midpoint = sequence_original.length.div(2)
  @position = midpoint
  @snp = @original = sequence_original[midpoint]
end
|
46
|
+
|
47
|
+
# Text form of the marker: the gene and the chromosome joined by a colon.
def to_s
  format("%s:%s", gene, chromosome)
end
|
50
|
+
|
51
|
+
# Sequences that will be aligned. They are taken from
# surrounding_exon_sequences the first time and kept in @sequences_to_align
# for later calls.
def sequences_to_align
  @sequences_to_align ||= surrounding_exon_sequences
end
|
55
|
+
|
56
|
+
# Builds a mask, one character per alignment column, for the given chromosome.
#
# chromosome - name of the aligned sequence the mask is built for.
#
# The mask starts as the lowercased aligned sequence of that chromosome, or as
# a run of "-" when the chromosome is not in the alignment. For each column,
# looking at the sequences named by exon_sequences.keys:
#   "*"        no sequence has a base in the column
#   "-"        only one sequence has a base, or no other sequence differs
#   uppercase  every other name differs and the number of covering sequences
#              whose name starts with chromosome_group equals genomes_count
#   otherwise  the character is left as it was
#
# Returns the mask, or nil when there are no aligned sequences.
def mask_aligned_chromosomal_snp(chromosome)
  return nil if aligned_sequences.values.size == 0

  names = exon_sequences.keys
  target = aligned_sequences[chromosome]
  masked_snps = target ? target.downcase : "-" * aligned_sequences.values[0].size
  #TODO: Make this chromosome specific, even when we have more than one alignment going to the region we want.
  expected_snps = names.size - 1

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    from_group = 0

    names.each do |chr|
      other = aligned_sequences[chr]
      next unless other && other[i] != "-"

      cov += 1
      from_group += 1 if chr[0] == chromosome_group
      next if chr == chromosome

      $stderr.puts "WARN: No base for #{masked_snps} : ##{i}" unless masked_snps[i].upcase
      $stderr.puts "WARN: No base for #{other} : ##{i}" unless masked_snps[i].upcase
      different += 1 if masked_snps[i].upcase != other[i].upcase
    end

    if cov == 0
      masked_snps[i] = "*"
    elsif cov == 1 || different == 0
      masked_snps[i] = "-"
    end

    if different == expected_snps && from_group == genomes_count
      masked_snps[i] = masked_snps[i].upcase
    end
  end

  masked_snps
end
|
94
|
+
|
95
|
+
# Counts the alignment columns that hold a '-' in at least one sequence, in a
# window of flanking_size columns on each side of position.
#
# position          - centre column of the window.
# target_chromosome - name of the aligned sequence whose length bounds the window.
#
# The window is clipped to 0 on the left and to the last column of the target
# sequence on the right. While counting, every inspected base and the running
# count are printed to standard output, tab separated, one line per column.
#
# Returns the number of columns with a deletion.
def count_deletions_around(position,target_chromosome)
  first_aligned = aligned_sequences[target_chromosome]

  pos_start = [position - flanking_size, 0].max
  pos_end = [position + flanking_size, first_aligned.size - 1].min

  count = 0
  (pos_start..pos_end).each do |column|
    has_del = false
    aligned_sequences.each_pair do |_name, bases|
      has_del = true if bases[column] == '-'
      print "#{bases[column]}\t"
    end
    count += 1 if has_del
    print "#{count}\n"
  end
  count
end
|
115
|
+
|
116
|
+
# Builds the PrimerRegion for target_chromosome from its aligned sequence and
# the mask returned by mask_aligned_chromosomal_snp.
#
# target_chromosome - name of the aligned sequence to design on.
# parental_chr      - not used by this implementation.
#
# Columns where the target has a gap are skipped. For the remaining columns
# the mask character decides what is recorded:
#   '-'        nothing is recorded
#   uppercase  the position goes to the chromosome specific lists
#   lowercase  the position goes to the almost chromosome specific lists
# Positions are stored both relative to the ungapped region and, in the
# *_in_mask lists, relative to the alignment.
#
# Returns the PrimerRegion; its sequence is the working copy without gaps.
def primer_region(target_chromosome, parental_chr )
  chromosome_seq = aligned_sequences[target_chromosome]
  #chromosome_seq = "-" * parental.size unless chromosome_seq
  # Without an alignment, fall back to the unaligned surrounding sequence.
  chromosome_seq = surrounding_exon_sequences[target_chromosome] if aligned_sequences.size == 0
  chromosome_seq = chromosome_seq.downcase

  mask = mask_aligned_chromosomal_snp(target_chromosome)

  pr = PrimerRegion.new
  pr.homoeologous = false
  position_in_region = 0
  parental = chromosome_seq.clone

  chromosome_seq.size.times do |i|
    next if chromosome_seq[i] == '-'

    flag = mask[i]
    if flag == '-'
      #When the mask doesnt detect a SNP, so we take the parental
      parental[i] = chromosome_seq[i] unless Bio::NucleicAcid::is_unambiguous(parental[i])
    elsif /[[:upper:]]/.match(flag)
      #This is a good candidate for marking a SNP
      #We validate that the consensus from the sam file accepts the variation from the chromosomal sequence
      if parental[i] == '-'
        parental[i] = flag
        pr.crhomosome_specific_intron << position_in_region
      elsif Bio::NucleicAcid.is_valid(parental[i], flag)
        parental[i] = flag
        # NOTE(review): the deletion count is always taken around column 1,
        # not around the current column - confirm this is intended.
        pr.chromosome_specific << position_in_region if count_deletions_around(1,target_chromosome) < 3
        pr.chromosome_specific_in_mask << i
      end
    elsif /[[:lower:]]/.match(flag)
      #this is not that good candidate, but still gives specificity
      if parental[i] == '-'
        parental[i] = flag
        pr.almost_crhomosome_specific_intron << position_in_region
      elsif Bio::NucleicAcid.is_valid(parental[i], flag)
        parental[i] = flag.upcase
        pr.almost_chromosome_specific << position_in_region
        pr.almost_chromosome_specific_in_mask << i
      end
    end

    pr.position_in_mask_from_template[position_in_region] = i
    position_in_region += 1
  end

  pr.sequence=parental.gsub('-','')
  pr
end
|
172
|
+
|
173
|
+
# Builds the Primer3 input record(s) for one forced primer position.
#
# opts - Hash with:
#        :name      - text used in SEQUENCE_ID.
#        :sequence  - template sequence.
#        :left_pos  - index the left primer end is forced to (required).
#        :right_pos - optional index the right primer end is forced to.
#
# With :right_pos a single record is produced. When the left position lies
# after the right one, both positions are mirrored and the template is
# reverse complemented. The record is dropped (nil) when the
# @variation_free_region bases following the right position contain an
# uppercase character.
# Without :right_pos two records are produced: the forward one, and one on the
# reverse complement with the left position mirrored.
#
# Returns the record text, or nil when the variation free check fails.
def return_primer_3_string_test(opts={})
  # The left end comes from :left_pos; reading :right_pos here made both ends
  # identical and left the value nil whenever no right position was given.
  left = opts[:left_pos]
  right = opts[:right_pos]
  sequence = opts[:sequence]
  orientation = "forward"
  if opts[:right_pos]
    if left > right
      left = sequence.size - left - 1
      right = sequence.size - right - 1
      sequence = reverse_complement_string(sequence)
      orientation = "reverse"
    end
    if @variation_free_region && @variation_free_region > 0
      # check_str is nil when the window starts beyond the template.
      check_str = sequence[right+1, @variation_free_region]
      return nil if check_str && check_str != check_str.downcase
    end
  end

  str = "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
  str << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
  str << "SEQUENCE_FORCE_RIGHT_END=#{right}\n" if opts[:right_pos]
  str << "SEQUENCE_TEMPLATE=#{sequence}\n"
  str << "=\n"

  #In case that we don't have a right primer, we do both orientations
  unless opts[:right_pos]
    sequence = opts[:sequence]
    left = sequence.size - left - 1
    orientation = "reverse"
    sequence = reverse_complement_string(sequence)
    str << "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
    str << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
    str << "SEQUENCE_TEMPLATE=#{sequence}\n"
    str << "=\n"
  end

  str
end
|
216
|
+
|
217
|
+
# Looks up the base at an alignment column in a sequence other than the target.
#
# position          - index into the aligned sequence.
# target_chromosome - name of the sequence to skip.
#
# Returns the base at position in the first aligned sequence whose name is not
# target_chromosome, or nil when there is no such sequence.
def get_base_in_different_chromosome(position, target_chromosome)
  aligned_sequences.each_pair do |name, bases|
    return bases[position] unless name == target_chromosome
  end
  # Without this the method would hand back the whole collection returned by
  # each_pair when only the target itself is present.
  nil
end
|
224
|
+
|
225
|
+
# Collects the Primer3 input records for every chromosome specific position of
# the primer region of target_chromosome.
#
# target_chromosome - name of the aligned sequence to design on.
# parental          - passed through to primer_region.
#
# For each position two records are requested from return_primer_3_string:
# "A" on the region sequence as it is, and "B" on a copy in which the base at
# the position is replaced by the uppercased base from another chromosome.
#
# Returns an Array of records; it is empty when the region sequence is shorter
# than primer_3_min_seq_length.
def primer_3_all_strings(target_chromosome, parental)
  pr = primer_region(target_chromosome, parental )
  records = Array.new

  seq_original = String.new(pr.sequence)
  return records if seq_original.size < primer_3_min_seq_length

  snp_type = pr.homoeologous ? "homoeologous" : "non-homoeologous"

  pr.chromosome_specific.each do |pos|
    other_chromosome_base = get_base_in_different_chromosome(pos, target_chromosome)

    args = {
      :name => "#{gene} A chromosome_specific exon #{snp_type} #{chromosome}",
      :left_pos => pos,
      :sequence => seq_original
    }
    records << return_primer_3_string(args)

    #TODO: Find base from another chromosome
    seq_snp = String.new(pr.sequence)
    seq_snp[pos] = other_chromosome_base.upcase
    args[:name] = "#{gene} B chromosome_specific exon #{snp_type} #{chromosome}"
    args[:sequence] = seq_snp
    records << return_primer_3_string(args)
  end

  records
end
|
264
|
+
|
265
|
+
# Aligned form of sequences_to_align, computed once and kept in
# @aligned_sequences.
#
# A single sequence needs no alignment and is used as it is. Otherwise the
# sequences are handed to the "mafft" program through Bio::MAFFT with the
# options --maxiterate 1000 --localpair --quiet, and the alignment of the
# report is kept.
def aligned_sequences
  return @aligned_sequences if @aligned_sequences

  if sequences_to_align.size == 1
    @aligned_sequences = sequences_to_align
  else
    mafft = Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet'])
    @aligned_sequences = mafft.query_align(sequences_to_align).alignment
  end
  @aligned_sequences
end
|
280
|
+
|
281
|
+
|
282
|
+
|
283
|
+
|
284
|
+
|
285
|
+
end
|
286
|
+
end
|