bio-polyploid-tools 0.7.3 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +17 -0
- data/Gemfile +10 -7
- data/README.md +44 -0
- data/Rakefile +14 -14
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -2
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/find_homoeologue_variations.rb +385 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +2 -2
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/polymarker.rb +73 -17
- data/bin/polymarker_capillary.rb +416 -0
- data/bin/snp_position_to_polymarker.rb +5 -3
- data/bin/snps_between_bams.rb +0 -29
- data/bin/vcfLineToTable.rb +56 -0
- data/bio-polyploid-tools.gemspec +74 -32
- data/lib/bio/BFRTools.rb +1 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
- data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
- data/lib/bio/PolyploidTools/SNP.rb +58 -18
- data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
- data/lib/bio/db/blast.rb +112 -0
- data/lib/bio/db/exonerate.rb +4 -5
- data/lib/bio/db/primer3.rb +83 -14
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_for_polymarker.fa +1 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/bfr_out_test.csv +5 -5
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/test_bfr.rb +26 -34
- data/test/test_blast.rb +47 -0
- data/test/test_exonearate.rb +4 -9
- data/test/test_snp_parsing.rb +42 -22
- metadata +81 -20
- data/Gemfile.lock +0 -67
@@ -42,7 +42,8 @@ OptionParser.new do |opts|
|
|
42
42
|
opts.on("-f", "--flanking_size INT", "Flanking size around the SNP") do |o|
|
43
43
|
options[:flanking_size] = o.to_i
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
|
+
opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line. Example: IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T\n\
|
46
47
|
requires --reference to get the sequence using a position") do |o|
|
47
48
|
options[:mutant_list] = o
|
48
49
|
test_file = o
|
@@ -76,9 +77,10 @@ File.open(test_file) do | f |
|
|
76
77
|
if region != lastRegion
|
77
78
|
lastTemplate = fasta_reference_db.fetch_sequence(region)
|
78
79
|
end
|
79
|
-
snp.
|
80
|
+
snp.full_sequence = lastTemplate
|
80
81
|
lastRegion = region
|
81
|
-
|
82
|
+
|
83
|
+
out.puts "#{snp.gene}_#{snp.snp_id_in_seq},#{snp.chromosome},#{snp.sequence_original}"
|
82
84
|
else
|
83
85
|
$stderr.puts "ERROR: Unable to find entry for #{snp.gene}"
|
84
86
|
end
|
data/bin/snps_between_bams.rb
CHANGED
@@ -54,7 +54,6 @@ fasta_db.index.entries.each do | r |
|
|
54
54
|
|
55
55
|
|
56
56
|
begin
|
57
|
-
<<<<<<< HEAD
|
58
57
|
reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
59
58
|
reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
60
59
|
cons_1 = reg_a.consensus
|
@@ -85,34 +84,6 @@ fasta_db.index.entries.each do | r |
|
|
85
84
|
fasta_file.puts ">#{r.id}_2"
|
86
85
|
fasta_file.puts "#{cons_2}"
|
87
86
|
|
88
|
-
=======
|
89
|
-
|
90
|
-
cons_1 = bam1.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
|
91
|
-
cons_2 = bam2.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
|
92
|
-
if cons_1 != cons_2
|
93
|
-
|
94
|
-
snps_1 = cons_1.count_ambiguities
|
95
|
-
snps_2 = cons_2.count_ambiguities
|
96
|
-
|
97
|
-
snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
|
98
|
-
|
99
|
-
snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
|
100
|
-
snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
|
101
|
-
snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
|
102
|
-
|
103
|
-
hist_1[snps_per_1k_1.to_i] += 1
|
104
|
-
hist_2[snps_per_1k_2.to_i] += 1
|
105
|
-
|
106
|
-
table_file.print "#{r.id}\t#{region.size}\t"
|
107
|
-
table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
|
108
|
-
table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
|
109
|
-
table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
|
110
|
-
fasta_file.puts ">#{r.id}_1"
|
111
|
-
fasta_file.puts "#{cons_1}"
|
112
|
-
fasta_file.puts ">#{r.id}_2"
|
113
|
-
fasta_file.puts "#{cons_2}"
|
114
|
-
end
|
115
|
-
>>>>>>> 1b60bd09fdb1b087d6cb53c643ff36e536efe4a3
|
116
87
|
rescue Exception => e
|
117
88
|
$stderr.puts "Unable to process #{region}: #{e.to_s}"
|
118
89
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'bio-samtools'
|
2
|
+
require 'optparse'
|
3
|
+
|
4
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
5
|
+
$: << File.expand_path('.')
|
6
|
+
path=File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
# Parses one VCF "##INFO" meta-information line, for example:
#   ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
#
# head_line - the raw header line (a trailing newline is tolerated).
#
# Returns a Hash with the keys :id, :number, :type and :desc, or nil when the
# line is not a well-formed INFO declaration. Attributes that follow the
# description (such as Source or Version) are ignored.
def parseVCFheader(head_line="")
  # The description is matched lazily up to its closing quote so that any
  # trailing attributes do not leak into :desc.
  m = /##INFO=<ID=([^,]+),Number=([^,]+),Type=([^,]+),Description="(.+?)"(?:,[^>]*)?>/.match(head_line)
  return nil unless m

  {:id=>m[1], :number=>m[2], :type=>m[3], :desc=>m[4]}
end
|
18
|
+
|
19
|
+
|
20
|
+
# Reads a VCF from ARGF and prints, for every record, one line per INFO field
# (name, value, description from the header), a "____" separator, and then the
# per-sample fields as a tab-separated table.

# INFO id => description, collected from the "##INFO" header lines.
header_info = {}
# Sample column names taken from the "#CHROM" line. This must be declared
# outside the block: a variable first assigned inside the block is local to a
# single iteration and would be nil again on every data line.
sample_names = nil

ARGF.each_line do |line|
  if line.start_with? "##INFO"
    h = parseVCFheader(line)
    header_info[h[:id]] = h[:desc] if h
  end
  next if line.start_with? "##"

  if line.start_with? "#CHROM"
    # The first nine columns are the fixed VCF fields; the rest are samples.
    sample_names = line.split.drop(9)
    next
  end

  line.chomp!
  vcf = Bio::DB::Vcf.new(line, sample_names)
  vcf.info.each_pair { |name, val| puts "#{name}\t#{val}\t#{header_info[name]}" }

  puts "____"
  vcf.samples.each_with_index do |sample, i|
    # sample is a [name, fields] pair; print the field names once as a header row.
    puts sample[1].keys.join("\t") if i == 0
    puts sample[1].values.join("\t")
  end
end
|
data/bio-polyploid-tools.gemspec
CHANGED
@@ -1,44 +1,52 @@
|
|
1
|
-
# Generated by
|
1
|
+
# Generated by juwelier
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit
|
3
|
+
# Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: bio-polyploid-tools 0.
|
5
|
+
# stub: bio-polyploid-tools 0.8.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
|
-
s.name = "bio-polyploid-tools"
|
9
|
-
s.version = "0.
|
8
|
+
s.name = "bio-polyploid-tools".freeze
|
9
|
+
s.version = "0.8.0"
|
10
10
|
|
11
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
-
s.require_paths = ["lib"]
|
13
|
-
s.authors = ["Ricardo H. Ramirez-Gonzalez"]
|
14
|
-
s.date = "
|
15
|
-
s.description = "Repository of tools developed
|
16
|
-
s.email = "ricardo.ramirez-gonzalez@
|
17
|
-
s.executables = ["bfr.rb", "count_variations.rb", "filter_blat_by_target_coverage.rb", "filter_exonerate_by_identity.rb", "find_best_blat_hit.rb", "find_best_exonerate.rb", "hexaploid_primers.rb", "homokaryot_primers.rb", "map_markers_to_contigs.rb", "markers_in_region.rb", "polymarker.rb", "snp_position_to_polymarker.rb", "snps_between_bams.rb"]
|
11
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
12
|
+
s.require_paths = ["lib".freeze]
|
13
|
+
s.authors = ["Ricardo H. Ramirez-Gonzalez".freeze]
|
14
|
+
s.date = "2018-01-18"
|
15
|
+
s.description = "Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat".freeze
|
16
|
+
s.email = "ricardo.ramirez-gonzalez@jic.ac.uk".freeze
|
17
|
+
s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "find_homoeologue_variations.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "markers_in_region.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "vcfLineToTable.rb".freeze]
|
18
18
|
s.extra_rdoc_files = [
|
19
19
|
"README",
|
20
20
|
"README.md"
|
21
21
|
]
|
22
22
|
s.files = [
|
23
|
+
".travis.yml",
|
23
24
|
"Gemfile",
|
24
|
-
"Gemfile.lock",
|
25
25
|
"README",
|
26
26
|
"README.md",
|
27
27
|
"Rakefile",
|
28
28
|
"VERSION",
|
29
29
|
"bin/bfr.rb",
|
30
|
+
"bin/blast_triads.rb",
|
31
|
+
"bin/blast_triads_promoters.rb",
|
30
32
|
"bin/count_variations.rb",
|
31
33
|
"bin/filter_blat_by_target_coverage.rb",
|
32
34
|
"bin/filter_exonerate_by_identity.rb",
|
33
35
|
"bin/find_best_blat_hit.rb",
|
34
36
|
"bin/find_best_exonerate.rb",
|
37
|
+
"bin/find_homoeologue_variations.rb",
|
38
|
+
"bin/get_longest_hsp_blastx_triads.rb",
|
35
39
|
"bin/hexaploid_primers.rb",
|
36
40
|
"bin/homokaryot_primers.rb",
|
41
|
+
"bin/mafft_triads.rb",
|
42
|
+
"bin/mafft_triads_promoters.rb",
|
37
43
|
"bin/map_markers_to_contigs.rb",
|
38
44
|
"bin/markers_in_region.rb",
|
39
45
|
"bin/polymarker.rb",
|
46
|
+
"bin/polymarker_capillary.rb",
|
40
47
|
"bin/snp_position_to_polymarker.rb",
|
41
48
|
"bin/snps_between_bams.rb",
|
49
|
+
"bin/vcfLineToTable.rb",
|
42
50
|
"bio-polyploid-tools.gemspec",
|
43
51
|
"conf/defaults.rb",
|
44
52
|
"conf/primer3_config/dangle.dh",
|
@@ -80,21 +88,29 @@ Gem::Specification.new do |s|
|
|
80
88
|
"lib/bio/PolyploidTools/ChromosomeArm.rb",
|
81
89
|
"lib/bio/PolyploidTools/ExonContainer.rb",
|
82
90
|
"lib/bio/PolyploidTools/Marker.rb",
|
91
|
+
"lib/bio/PolyploidTools/NoSNPSequence.rb",
|
83
92
|
"lib/bio/PolyploidTools/PrimerRegion.rb",
|
84
93
|
"lib/bio/PolyploidTools/SNP.rb",
|
85
94
|
"lib/bio/PolyploidTools/SNPMutant.rb",
|
86
95
|
"lib/bio/PolyploidTools/SNPSequence.rb",
|
96
|
+
"lib/bio/db/blast.rb",
|
87
97
|
"lib/bio/db/exonerate.rb",
|
88
98
|
"lib/bio/db/primer3.rb",
|
89
99
|
"lib/bioruby-polyploid-tools.rb",
|
90
100
|
"test/data/BS00068396_51.fa",
|
101
|
+
"test/data/BS00068396_51_blast.tab",
|
91
102
|
"test/data/BS00068396_51_contigs.aln",
|
92
103
|
"test/data/BS00068396_51_contigs.dnd",
|
93
104
|
"test/data/BS00068396_51_contigs.fa",
|
105
|
+
"test/data/BS00068396_51_contigs.nhr",
|
106
|
+
"test/data/BS00068396_51_contigs.nin",
|
107
|
+
"test/data/BS00068396_51_contigs.nsq",
|
94
108
|
"test/data/BS00068396_51_exonerate.tab",
|
109
|
+
"test/data/BS00068396_51_for_polymarker.fa",
|
95
110
|
"test/data/BS00068396_51_genes.txt",
|
96
111
|
"test/data/IWGSC_CSS_1AL_scaff_1455974.fa",
|
97
112
|
"test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa",
|
113
|
+
"test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai",
|
98
114
|
"test/data/LIB1716.bam",
|
99
115
|
"test/data/LIB1716.bam.bai",
|
100
116
|
"test/data/LIB1719.bam",
|
@@ -109,9 +125,31 @@ Gem::Specification.new do |s|
|
|
109
125
|
"test/data/PST130_reverse_primer.csv",
|
110
126
|
"test/data/S22380157.fa",
|
111
127
|
"test/data/S22380157.fa.fai",
|
128
|
+
"test/data/S22380157.vcf",
|
129
|
+
"test/data/S58861868/LIB1716.bam",
|
130
|
+
"test/data/S58861868/LIB1716.sam",
|
131
|
+
"test/data/S58861868/LIB1719.bam",
|
132
|
+
"test/data/S58861868/LIB1719.sam",
|
133
|
+
"test/data/S58861868/LIB1721.bam",
|
134
|
+
"test/data/S58861868/LIB1721.sam",
|
135
|
+
"test/data/S58861868/LIB1722.bam",
|
136
|
+
"test/data/S58861868/LIB1722.sam",
|
137
|
+
"test/data/S58861868/S58861868.fa",
|
138
|
+
"test/data/S58861868/S58861868.fa.fai",
|
139
|
+
"test/data/S58861868/S58861868.vcf",
|
140
|
+
"test/data/S58861868/header.txt",
|
141
|
+
"test/data/S58861868/merged.bam",
|
142
|
+
"test/data/S58861868/merged_reheader.bam",
|
143
|
+
"test/data/S58861868/merged_reheader.bam.bai",
|
112
144
|
"test/data/Test3Aspecific.csv",
|
113
145
|
"test/data/Test3Aspecific_contigs.fa",
|
114
146
|
"test/data/bfr_out_test.csv",
|
147
|
+
"test/data/headerMergeed.txt",
|
148
|
+
"test/data/headerS2238015",
|
149
|
+
"test/data/mergedLibs.bam",
|
150
|
+
"test/data/mergedLibsReheader.bam",
|
151
|
+
"test/data/mergedLibsSorted.bam",
|
152
|
+
"test/data/mergedLibsSorted.bam.bai",
|
115
153
|
"test/data/patological_cases5D.csv",
|
116
154
|
"test/data/primer_3_input_header_test",
|
117
155
|
"test/data/short_primer_design_test.csv",
|
@@ -122,38 +160,42 @@ Gem::Specification.new do |s|
|
|
122
160
|
"test/data/test_primer3_error.csv",
|
123
161
|
"test/data/test_primer3_error_contigs.fa",
|
124
162
|
"test/test_bfr.rb",
|
163
|
+
"test/test_blast.rb",
|
125
164
|
"test/test_exon_container.rb",
|
126
165
|
"test/test_exonearate.rb",
|
127
166
|
"test/test_snp_parsing.rb",
|
128
167
|
"test/test_wrong_selection.sh"
|
129
168
|
]
|
130
|
-
s.homepage = "http://github.com/tgac/bioruby-polyploid-tools"
|
131
|
-
s.licenses = ["MIT"]
|
132
|
-
s.rubygems_version = "2.4.
|
133
|
-
s.summary = "Tool to work with polyploids, NGS and molecular biology"
|
169
|
+
s.homepage = "http://github.com/tgac/bioruby-polyploid-tools".freeze
|
170
|
+
s.licenses = ["MIT".freeze]
|
171
|
+
s.rubygems_version = "2.7.4".freeze
|
172
|
+
s.summary = "Tool to work with polyploids, NGS and molecular biology".freeze
|
134
173
|
|
135
174
|
if s.respond_to? :specification_version then
|
136
175
|
s.specification_version = 4
|
137
176
|
|
138
177
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
139
|
-
s.add_runtime_dependency(%q<bio
|
140
|
-
s.add_runtime_dependency(%q<bio-samtools
|
141
|
-
s.add_runtime_dependency(%q<
|
142
|
-
s.
|
143
|
-
s.
|
178
|
+
s.add_runtime_dependency(%q<bio>.freeze, [">= 1.5.1"])
|
179
|
+
s.add_runtime_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
|
180
|
+
s.add_runtime_dependency(%q<systemu>.freeze, [">= 2.5.2"])
|
181
|
+
s.add_development_dependency(%q<shoulda>.freeze, [">= 2.10"])
|
182
|
+
s.add_development_dependency(%q<test-unit>.freeze, [">= 0"])
|
183
|
+
s.add_development_dependency(%q<juwelier>.freeze, [">= 0"])
|
144
184
|
else
|
145
|
-
s.add_dependency(%q<bio
|
146
|
-
s.add_dependency(%q<bio-samtools
|
147
|
-
s.add_dependency(%q<
|
148
|
-
s.add_dependency(%q<
|
149
|
-
s.add_dependency(%q<
|
185
|
+
s.add_dependency(%q<bio>.freeze, [">= 1.5.1"])
|
186
|
+
s.add_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
|
187
|
+
s.add_dependency(%q<systemu>.freeze, [">= 2.5.2"])
|
188
|
+
s.add_dependency(%q<shoulda>.freeze, [">= 2.10"])
|
189
|
+
s.add_dependency(%q<test-unit>.freeze, [">= 0"])
|
190
|
+
s.add_dependency(%q<juwelier>.freeze, [">= 0"])
|
150
191
|
end
|
151
192
|
else
|
152
|
-
s.add_dependency(%q<bio
|
153
|
-
s.add_dependency(%q<bio-samtools
|
154
|
-
s.add_dependency(%q<
|
155
|
-
s.add_dependency(%q<
|
156
|
-
s.add_dependency(%q<
|
193
|
+
s.add_dependency(%q<bio>.freeze, [">= 1.5.1"])
|
194
|
+
s.add_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
|
195
|
+
s.add_dependency(%q<systemu>.freeze, [">= 2.5.2"])
|
196
|
+
s.add_dependency(%q<shoulda>.freeze, [">= 2.10"])
|
197
|
+
s.add_dependency(%q<test-unit>.freeze, [">= 0"])
|
198
|
+
s.add_dependency(%q<juwelier>.freeze, [">= 0"])
|
157
199
|
end
|
158
200
|
end
|
159
201
|
|
data/lib/bio/BFRTools.rb
CHANGED
@@ -22,6 +22,7 @@ module Bio::PolyploidTools
|
|
22
22
|
# puts entry
|
23
23
|
@fasta_db.fetch_sequence(entry.get_full_region)
|
24
24
|
end
|
25
|
+
|
25
26
|
#Loads all the chromosome arms in a folder.
|
26
27
|
#The current version requires that all the references end with .fa, and start with XXX_*.fa
|
27
28
|
#Where XXX is the chromosome name
|
@@ -29,16 +30,11 @@ module Bio::PolyploidTools
|
|
29
30
|
chromosomeArms = Hash.new
|
30
31
|
|
31
32
|
Dir.foreach(path_to_contigs) do |filename |
|
32
|
-
|
33
33
|
if File.fnmatch("*.fa", filename)
|
34
34
|
|
35
35
|
parsed = /^(?<arm>\d\w+)/.match(filename)
|
36
|
-
|
37
36
|
target="#{path_to_contigs}/#{filename}"
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
# fasta_file = Bio::DB::Fasta::FastaFile.new(target)
|
37
|
+
#fasta_file = Bio::DB::Fasta::FastaFile.new(target)
|
42
38
|
#fasta_file.load_fai_entries
|
43
39
|
arm = ChromosomeArm.new(parsed[:arm], target)
|
44
40
|
chromosomeArms[arm.name] = arm
|
@@ -19,15 +19,31 @@ module Bio::PolyploidTools
|
|
19
19
|
|
20
20
|
# Opens the gene-model FASTA file at +path+, asks it for its index right away,
# and remembers both the database handle and the path.
#
# Returns +path+.
def gene_models(path)
  (@gene_models_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})).index
  @gene_models_path = path
end
|
24
25
|
|
25
26
|
#Returns the sequence for a region in the gene models (exon)
|
26
27
|
# Fetches the sequence of +region+ from the gene-models FASTA database.
#
# region - an object responding to #entry, #end and #end=. Note that its end
#          is clamped IN PLACE to the length of the indexed entry.
#
# Returns the fetched sequence, truncated just before the first '>' when the
# fetch ran into the next FASTA record (a workaround for behaviour that the
# original code attributes to bio-samtools).
def gene_model_sequence(region)
  target_reg = @gene_models_db.index.region_for_entry(region.entry)
  region.end = target_reg.length if region.end > target_reg.length

  seq = @gene_models_db.fetch_sequence(region)

  # An exclusive range also handles a '>' at position 0, where the previous
  # "index - 1" arithmetic produced -1 and therefore kept the whole string.
  header_at = seq.index('>')
  seq = seq[0...header_at] if header_at
  seq
end
|
32
48
|
|
33
49
|
#Sets the reference file for the gene models
|
@@ -40,10 +56,10 @@ module Bio::PolyploidTools
|
|
40
56
|
def chromosome_sequence(region)
|
41
57
|
left_pad = 0
|
42
58
|
#TODO: Padd if it goes to the right
|
43
|
-
if(region.start <
|
59
|
+
if(region.start < 1)
|
44
60
|
left_pad = region.start * -1
|
45
61
|
left_pad += 1
|
46
|
-
region.start =
|
62
|
+
region.start = 1
|
47
63
|
end
|
48
64
|
str = "-" * left_pad << @chromosomes_db.fetch_sequence(region)
|
49
65
|
#str << "n" * (region.size - str.size + 1) if region.size > str.size
|
@@ -116,12 +132,17 @@ module Bio::PolyploidTools
|
|
116
132
|
@snp_map.each do | gene, snp_array|
|
117
133
|
snp_array.each do |snp|
|
118
134
|
#file.puts snp.primer_fasta_string
|
119
|
-
|
135
|
+
#puts "In print_fast_np_exones"
|
136
|
+
#puts snp.inspect
|
137
|
+
|
120
138
|
begin
|
121
139
|
file.puts snp.aligned_sequences_fasta
|
122
140
|
rescue Exception=>e
|
123
141
|
@missing_exons << snp.to_s
|
124
|
-
$stderr.puts e.to_s
|
142
|
+
$stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
|
143
|
+
$stderr.puts "Local position: #{snp.local_position}"
|
144
|
+
$stderr.puts "Local position: #{snp.parental_sequences.to_s}"
|
145
|
+
$stderr.puts e.backtrace
|
125
146
|
end
|
126
147
|
end
|
127
148
|
end
|
@@ -143,8 +164,10 @@ module Bio::PolyploidTools
|
|
143
164
|
end
|
144
165
|
rescue Exception=>e
|
145
166
|
@missing_exons << snp.to_s
|
167
|
+
# $stderr.puts ""
|
146
168
|
|
147
|
-
$stderr.puts e.to_s
|
169
|
+
$stderr.puts "print_primer_3_exons: #{e.to_s} : snp.to_s"
|
170
|
+
$stderr.puts e.backtrace
|
148
171
|
end
|
149
172
|
end
|
150
173
|
end
|
@@ -0,0 +1,286 @@
|
|
1
|
+
|
2
|
+
require_relative "SNP"
|
3
|
+
require 'bio-samtools'
|
4
|
+
module Bio::PolyploidTools
|
5
|
+
# Error type used by NoSNPSequence.parse for lines with an unexpected number of fields.
class SNPSequenceException < RuntimeError; end
|
7
|
+
|
8
|
+
class NoSNPSequence < SNP
|
9
|
+
|
10
|
+
attr_accessor :sequence_original
|
11
|
+
#Format:
|
12
|
+
#snp name,chromosome from contig,microarray sequence
|
13
|
+
#BS00068396_51,2AS,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
|
14
|
+
# Builds a NoSNPSequence from one comma-separated line.
#
# reg_str - "name,chromosome,sequence" or "name,sequence". A trailing newline
#           is ignored and the argument itself is left unmodified.
#
# Returns the populated NoSNPSequence. With the two-field form the chromosome
# (and therefore snp_in) is left as the constructor set it.
# Raises SNPSequenceException when the line does not have two or three fields.
def self.parse(reg_str)
  line = reg_str.chomp
  snp = NoSNPSequence.new

  arr = line.split(",")
  case arr.size
  when 3
    snp.gene, snp.chromosome, snp.sequence_original = arr
  when 2
    snp.gene, snp.sequence_original = arr
  else
    # `raise`, not `throw`: throw is for catch/throw control flow and would
    # surface as an UncaughtThrowError instead of this exception.
    raise SNPSequenceException, "Need two or three fields to parse, and got #{arr.size} in #{line}"
  end

  # The chromosome is absent in the two-field form, so only strip when present.
  snp.chromosome = snp.chromosome.strip if snp.chromosome
  snp.snp_in = snp.chromosome
  snp.parse_sequence_snp
  snp.exon_list = Hash.new()
  snp
end
|
36
|
+
|
37
|
+
# Intentionally does nothing and returns nil.
# NOTE(review): presumably this neutralises a parser inherited from SNP — confirm against SNP.rb.
def parse_snp; end
|
40
|
+
|
41
|
+
# Uses the middle of the original sequence as the marker position: stores that
# index in @position and the base found there in both @original and @snp.
def parse_sequence_snp
  midpoint = sequence_original.length / 2
  @position = midpoint
  @snp = @original = sequence_original[midpoint]
end
|
46
|
+
|
47
|
+
# Short label for this record: the gene name and the chromosome joined by a colon.
def to_s
  format("%s:%s", gene, chromosome)
end
|
50
|
+
|
51
|
+
# Sequences that will be aligned: the surrounding exon sequences, computed on
# first use and cached in @sequences_to_align afterwards.
def sequences_to_align
  @sequences_to_align ||= surrounding_exon_sequences
end
|
55
|
+
|
56
|
+
# Builds a per-column mask for the aligned row of +chromosome+.
#
# For every alignment column the resulting character is:
#   "*"        when no sequence has a base there,
#   "-"        when only one sequence has a base, or no other sequence differs
#              from the target row,
#   UPPER case when every other sequence differs from the target row and the
#              number of covering sequences whose name starts with
#              chromosome_group equals genomes_count,
#   lower case otherwise (some, but not all, sequences differ).
#
# chromosome - key of the target row in aligned_sequences. When that row is
#              missing, a row of "-" is used as the starting point.
#
# Returns the mask String, or nil when there are no aligned sequences.
def mask_aligned_chromosomal_snp(chromosome)
  return nil if aligned_sequences.values.size == 0
  names = exon_sequences.keys

  target_row = aligned_sequences[chromosome]
  masked_snps = target_row ? target_row.downcase : "-" * aligned_sequences.values[0].size
  #TODO: Make this chromosome specific, even when we have more than one alignment going to the region we want.
  expected_snps = names.size - 1

  masked_snps.size.times do |i|
    different = 0
    cov = 0
    from_group = 0
    names.each do |chr|
      row = aligned_sequences[chr]
      next unless row

      base = row[i]
      if base.nil?
        # A row shorter than the mask has nothing to compare at this column.
        # The previous guards tested a value that is always truthy, so this
        # warning never fired and the comparison below crashed on nil.
        $stderr.puts "WARN: No base for #{row} : ##{i}"
        next
      end
      next if base == "-"

      cov += 1
      from_group += 1 if chr[0] == chromosome_group
      different += 1 if chr != chromosome && masked_snps[i].upcase != base.upcase
    end

    masked_snps[i] = "-" if different == 0
    masked_snps[i] = "-" if cov == 1
    masked_snps[i] = "*" if cov == 0
    masked_snps[i] = masked_snps[i].upcase if different == expected_snps and from_group == genomes_count
  end
  masked_snps
end
|
94
|
+
|
95
|
+
# Counts the alignment columns, within flanking_size columns on either side of
# +position+, in which at least one aligned sequence has a gap ("-"). The
# window is clipped to the bounds of the row of +target_chromosome+.
#
# NOTE(review): every call also prints the inspected bases and the running
# count to stdout; this looks like leftover debugging output — confirm before
# removing it.
#
# Returns the number of columns containing a gap.
def count_deletions_around(position,target_chromosome)
  reference_row = aligned_sequences[target_chromosome]

  window_start = [position - flanking_size, 0].max
  window_end = position + flanking_size
  window_end = reference_row.size - 1 if window_end >= reference_row.size

  gapped_columns = 0
  (window_start..window_end).each do |column|
    gap_seen = false
    aligned_sequences.each_pair do |_name, row|
      gap_seen = true if row[column] == '-'
      print "#{row[column]}\t"
    end
    gapped_columns += 1 if gap_seen
    print "#{gapped_columns}\n"
  end
  gapped_columns
end
|
115
|
+
|
116
|
+
# Builds the PrimerRegion for +target_chromosome+ from its aligned row and the
# mask returned by mask_aligned_chromosomal_snp.
#
# Walks every non-gap column of the (lower-cased) target row. Columns whose
# mask character is upper case are recorded as chromosome specific, lower-case
# ones as almost chromosome specific, and the base is copied from the mask into
# the working sequence. The final sequence is the working copy without gaps.
#
# target_chromosome - key of the row in aligned_sequences (or in
#                     surrounding_exon_sequences when nothing was aligned).
# parental_chr      - not used by this implementation.
#
# Returns the populated PrimerRegion.
def primer_region(target_chromosome, parental_chr )
  chromosome_seq = aligned_sequences[target_chromosome]
  chromosome_seq = surrounding_exon_sequences[target_chromosome] if aligned_sequences.size == 0
  chromosome_seq = chromosome_seq.downcase

  mask = mask_aligned_chromosomal_snp(target_chromosome)

  pr = PrimerRegion.new
  pr.homoeologous = false
  parental = chromosome_seq.clone
  # Index of the current base once gaps are removed.
  position_in_region = 0

  chromosome_seq.size.times do |i|
    next if chromosome_seq[i] == '-'

    if mask[i] == '-'
      # The mask found no SNP here, so keep the base of the target row.
      parental[i] = chromosome_seq[i] unless Bio::NucleicAcid::is_unambiguous(parental[i])
    elsif /[[:upper:]]/.match(mask[i])
      # Strong candidate: every other sequence differs at this column.
      if parental[i] == '-'
        parental[i] = mask[i]
        pr.crhomosome_specific_intron << position_in_region
      elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
        parental[i] = mask[i]
        pr.chromosome_specific << position_in_region if count_deletions_around(1,target_chromosome) < 3
        pr.chromosome_specific_in_mask << i
      end
    elsif /[[:lower:]]/.match(mask[i])
      # Weaker candidate: it still discriminates against some sequences.
      if parental[i] == '-'
        parental[i] = mask[i]
        pr.almost_crhomosome_specific_intron << position_in_region
      elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
        parental[i] = mask[i].upcase
        pr.almost_chromosome_specific << position_in_region
        pr.almost_chromosome_specific_in_mask << i
      end
    end

    pr.position_in_mask_from_template[position_in_region] = i
    position_in_region += 1
  end

  pr.sequence=parental.gsub('-','')
  pr
end
|
172
|
+
|
173
|
+
# Formats one or two primer3 input records for a template sequence.
#
# opts - Hash with:
#        :name      - record identifier prefix,
#        :left_pos  - position forced as the left primer end,
#        :right_pos - optional position forced as the right primer end,
#        :sequence  - template sequence.
#
# With :right_pos a single record is produced; when left > right both
# coordinates are mirrored and the template is reverse complemented. Without
# :right_pos a forward and a reverse record are produced.
#
# Returns the record text, or nil when @variation_free_region is positive and
# the bases following the right position contain upper-case (variable) bases.
def return_primer_3_string_test(opts={})
  # The left end comes from :left_pos; it was previously read from
  # :right_pos, which made the orientation check dead and broke the
  # no-right-primer path.
  left = opts[:left_pos]
  right = opts[:right_pos]
  sequence = opts[:sequence]
  orientation = "forward"

  if opts[:right_pos]
    if left > right
      left = sequence.size - left - 1
      right = sequence.size - right - 1
      sequence = reverse_complement_string(sequence)
      orientation = "reverse"
    end
    if @variation_free_region.to_i > 0
      # The slice is nil when the right position is at the very end of the template.
      check_str = sequence[right+1, @variation_free_region]
      return nil if check_str && check_str != check_str.downcase
    end
  end

  records = []
  records << "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
  records << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
  records << "SEQUENCE_FORCE_RIGHT_END=#{right}\n" if opts[:right_pos]
  records << "SEQUENCE_TEMPLATE=#{sequence}\n"
  records << "=\n"

  #In case that we don't have a right primer, we do both orientations
  unless opts[:right_pos]
    sequence = opts[:sequence]
    left = sequence.size - left - 1
    orientation = "reverse"
    sequence = reverse_complement_string(sequence)
    records << "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
    records << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
    records << "SEQUENCE_TEMPLATE=#{sequence}\n"
    records << "=\n"
  end

  records.join
end
|
216
|
+
|
217
|
+
# Looks up the base at +position+ in the first aligned sequence whose name is
# not +target_chromosome+.
#
# Returns that base, or nil when the alignment holds no other sequence
# (previously the whole alignment object fell through as the return value).
def get_base_in_different_chromosome(position, target_chromosome)
  aligned_sequences.each_pair do |name, val|
    return val[position] unless target_chromosome == name
  end
  nil
end
|
224
|
+
|
225
|
+
# Produces the primer3 input records for every chromosome-specific position of
# the primer region of +target_chromosome+.
#
# Each position yields two records: "A" on the region sequence as it is, and
# "B" on a copy whose base at that position is replaced by the upper-cased
# base found in a different chromosome.
#
# Returns an Array of record strings; it is empty when the region sequence is
# shorter than primer_3_min_seq_length.
def primer_3_all_strings(target_chromosome, parental)
  region = primer_region(target_chromosome, parental )
  records = []

  template = String.new(region.sequence)
  return records if template.size < primer_3_min_seq_length

  snp_type = region.homoeologous ? "homoeologous" : "non-homoeologous"

  region.chromosome_specific.each do |pos|
    variant = String.new(region.sequence)
    alternative_base = get_base_in_different_chromosome(pos, target_chromosome)

    args = {
      :name =>"#{gene} A chromosome_specific exon #{snp_type} #{chromosome}",
      :left_pos => pos,
      :sequence=>template
    }
    records << return_primer_3_string(args)

    args[:name] = "#{gene} B chromosome_specific exon #{snp_type} #{chromosome}"
    args[:sequence] = variant
    #TODO: Find base from another chromosome
    variant[pos] = alternative_base.upcase
    records << return_primer_3_string(args)
  end

  records
end
|
264
|
+
|
265
|
+
# Returns the aligned sequences, cached in @aligned_sequences after the first call.
#
# A single input sequence is returned as is; otherwise the sequences are
# aligned by running "mafft" through Bio::MAFFT (options: --maxiterate 1000
# --localpair --quiet) and the alignment of the report is returned.
def aligned_sequences
  @aligned_sequences ||=
    if sequences_to_align.size == 1
      sequences_to_align
    else
      mafft = Bio::MAFFT.new( "mafft" , ['--maxiterate', '1000', '--localpair', '--quiet'])
      mafft.query_align(sequences_to_align).alignment
    end
end
|
280
|
+
|
281
|
+
|
282
|
+
|
283
|
+
|
284
|
+
|
285
|
+
end
|
286
|
+
end
|