bio-polyploid-tools 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +17 -0
  3. data/Gemfile +10 -7
  4. data/README.md +44 -0
  5. data/Rakefile +14 -14
  6. data/VERSION +1 -1
  7. data/bin/bfr.rb +2 -2
  8. data/bin/blast_triads.rb +166 -0
  9. data/bin/blast_triads_promoters.rb +192 -0
  10. data/bin/find_homoeologue_variations.rb +385 -0
  11. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  12. data/bin/hexaploid_primers.rb +2 -2
  13. data/bin/homokaryot_primers.rb +2 -2
  14. data/bin/mafft_triads.rb +120 -0
  15. data/bin/mafft_triads_promoters.rb +403 -0
  16. data/bin/polymarker.rb +73 -17
  17. data/bin/polymarker_capillary.rb +416 -0
  18. data/bin/snp_position_to_polymarker.rb +5 -3
  19. data/bin/snps_between_bams.rb +0 -29
  20. data/bin/vcfLineToTable.rb +56 -0
  21. data/bio-polyploid-tools.gemspec +74 -32
  22. data/lib/bio/BFRTools.rb +1 -0
  23. data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
  24. data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
  25. data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
  26. data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
  27. data/lib/bio/PolyploidTools/SNP.rb +58 -18
  28. data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
  29. data/lib/bio/db/blast.rb +112 -0
  30. data/lib/bio/db/exonerate.rb +4 -5
  31. data/lib/bio/db/primer3.rb +83 -14
  32. data/test/data/BS00068396_51_blast.tab +4 -0
  33. data/test/data/BS00068396_51_contigs.nhr +0 -0
  34. data/test/data/BS00068396_51_contigs.nin +0 -0
  35. data/test/data/BS00068396_51_contigs.nsq +0 -0
  36. data/test/data/BS00068396_51_for_polymarker.fa +1 -0
  37. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  38. data/test/data/S22380157.vcf +67 -0
  39. data/test/data/S58861868/LIB1716.bam +0 -0
  40. data/test/data/S58861868/LIB1716.sam +651 -0
  41. data/test/data/S58861868/LIB1719.bam +0 -0
  42. data/test/data/S58861868/LIB1719.sam +805 -0
  43. data/test/data/S58861868/LIB1721.bam +0 -0
  44. data/test/data/S58861868/LIB1721.sam +1790 -0
  45. data/test/data/S58861868/LIB1722.bam +0 -0
  46. data/test/data/S58861868/LIB1722.sam +1271 -0
  47. data/test/data/S58861868/S58861868.fa +16 -0
  48. data/test/data/S58861868/S58861868.fa.fai +1 -0
  49. data/test/data/S58861868/S58861868.vcf +76 -0
  50. data/test/data/S58861868/header.txt +9 -0
  51. data/test/data/S58861868/merged.bam +0 -0
  52. data/test/data/S58861868/merged_reheader.bam +0 -0
  53. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  54. data/test/data/bfr_out_test.csv +5 -5
  55. data/test/data/headerMergeed.txt +9 -0
  56. data/test/data/headerS2238015 +1 -0
  57. data/test/data/mergedLibs.bam +0 -0
  58. data/test/data/mergedLibsReheader.bam +0 -0
  59. data/test/data/mergedLibsSorted.bam +0 -0
  60. data/test/data/mergedLibsSorted.bam.bai +0 -0
  61. data/test/test_bfr.rb +26 -34
  62. data/test/test_blast.rb +47 -0
  63. data/test/test_exonearate.rb +4 -9
  64. data/test/test_snp_parsing.rb +42 -22
  65. metadata +81 -20
  66. data/Gemfile.lock +0 -67
@@ -42,7 +42,8 @@ OptionParser.new do |opts|
42
42
  opts.on("-f", "--flanking_size INT", "Flanking size around the SNP") do |o|
43
43
  options[:flanking_size] = o.to_i
44
44
  end
45
- opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line.\n\
45
+
46
+ opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line. Example: IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T\n\
46
47
  requires --reference to get the sequence using a position") do |o|
47
48
  options[:mutant_list] = o
48
49
  test_file = o
@@ -76,9 +77,10 @@ File.open(test_file) do | f |
76
77
  if region != lastRegion
77
78
  lastTemplate = fasta_reference_db.fetch_sequence(region)
78
79
  end
79
- snp.template_sequence = lastTemplate
80
+ snp.full_sequence = lastTemplate
80
81
  lastRegion = region
81
- out.puts "#{snp.gene}_#{snp.snp_id_in_seq},#{snp.chromosome},#{snp.to_polymarker_sequence(options[:flanking_size])}"
82
+
83
+ out.puts "#{snp.gene}_#{snp.snp_id_in_seq},#{snp.chromosome},#{snp.sequence_original}"
82
84
  else
83
85
  $stderr.puts "ERROR: Unable to find entry for #{snp.gene}"
84
86
  end
@@ -54,7 +54,6 @@ fasta_db.index.entries.each do | r |
54
54
 
55
55
 
56
56
  begin
57
- <<<<<<< HEAD
58
57
  reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
59
58
  reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
60
59
  cons_1 = reg_a.consensus
@@ -85,34 +84,6 @@ fasta_db.index.entries.each do | r |
85
84
  fasta_file.puts ">#{r.id}_2"
86
85
  fasta_file.puts "#{cons_2}"
87
86
 
88
- =======
89
-
90
- cons_1 = bam1.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
91
- cons_2 = bam2.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
92
- if cons_1 != cons_2
93
-
94
- snps_1 = cons_1.count_ambiguities
95
- snps_2 = cons_2.count_ambiguities
96
-
97
- snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
98
-
99
- snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
100
- snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
101
- snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
102
-
103
- hist_1[snps_per_1k_1.to_i] += 1
104
- hist_2[snps_per_1k_2.to_i] += 1
105
-
106
- table_file.print "#{r.id}\t#{region.size}\t"
107
- table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
108
- table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
109
- table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
110
- fasta_file.puts ">#{r.id}_1"
111
- fasta_file.puts "#{cons_1}"
112
- fasta_file.puts ">#{r.id}_2"
113
- fasta_file.puts "#{cons_2}"
114
- end
115
- >>>>>>> 1b60bd09fdb1b087d6cb53c643ff36e536efe4a3
116
87
  rescue Exception => e
117
88
  $stderr.puts "Unable to process #{region}: #{e.to_s}"
118
89
  end
@@ -0,0 +1,56 @@
1
+ require 'bio-samtools'
2
+ require 'optparse'
3
+
4
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
5
+ $: << File.expand_path('.')
6
+ path=File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
7
+
8
+
9
+
10
+
11
# Parses one VCF "##INFO" meta-information line, e.g.
#   ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
#
# head_line - the raw header line (a trailing newline is tolerated).
#
# Returns a Hash with the keys :id, :number, :type and :desc (all Strings),
# or nil when the line does not follow the INFO layout above. Returning nil
# instead of failing on the missing match lets callers skip malformed lines.
def parseVCFheader(head_line="")
  # ID, Number and Type cannot contain commas; only the quoted description can,
  # so the first three groups are restricted to keep the fields from bleeding
  # into each other.
  m = /##INFO=<ID=([^,]+),Number=([^,]+),Type=([^,]+),Description="(.*)">/.match(head_line)
  return nil unless m

  {:id=>m[1], :number=>m[2], :type=>m[3], :desc=>m[4]}
end
18
+
19
+
20
# Reads a VCF from ARGF (files given on the command line, or stdin) and, for
# every record, prints one "name<TAB>value<TAB>description" line per INFO field,
# a "____" separator, and then a tab-separated table with the per-sample values
# (the column names are printed once, before the first sample).

# INFO id => description, collected from the "##INFO" header lines.
header_info = {}
# Sample names taken from the "#CHROM" line. It must be declared outside the
# block: a variable first assigned inside the block is local to each iteration
# and would be nil again by the time the data lines are read.
sample_names = nil

ARGF.each_line do |line|
  if line.start_with? "##INFO"
    h = parseVCFheader(line)
    header_info[h[:id]] = h[:desc] if h
  end
  next if line.start_with? "##"

  if line.start_with? "#CHROM"
    # The first nine columns are the fixed VCF fields; the rest are samples.
    sample_names = line.split.drop(9)
    next
  end

  line.chomp!

  # NOTE(review): when no "#CHROM" line was seen sample_names is still nil;
  # Bio::DB::Vcf is presumed to fall back to its own sample naming - confirm.
  vcf = Bio::DB::Vcf.new(line, sample_names)
  vcf.info.each_pair { |name, val| puts "#{name}\t#{val}\t#{header_info[name]}" }

  puts "____"
  vcf.samples.each_with_index do |sample, i|
    # Each entry is a [name, values] pair; only the values table is printed.
    puts sample[1].keys.join("\t") if i == 0
    puts sample[1].values.join("\t")
  end
end
@@ -1,44 +1,52 @@
1
- # Generated by jeweler
1
+ # Generated by juwelier
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
3
+ # Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: bio-polyploid-tools 0.7.3 ruby lib
5
+ # stub: bio-polyploid-tools 0.8.0 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
- s.name = "bio-polyploid-tools"
9
- s.version = "0.7.3"
8
+ s.name = "bio-polyploid-tools".freeze
9
+ s.version = "0.8.0"
10
10
 
11
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
- s.require_paths = ["lib"]
13
- s.authors = ["Ricardo H. Ramirez-Gonzalez"]
14
- s.date = "2015-08-10"
15
- s.description = "Repository of tools developed in TGAC and Crop Genetics in JIC to work with polyploid wheat"
16
- s.email = "ricardo.ramirez-gonzalez@tgac.ac.uk"
17
- s.executables = ["bfr.rb", "count_variations.rb", "filter_blat_by_target_coverage.rb", "filter_exonerate_by_identity.rb", "find_best_blat_hit.rb", "find_best_exonerate.rb", "hexaploid_primers.rb", "homokaryot_primers.rb", "map_markers_to_contigs.rb", "markers_in_region.rb", "polymarker.rb", "snp_position_to_polymarker.rb", "snps_between_bams.rb"]
11
+ s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
12
+ s.require_paths = ["lib".freeze]
13
+ s.authors = ["Ricardo H. Ramirez-Gonzalez".freeze]
14
+ s.date = "2018-01-18"
15
+ s.description = "Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat".freeze
16
+ s.email = "ricardo.ramirez-gonzalez@jic.ac.uk".freeze
17
+ s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "find_homoeologue_variations.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "markers_in_region.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "vcfLineToTable.rb".freeze]
18
18
  s.extra_rdoc_files = [
19
19
  "README",
20
20
  "README.md"
21
21
  ]
22
22
  s.files = [
23
+ ".travis.yml",
23
24
  "Gemfile",
24
- "Gemfile.lock",
25
25
  "README",
26
26
  "README.md",
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "bin/bfr.rb",
30
+ "bin/blast_triads.rb",
31
+ "bin/blast_triads_promoters.rb",
30
32
  "bin/count_variations.rb",
31
33
  "bin/filter_blat_by_target_coverage.rb",
32
34
  "bin/filter_exonerate_by_identity.rb",
33
35
  "bin/find_best_blat_hit.rb",
34
36
  "bin/find_best_exonerate.rb",
37
+ "bin/find_homoeologue_variations.rb",
38
+ "bin/get_longest_hsp_blastx_triads.rb",
35
39
  "bin/hexaploid_primers.rb",
36
40
  "bin/homokaryot_primers.rb",
41
+ "bin/mafft_triads.rb",
42
+ "bin/mafft_triads_promoters.rb",
37
43
  "bin/map_markers_to_contigs.rb",
38
44
  "bin/markers_in_region.rb",
39
45
  "bin/polymarker.rb",
46
+ "bin/polymarker_capillary.rb",
40
47
  "bin/snp_position_to_polymarker.rb",
41
48
  "bin/snps_between_bams.rb",
49
+ "bin/vcfLineToTable.rb",
42
50
  "bio-polyploid-tools.gemspec",
43
51
  "conf/defaults.rb",
44
52
  "conf/primer3_config/dangle.dh",
@@ -80,21 +88,29 @@ Gem::Specification.new do |s|
80
88
  "lib/bio/PolyploidTools/ChromosomeArm.rb",
81
89
  "lib/bio/PolyploidTools/ExonContainer.rb",
82
90
  "lib/bio/PolyploidTools/Marker.rb",
91
+ "lib/bio/PolyploidTools/NoSNPSequence.rb",
83
92
  "lib/bio/PolyploidTools/PrimerRegion.rb",
84
93
  "lib/bio/PolyploidTools/SNP.rb",
85
94
  "lib/bio/PolyploidTools/SNPMutant.rb",
86
95
  "lib/bio/PolyploidTools/SNPSequence.rb",
96
+ "lib/bio/db/blast.rb",
87
97
  "lib/bio/db/exonerate.rb",
88
98
  "lib/bio/db/primer3.rb",
89
99
  "lib/bioruby-polyploid-tools.rb",
90
100
  "test/data/BS00068396_51.fa",
101
+ "test/data/BS00068396_51_blast.tab",
91
102
  "test/data/BS00068396_51_contigs.aln",
92
103
  "test/data/BS00068396_51_contigs.dnd",
93
104
  "test/data/BS00068396_51_contigs.fa",
105
+ "test/data/BS00068396_51_contigs.nhr",
106
+ "test/data/BS00068396_51_contigs.nin",
107
+ "test/data/BS00068396_51_contigs.nsq",
94
108
  "test/data/BS00068396_51_exonerate.tab",
109
+ "test/data/BS00068396_51_for_polymarker.fa",
95
110
  "test/data/BS00068396_51_genes.txt",
96
111
  "test/data/IWGSC_CSS_1AL_scaff_1455974.fa",
97
112
  "test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa",
113
+ "test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai",
98
114
  "test/data/LIB1716.bam",
99
115
  "test/data/LIB1716.bam.bai",
100
116
  "test/data/LIB1719.bam",
@@ -109,9 +125,31 @@ Gem::Specification.new do |s|
109
125
  "test/data/PST130_reverse_primer.csv",
110
126
  "test/data/S22380157.fa",
111
127
  "test/data/S22380157.fa.fai",
128
+ "test/data/S22380157.vcf",
129
+ "test/data/S58861868/LIB1716.bam",
130
+ "test/data/S58861868/LIB1716.sam",
131
+ "test/data/S58861868/LIB1719.bam",
132
+ "test/data/S58861868/LIB1719.sam",
133
+ "test/data/S58861868/LIB1721.bam",
134
+ "test/data/S58861868/LIB1721.sam",
135
+ "test/data/S58861868/LIB1722.bam",
136
+ "test/data/S58861868/LIB1722.sam",
137
+ "test/data/S58861868/S58861868.fa",
138
+ "test/data/S58861868/S58861868.fa.fai",
139
+ "test/data/S58861868/S58861868.vcf",
140
+ "test/data/S58861868/header.txt",
141
+ "test/data/S58861868/merged.bam",
142
+ "test/data/S58861868/merged_reheader.bam",
143
+ "test/data/S58861868/merged_reheader.bam.bai",
112
144
  "test/data/Test3Aspecific.csv",
113
145
  "test/data/Test3Aspecific_contigs.fa",
114
146
  "test/data/bfr_out_test.csv",
147
+ "test/data/headerMergeed.txt",
148
+ "test/data/headerS2238015",
149
+ "test/data/mergedLibs.bam",
150
+ "test/data/mergedLibsReheader.bam",
151
+ "test/data/mergedLibsSorted.bam",
152
+ "test/data/mergedLibsSorted.bam.bai",
115
153
  "test/data/patological_cases5D.csv",
116
154
  "test/data/primer_3_input_header_test",
117
155
  "test/data/short_primer_design_test.csv",
@@ -122,38 +160,42 @@ Gem::Specification.new do |s|
122
160
  "test/data/test_primer3_error.csv",
123
161
  "test/data/test_primer3_error_contigs.fa",
124
162
  "test/test_bfr.rb",
163
+ "test/test_blast.rb",
125
164
  "test/test_exon_container.rb",
126
165
  "test/test_exonearate.rb",
127
166
  "test/test_snp_parsing.rb",
128
167
  "test/test_wrong_selection.sh"
129
168
  ]
130
- s.homepage = "http://github.com/tgac/bioruby-polyploid-tools"
131
- s.licenses = ["MIT"]
132
- s.rubygems_version = "2.4.7"
133
- s.summary = "Tool to work with polyploids, NGS and molecular biology"
169
+ s.homepage = "http://github.com/tgac/bioruby-polyploid-tools".freeze
170
+ s.licenses = ["MIT".freeze]
171
+ s.rubygems_version = "2.7.4".freeze
172
+ s.summary = "Tool to work with polyploids, NGS and molecular biology".freeze
134
173
 
135
174
  if s.respond_to? :specification_version then
136
175
  s.specification_version = 4
137
176
 
138
177
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
139
- s.add_runtime_dependency(%q<bio>, [">= 1.4.3"])
140
- s.add_runtime_dependency(%q<bio-samtools>, [">= 2.0.4"])
141
- s.add_runtime_dependency(%q<rake>, [">= 0"])
142
- s.add_runtime_dependency(%q<jeweler>, [">= 0"])
143
- s.add_runtime_dependency(%q<systemu>, [">= 2.5.2"])
178
+ s.add_runtime_dependency(%q<bio>.freeze, [">= 1.5.1"])
179
+ s.add_runtime_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
180
+ s.add_runtime_dependency(%q<systemu>.freeze, [">= 2.5.2"])
181
+ s.add_development_dependency(%q<shoulda>.freeze, [">= 2.10"])
182
+ s.add_development_dependency(%q<test-unit>.freeze, [">= 0"])
183
+ s.add_development_dependency(%q<juwelier>.freeze, [">= 0"])
144
184
  else
145
- s.add_dependency(%q<bio>, [">= 1.4.3"])
146
- s.add_dependency(%q<bio-samtools>, [">= 2.0.4"])
147
- s.add_dependency(%q<rake>, [">= 0"])
148
- s.add_dependency(%q<jeweler>, [">= 0"])
149
- s.add_dependency(%q<systemu>, [">= 2.5.2"])
185
+ s.add_dependency(%q<bio>.freeze, [">= 1.5.1"])
186
+ s.add_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
187
+ s.add_dependency(%q<systemu>.freeze, [">= 2.5.2"])
188
+ s.add_dependency(%q<shoulda>.freeze, [">= 2.10"])
189
+ s.add_dependency(%q<test-unit>.freeze, [">= 0"])
190
+ s.add_dependency(%q<juwelier>.freeze, [">= 0"])
150
191
  end
151
192
  else
152
- s.add_dependency(%q<bio>, [">= 1.4.3"])
153
- s.add_dependency(%q<bio-samtools>, [">= 2.0.4"])
154
- s.add_dependency(%q<rake>, [">= 0"])
155
- s.add_dependency(%q<jeweler>, [">= 0"])
156
- s.add_dependency(%q<systemu>, [">= 2.5.2"])
193
+ s.add_dependency(%q<bio>.freeze, [">= 1.5.1"])
194
+ s.add_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
195
+ s.add_dependency(%q<systemu>.freeze, [">= 2.5.2"])
196
+ s.add_dependency(%q<shoulda>.freeze, [">= 2.10"])
197
+ s.add_dependency(%q<test-unit>.freeze, [">= 0"])
198
+ s.add_dependency(%q<juwelier>.freeze, [">= 0"])
157
199
  end
158
200
  end
159
201
 
@@ -114,6 +114,7 @@ module Bio::BFRTools
114
114
  self.entry = reg.entry
115
115
  self.start = reg.start
116
116
  self.end = reg.end
117
+ @BFRs = nil
117
118
  opts[:region] = reg
118
119
  @container = opts[:container]
119
120
 
@@ -22,6 +22,7 @@ module Bio::PolyploidTools
22
22
  # puts entry
23
23
  @fasta_db.fetch_sequence(entry.get_full_region)
24
24
  end
25
+
25
26
  #Loads all the chromosome arms in a folder.
26
27
  #The current version requires that all the references end with .fa, and start with XXX_*.fa
27
28
  #Where XXX is the chromosome name
@@ -29,16 +30,11 @@ module Bio::PolyploidTools
29
30
  chromosomeArms = Hash.new
30
31
 
31
32
  Dir.foreach(path_to_contigs) do |filename |
32
-
33
33
  if File.fnmatch("*.fa", filename)
34
34
 
35
35
  parsed = /^(?<arm>\d\w+)/.match(filename)
36
-
37
36
  target="#{path_to_contigs}/#{filename}"
38
-
39
-
40
-
41
- # fasta_file = Bio::DB::Fasta::FastaFile.new(target)
37
+ #fasta_file = Bio::DB::Fasta::FastaFile.new(target)
42
38
  #fasta_file.load_fai_entries
43
39
  arm = ChromosomeArm.new(parsed[:arm], target)
44
40
  chromosomeArms[arm.name] = arm
@@ -19,15 +19,31 @@ module Bio::PolyploidTools
19
19
 
20
20
  def gene_models(path)
21
21
  @gene_models_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
22
+ @gene_models_db.index
22
23
  @gene_models_path = path
23
24
  end
24
25
 
25
26
  #Returns the sequence for a region in the gene models (exon)
26
27
  def gene_model_sequence(region)
27
- #puts region
28
- seq=@gene_models_db.fetch_sequence(region)
29
-
28
+ #puts "Region: "
29
+ #puts region.inspect
30
+ target_reg = @gene_models_db.index.region_for_entry(region.entry)
31
+ #puts target_reg.inspect
32
+ region.end = target_reg.length if region.end > target_reg.length
33
+ #entries[region.entry]
30
34
 
35
+ seq=@gene_models_db.fetch_sequence(region)
36
+ #puts "sequence: "
37
+ #This is a patch that we need to fix in biosamtools:
38
+ #puts seq
39
+ index = seq.index('>')
40
+ if(index )
41
+ index -= 1
42
+ #puts "Index: #{index}"
43
+ seq = seq.slice(0..index)
44
+ end
45
+ #puts seq
46
+ seq
31
47
  end
32
48
 
33
49
  #Sets the reference file for the gene models
@@ -40,10 +56,10 @@ module Bio::PolyploidTools
40
56
  def chromosome_sequence(region)
41
57
  left_pad = 0
42
58
  #TODO: Padd if it goes to the right
43
- if(region.start < 0)
59
+ if(region.start < 1)
44
60
  left_pad = region.start * -1
45
61
  left_pad += 1
46
- region.start = 0
62
+ region.start = 1
47
63
  end
48
64
  str = "-" * left_pad << @chromosomes_db.fetch_sequence(region)
49
65
  #str << "n" * (region.size - str.size + 1) if region.size > str.size
@@ -116,12 +132,17 @@ module Bio::PolyploidTools
116
132
  @snp_map.each do | gene, snp_array|
117
133
  snp_array.each do |snp|
118
134
  #file.puts snp.primer_fasta_string
119
-
135
+ #puts "In print_fast_np_exones"
136
+ #puts snp.inspect
137
+
120
138
  begin
121
139
  file.puts snp.aligned_sequences_fasta
122
140
  rescue Exception=>e
123
141
  @missing_exons << snp.to_s
124
- $stderr.puts e.to_s
142
+ $stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
143
+ $stderr.puts "Local position: #{snp.local_position}"
144
+ $stderr.puts "Local position: #{snp.parental_sequences.to_s}"
145
+ $stderr.puts e.backtrace
125
146
  end
126
147
  end
127
148
  end
@@ -143,8 +164,10 @@ module Bio::PolyploidTools
143
164
  end
144
165
  rescue Exception=>e
145
166
  @missing_exons << snp.to_s
167
+ # $stderr.puts ""
146
168
 
147
- $stderr.puts e.to_s
169
+ $stderr.puts "print_primer_3_exons: #{e.to_s} : snp.to_s"
170
+ $stderr.puts e.backtrace
148
171
  end
149
172
  end
150
173
  end
@@ -0,0 +1,286 @@
1
+
2
+ require_relative "SNP"
3
+ require 'bio-samtools'
4
module Bio::PolyploidTools
  # Raised when an input line cannot be parsed into a sequence record.
  class SNPSequenceException < RuntimeError
  end

  # A marker described only by its sequence, without an explicit SNP: the base
  # in the middle of the sequence is used as the reference position and no
  # alternative allele is recorded (see #parse_sequence_snp).
  class NoSNPSequence < SNP

    attr_accessor :sequence_original

    # Builds a NoSNPSequence from one comma separated line.
    #
    # Format:
    #   snp name,chromosome from contig,sequence
    # or, without the chromosome:
    #   snp name,sequence
    #
    # reg_str - the input line; it is chomped in place.
    #
    # Returns the new NoSNPSequence.
    # Raises SNPSequenceException unless the line has two or three fields.
    def self.parse(reg_str)
      reg_str.chomp!
      snp = NoSNPSequence.new

      arr = reg_str.split(",")

      if arr.size == 3
        snp.gene, snp.chromosome, snp.sequence_original = arr
      elsif arr.size == 2
        snp.gene, snp.sequence_original = arr
        # NOTE(review): an empty chromosome is assumed to be the right default
        # for the two-field form; without it the strip below fails on nil.
        snp.chromosome = ""
      else
        # raise (not throw): throw is for catch/throw control flow and would
        # surface as an UncaughtThrowError instead of this exception.
        raise SNPSequenceException, "Need two or three fields to parse, and got #{arr.size} in #{reg_str}"
      end
      snp.chromosome = snp.chromosome.strip
      snp.snp_in = snp.chromosome
      snp.parse_sequence_snp
      snp.exon_list = Hash.new()
      snp
    end

    # Intentionally empty: there is no SNP notation to parse for this class.
    def parse_snp

    end

    # Uses the middle base of the sequence as the position of interest and
    # records the same base as both the original and the "snp" allele.
    def parse_sequence_snp
      @position = sequence_original.length / 2
      @original = sequence_original[@position]
      @snp = @original
    end

    def to_s
      "#{gene}:#{chromosome}"
    end

    # Memoised set of sequences that are handed to the aligner.
    def sequences_to_align
      @sequences_to_align ||= surrounding_exon_sequences
    end

    # Builds a mask, one character per alignment column, for the sequence
    # aligned under +chromosome+:
    #   "-"        the column has no difference, or only one sequence covers it
    #   "*"        no sequence covers the column
    #   lower case the base differs from at least one other sequence
    #   upper case the base differs from every other sequence and the column
    #              is covered by genomes_count sequences of chromosome_group
    #
    # Returns the mask String, or nil when there are no aligned sequences.
    def mask_aligned_chromosomal_snp(chromosome)
      return nil if aligned_sequences.values.size == 0
      names = exon_sequences.keys

      target = aligned_sequences[chromosome]
      masked_snps = target ? target.downcase : "-" * aligned_sequences.values[0].size
      expected_snps = names.size - 1
      #TODO: Make this chromosome specific, even when we have more than one alignment going to the region we want.
      (0...masked_snps.size).each do |i|
        different = 0
        cov = 0
        from_group = 0
        names.each do |chr|
          seq = aligned_sequences[chr]
          next unless seq
          base = seq[i]
          if base.nil?
            # A sequence shorter than the mask cannot be compared at this
            # column; report it and treat the column as uncovered.
            $stderr.puts "WARN: No base for #{seq} : ##{i}"
            next
          end
          next if base == "-"

          cov += 1
          from_group += 1 if chr[0] == chromosome_group
          different += 1 if chr != chromosome && masked_snps[i].upcase != base.upcase
        end
        # The order of these assignments matters: later rules override earlier ones.
        masked_snps[i] = "-" if different == 0
        masked_snps[i] = "-" if cov == 1
        masked_snps[i] = "*" if cov == 0
        masked_snps[i] = masked_snps[i].upcase if different == expected_snps && from_group == genomes_count
      end
      masked_snps
    end

    # Counts the alignment columns within flanking_size of +position+ in which
    # at least one aligned sequence has a gap ("-"). The window is clipped to
    # the length of the sequence aligned under +target_chromosome+.
    def count_deletions_around(position,target_chromosome)
      first_aligned = aligned_sequences[target_chromosome]

      pos_start = position - flanking_size
      pos_end = position + flanking_size
      pos_start = 0 if pos_start < 0
      pos_end = first_aligned.size - 1 if pos_end >= first_aligned.size

      count = 0
      (pos_start..pos_end).each do |i|
        has_del = false
        aligned_sequences.each_pair do |_name, val|
          has_del = true if val[i] == '-'
        end
        count += 1 if has_del
      end
      count
    end

    # Builds the PrimerRegion for +target_chromosome+: the ungapped sequence
    # plus the positions that the mask marks as (almost) chromosome specific.
    #
    # target_chromosome - name of the sequence to design on.
    # parental_chr      - not used by this implementation.
    def primer_region(target_chromosome, parental_chr )
      chromosome_seq = aligned_sequences[target_chromosome]
      if aligned_sequences.size == 0
        chromosome_seq = surrounding_exon_sequences[target_chromosome]
      end
      chromosome_seq = chromosome_seq.downcase

      # Without aligned sequences there is no mask; an all "-" mask means
      # "no variation detected" and keeps the loop below from failing on nil.
      mask = mask_aligned_chromosomal_snp(target_chromosome) || "-" * chromosome_seq.size

      pr = PrimerRegion.new
      pr.homoeologous = false
      position_in_region = 0
      parental = chromosome_seq.clone
      (0...chromosome_seq.size).each do |i|
        next if chromosome_seq[i] == '-'

        case
        when mask[i] == '-'
          #When the mask doesnt detect a SNP, so we take the parental
          parental[i] = chromosome_seq[i] unless Bio::NucleicAcid::is_unambiguous(parental[i])
        when /[[:upper:]]/.match(mask[i])
          #This is a good candidate for marking a SNP
          if parental[i] == '-'
            parental[i] = mask[i]
            pr.crhomosome_specific_intron << position_in_region
          elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
            parental[i] = mask[i]
            # NOTE(review): the window is always centred on column 1, not on
            # the current column i - looks unintended, confirm before changing.
            pr.chromosome_specific << position_in_region if count_deletions_around(1,target_chromosome) < 3
            pr.chromosome_specific_in_mask << i
          end
        when /[[:lower:]]/.match(mask[i])
          #This is not that good a candidate, but still gives specificity
          if parental[i] == '-'
            parental[i] = mask[i]
            pr.almost_crhomosome_specific_intron << position_in_region
          elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
            parental[i] = mask[i].upcase
            pr.almost_chromosome_specific << position_in_region
            pr.almost_chromosome_specific_in_mask << i
          end
        end
        pr.position_in_mask_from_template[position_in_region] = i
        position_in_region += 1
      end

      pr.sequence=parental.gsub('-','')
      pr
    end

    # Builds the primer3 input records for one candidate.
    #
    # opts - Hash with :name, :sequence, :left_pos and, optionally, :right_pos.
    #
    # With :right_pos a single record is produced (reverse-complemented when
    # the left position lies after the right one). Without it, one record per
    # orientation is produced. Returns nil when the variation-free region
    # after the right primer contains an upper case (variable) base.
    def return_primer_3_string_test(opts={})

      left = opts[:left_pos]
      right = opts[:right_pos]
      sequence = opts[:sequence]
      orientation = "forward"
      if opts[:right_pos]
        if left > right
          left = sequence.size - left - 1
          right = sequence.size - right - 1
          sequence = reverse_complement_string(sequence)
          orientation = "reverse"
        end
        # to_i: an unset @variation_free_region behaves as 0 (check disabled).
        if @variation_free_region.to_i > 0
          check_str = sequence[right+1, @variation_free_region]
          return nil if check_str != check_str.downcase
        end
      end

      str = "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
      str << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
      str << "SEQUENCE_FORCE_RIGHT_END=#{right}\n" if opts[:right_pos]
      str << "SEQUENCE_TEMPLATE=#{sequence}\n"
      str << "=\n"

      #In case that we don't have a right primer, we do both orientations
      unless opts[:right_pos]
        sequence = opts[:sequence]
        left = sequence.size - left - 1
        orientation = "reverse"
        sequence = reverse_complement_string(sequence)
        str << "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
        str << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
        str << "SEQUENCE_TEMPLATE=#{sequence}\n"
        str << "=\n"
      end

      str
    end

    # Returns the base at +position+ from the first aligned sequence whose name
    # is not +target_chromosome+, or nil when there is no such sequence.
    def get_base_in_different_chromosome(position, target_chromosome)
      aligned_sequences.each_pair do |name, val|
        next if target_chromosome == name
        return val[position]
      end
      nil
    end

    # Builds the primer3 records for every chromosome specific position of the
    # primer region: an "A" record with the target sequence and a "B" record in
    # which the position carries the base found in another chromosome.
    #
    # Returns an Array of primer3 input strings; it is empty when the region is
    # shorter than primer_3_min_seq_length.
    def primer_3_all_strings(target_chromosome, parental)
      pr = primer_region(target_chromosome, parental )
      primer_3_propertes = Array.new

      seq_original = String.new(pr.sequence)
      return primer_3_propertes if seq_original.size < primer_3_min_seq_length

      snp_type = pr.homoeologous ? "homoeologous" : "non-homoeologous"

      pr.chromosome_specific.each do |pos|
        seq_snp = String.new(pr.sequence)
        other_chromosome_base = get_base_in_different_chromosome(pos, target_chromosome)

        args = {
          :name =>"#{gene} A chromosome_specific exon #{snp_type} #{chromosome}",
          :left_pos => pos,
          :sequence=>seq_original
        }

        primer_3_propertes << return_primer_3_string(args)

        # The "B" record needs a base from another chromosome; skip it when
        # no other sequence provides one.
        next unless other_chromosome_base

        args[:name] = "#{gene} B chromosome_specific exon #{snp_type} #{chromosome}"
        args[:sequence] = seq_snp
        #TODO: Find base from another chromosome
        seq_snp[pos] = other_chromosome_base.upcase

        primer_3_propertes << return_primer_3_string(args)
      end

      primer_3_propertes
    end

    # Memoised alignment of sequences_to_align. A single sequence is returned
    # as is; otherwise the sequences are aligned with the "mafft" executable.
    def aligned_sequences

      return @aligned_sequences if @aligned_sequences
      if sequences_to_align.size == 1
        @aligned_sequences = sequences_to_align
        return @aligned_sequences
      end
      options = ['--maxiterate', '1000', '--localpair', '--quiet']
      mafft = Bio::MAFFT.new( "mafft" , options)
      report = mafft.query_align(sequences_to_align)
      @aligned_sequences = report.alignment
      @aligned_sequences
    end

  end
end