bio-polyploid-tools 0.7.3 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +17 -0
  3. data/Gemfile +10 -7
  4. data/README.md +44 -0
  5. data/Rakefile +14 -14
  6. data/VERSION +1 -1
  7. data/bin/bfr.rb +2 -2
  8. data/bin/blast_triads.rb +166 -0
  9. data/bin/blast_triads_promoters.rb +192 -0
  10. data/bin/find_homoeologue_variations.rb +385 -0
  11. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  12. data/bin/hexaploid_primers.rb +2 -2
  13. data/bin/homokaryot_primers.rb +2 -2
  14. data/bin/mafft_triads.rb +120 -0
  15. data/bin/mafft_triads_promoters.rb +403 -0
  16. data/bin/polymarker.rb +73 -17
  17. data/bin/polymarker_capillary.rb +416 -0
  18. data/bin/snp_position_to_polymarker.rb +5 -3
  19. data/bin/snps_between_bams.rb +0 -29
  20. data/bin/vcfLineToTable.rb +56 -0
  21. data/bio-polyploid-tools.gemspec +74 -32
  22. data/lib/bio/BFRTools.rb +1 -0
  23. data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
  24. data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
  25. data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
  26. data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
  27. data/lib/bio/PolyploidTools/SNP.rb +58 -18
  28. data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
  29. data/lib/bio/db/blast.rb +112 -0
  30. data/lib/bio/db/exonerate.rb +4 -5
  31. data/lib/bio/db/primer3.rb +83 -14
  32. data/test/data/BS00068396_51_blast.tab +4 -0
  33. data/test/data/BS00068396_51_contigs.nhr +0 -0
  34. data/test/data/BS00068396_51_contigs.nin +0 -0
  35. data/test/data/BS00068396_51_contigs.nsq +0 -0
  36. data/test/data/BS00068396_51_for_polymarker.fa +1 -0
  37. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  38. data/test/data/S22380157.vcf +67 -0
  39. data/test/data/S58861868/LIB1716.bam +0 -0
  40. data/test/data/S58861868/LIB1716.sam +651 -0
  41. data/test/data/S58861868/LIB1719.bam +0 -0
  42. data/test/data/S58861868/LIB1719.sam +805 -0
  43. data/test/data/S58861868/LIB1721.bam +0 -0
  44. data/test/data/S58861868/LIB1721.sam +1790 -0
  45. data/test/data/S58861868/LIB1722.bam +0 -0
  46. data/test/data/S58861868/LIB1722.sam +1271 -0
  47. data/test/data/S58861868/S58861868.fa +16 -0
  48. data/test/data/S58861868/S58861868.fa.fai +1 -0
  49. data/test/data/S58861868/S58861868.vcf +76 -0
  50. data/test/data/S58861868/header.txt +9 -0
  51. data/test/data/S58861868/merged.bam +0 -0
  52. data/test/data/S58861868/merged_reheader.bam +0 -0
  53. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  54. data/test/data/bfr_out_test.csv +5 -5
  55. data/test/data/headerMergeed.txt +9 -0
  56. data/test/data/headerS2238015 +1 -0
  57. data/test/data/mergedLibs.bam +0 -0
  58. data/test/data/mergedLibsReheader.bam +0 -0
  59. data/test/data/mergedLibsSorted.bam +0 -0
  60. data/test/data/mergedLibsSorted.bam.bai +0 -0
  61. data/test/test_bfr.rb +26 -34
  62. data/test/test_blast.rb +47 -0
  63. data/test/test_exonearate.rb +4 -9
  64. data/test/test_snp_parsing.rb +42 -22
  65. metadata +81 -20
  66. data/Gemfile.lock +0 -67
@@ -42,7 +42,8 @@ OptionParser.new do |opts|
42
42
  opts.on("-f", "--flanking_size INT", "Flanking size around the SNP") do |o|
43
43
  options[:flanking_size] = o.to_i
44
44
  end
45
- opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line.\n\
45
+
46
+ opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line. Example: IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T\n\
46
47
  requires --reference to get the sequence using a position") do |o|
47
48
  options[:mutant_list] = o
48
49
  test_file = o
@@ -76,9 +77,10 @@ File.open(test_file) do | f |
76
77
  if region != lastRegion
77
78
  lastTemplate = fasta_reference_db.fetch_sequence(region)
78
79
  end
79
- snp.template_sequence = lastTemplate
80
+ snp.full_sequence = lastTemplate
80
81
  lastRegion = region
81
- out.puts "#{snp.gene}_#{snp.snp_id_in_seq},#{snp.chromosome},#{snp.to_polymarker_sequence(options[:flanking_size])}"
82
+
83
+ out.puts "#{snp.gene}_#{snp.snp_id_in_seq},#{snp.chromosome},#{snp.sequence_original}"
82
84
  else
83
85
  $stderr.puts "ERROR: Unable to find entry for #{snp.gene}"
84
86
  end
@@ -54,7 +54,6 @@ fasta_db.index.entries.each do | r |
54
54
 
55
55
 
56
56
  begin
57
- <<<<<<< HEAD
58
57
  reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
59
58
  reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
60
59
  cons_1 = reg_a.consensus
@@ -85,34 +84,6 @@ fasta_db.index.entries.each do | r |
85
84
  fasta_file.puts ">#{r.id}_2"
86
85
  fasta_file.puts "#{cons_2}"
87
86
 
88
- =======
89
-
90
- cons_1 = bam1.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
91
- cons_2 = bam2.consensus_with_ambiguities({:region=>region, :case=>true, :min_cov=>min_cov})
92
- if cons_1 != cons_2
93
-
94
- snps_1 = cons_1.count_ambiguities
95
- snps_2 = cons_2.count_ambiguities
96
-
97
- snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
98
-
99
- snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
100
- snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
101
- snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
102
-
103
- hist_1[snps_per_1k_1.to_i] += 1
104
- hist_2[snps_per_1k_2.to_i] += 1
105
-
106
- table_file.print "#{r.id}\t#{region.size}\t"
107
- table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
108
- table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
109
- table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
110
- fasta_file.puts ">#{r.id}_1"
111
- fasta_file.puts "#{cons_1}"
112
- fasta_file.puts ">#{r.id}_2"
113
- fasta_file.puts "#{cons_2}"
114
- end
115
- >>>>>>> 1b60bd09fdb1b087d6cb53c643ff36e536efe4a3
116
87
  rescue Exception => e
117
88
  $stderr.puts "Unable to process #{region}: #{e.to_s}"
118
89
  end
@@ -0,0 +1,56 @@
1
+ require 'bio-samtools'
2
+ require 'optparse'
3
+
4
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
5
+ $: << File.expand_path('.')
6
+ path=File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
7
+
8
+
9
+
10
+
11
+ def parseVCFheader(head_line="")
12
+ ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
13
+
14
+ m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(head_line)
15
+ {:id=>m[1],:number=>m[2],:type=>m[3],:desc=>m[4]}
16
+
17
+ end
18
+
19
+
20
+ header_info = Hash.new
21
+ ARGF.each_line do |line|
22
+ h = nil
23
+ h = parseVCFheader(line) if line.start_with? "##INFO"
24
+
25
+ header_info[h[:id]] = h[:desc] if h
26
+ #puts header_info.inspect
27
+ next if line.start_with? "##"
28
+ if line.start_with? "#CHROM"
29
+ arr = line.split
30
+ arr = arr.drop(9)
31
+ arr2 = arr.map { |s| [s.clone().prepend('Cov'), s.clone().prepend('Hap') ]}
32
+ #header += arr2.join("\t")
33
+ #puts header
34
+ next
35
+ end
36
+
37
+ line.chomp!
38
+
39
+ vcf = Bio::DB::Vcf.new(line, arr)
40
+ # puts arr.join("\t") if vcf.info["TYPE"] == "snp"
41
+ # puts vcf.inspect
42
+ #puts vcf.pos.inspect
43
+ #next if vcf.info["AO"].to_i != 1
44
+ vcf.info.each_pair { |name, val| puts "#{name}\t#{val}\t#{header_info[name]}" }
45
+
46
+ arr2 = Array.new
47
+ puts "____"
48
+ i = 0
49
+ vcf.samples.each do |sample|
50
+ #puts sample.inspect
51
+ puts sample[1].keys.join("\t") if i == 0
52
+ puts sample[1].values.join("\t")
53
+ i+=1
54
+ end
55
+
56
+ end
@@ -1,44 +1,52 @@
1
- # Generated by jeweler
1
+ # Generated by juwelier
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
3
+ # Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: bio-polyploid-tools 0.7.3 ruby lib
5
+ # stub: bio-polyploid-tools 0.8.0 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
- s.name = "bio-polyploid-tools"
9
- s.version = "0.7.3"
8
+ s.name = "bio-polyploid-tools".freeze
9
+ s.version = "0.8.0"
10
10
 
11
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
- s.require_paths = ["lib"]
13
- s.authors = ["Ricardo H. Ramirez-Gonzalez"]
14
- s.date = "2015-08-10"
15
- s.description = "Repository of tools developed in TGAC and Crop Genetics in JIC to work with polyploid wheat"
16
- s.email = "ricardo.ramirez-gonzalez@tgac.ac.uk"
17
- s.executables = ["bfr.rb", "count_variations.rb", "filter_blat_by_target_coverage.rb", "filter_exonerate_by_identity.rb", "find_best_blat_hit.rb", "find_best_exonerate.rb", "hexaploid_primers.rb", "homokaryot_primers.rb", "map_markers_to_contigs.rb", "markers_in_region.rb", "polymarker.rb", "snp_position_to_polymarker.rb", "snps_between_bams.rb"]
11
+ s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
12
+ s.require_paths = ["lib".freeze]
13
+ s.authors = ["Ricardo H. Ramirez-Gonzalez".freeze]
14
+ s.date = "2018-01-18"
15
+ s.description = "Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat".freeze
16
+ s.email = "ricardo.ramirez-gonzalez@jic.ac.uk".freeze
17
+ s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "find_homoeologue_variations.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "markers_in_region.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "vcfLineToTable.rb".freeze]
18
18
  s.extra_rdoc_files = [
19
19
  "README",
20
20
  "README.md"
21
21
  ]
22
22
  s.files = [
23
+ ".travis.yml",
23
24
  "Gemfile",
24
- "Gemfile.lock",
25
25
  "README",
26
26
  "README.md",
27
27
  "Rakefile",
28
28
  "VERSION",
29
29
  "bin/bfr.rb",
30
+ "bin/blast_triads.rb",
31
+ "bin/blast_triads_promoters.rb",
30
32
  "bin/count_variations.rb",
31
33
  "bin/filter_blat_by_target_coverage.rb",
32
34
  "bin/filter_exonerate_by_identity.rb",
33
35
  "bin/find_best_blat_hit.rb",
34
36
  "bin/find_best_exonerate.rb",
37
+ "bin/find_homoeologue_variations.rb",
38
+ "bin/get_longest_hsp_blastx_triads.rb",
35
39
  "bin/hexaploid_primers.rb",
36
40
  "bin/homokaryot_primers.rb",
41
+ "bin/mafft_triads.rb",
42
+ "bin/mafft_triads_promoters.rb",
37
43
  "bin/map_markers_to_contigs.rb",
38
44
  "bin/markers_in_region.rb",
39
45
  "bin/polymarker.rb",
46
+ "bin/polymarker_capillary.rb",
40
47
  "bin/snp_position_to_polymarker.rb",
41
48
  "bin/snps_between_bams.rb",
49
+ "bin/vcfLineToTable.rb",
42
50
  "bio-polyploid-tools.gemspec",
43
51
  "conf/defaults.rb",
44
52
  "conf/primer3_config/dangle.dh",
@@ -80,21 +88,29 @@ Gem::Specification.new do |s|
80
88
  "lib/bio/PolyploidTools/ChromosomeArm.rb",
81
89
  "lib/bio/PolyploidTools/ExonContainer.rb",
82
90
  "lib/bio/PolyploidTools/Marker.rb",
91
+ "lib/bio/PolyploidTools/NoSNPSequence.rb",
83
92
  "lib/bio/PolyploidTools/PrimerRegion.rb",
84
93
  "lib/bio/PolyploidTools/SNP.rb",
85
94
  "lib/bio/PolyploidTools/SNPMutant.rb",
86
95
  "lib/bio/PolyploidTools/SNPSequence.rb",
96
+ "lib/bio/db/blast.rb",
87
97
  "lib/bio/db/exonerate.rb",
88
98
  "lib/bio/db/primer3.rb",
89
99
  "lib/bioruby-polyploid-tools.rb",
90
100
  "test/data/BS00068396_51.fa",
101
+ "test/data/BS00068396_51_blast.tab",
91
102
  "test/data/BS00068396_51_contigs.aln",
92
103
  "test/data/BS00068396_51_contigs.dnd",
93
104
  "test/data/BS00068396_51_contigs.fa",
105
+ "test/data/BS00068396_51_contigs.nhr",
106
+ "test/data/BS00068396_51_contigs.nin",
107
+ "test/data/BS00068396_51_contigs.nsq",
94
108
  "test/data/BS00068396_51_exonerate.tab",
109
+ "test/data/BS00068396_51_for_polymarker.fa",
95
110
  "test/data/BS00068396_51_genes.txt",
96
111
  "test/data/IWGSC_CSS_1AL_scaff_1455974.fa",
97
112
  "test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa",
113
+ "test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai",
98
114
  "test/data/LIB1716.bam",
99
115
  "test/data/LIB1716.bam.bai",
100
116
  "test/data/LIB1719.bam",
@@ -109,9 +125,31 @@ Gem::Specification.new do |s|
109
125
  "test/data/PST130_reverse_primer.csv",
110
126
  "test/data/S22380157.fa",
111
127
  "test/data/S22380157.fa.fai",
128
+ "test/data/S22380157.vcf",
129
+ "test/data/S58861868/LIB1716.bam",
130
+ "test/data/S58861868/LIB1716.sam",
131
+ "test/data/S58861868/LIB1719.bam",
132
+ "test/data/S58861868/LIB1719.sam",
133
+ "test/data/S58861868/LIB1721.bam",
134
+ "test/data/S58861868/LIB1721.sam",
135
+ "test/data/S58861868/LIB1722.bam",
136
+ "test/data/S58861868/LIB1722.sam",
137
+ "test/data/S58861868/S58861868.fa",
138
+ "test/data/S58861868/S58861868.fa.fai",
139
+ "test/data/S58861868/S58861868.vcf",
140
+ "test/data/S58861868/header.txt",
141
+ "test/data/S58861868/merged.bam",
142
+ "test/data/S58861868/merged_reheader.bam",
143
+ "test/data/S58861868/merged_reheader.bam.bai",
112
144
  "test/data/Test3Aspecific.csv",
113
145
  "test/data/Test3Aspecific_contigs.fa",
114
146
  "test/data/bfr_out_test.csv",
147
+ "test/data/headerMergeed.txt",
148
+ "test/data/headerS2238015",
149
+ "test/data/mergedLibs.bam",
150
+ "test/data/mergedLibsReheader.bam",
151
+ "test/data/mergedLibsSorted.bam",
152
+ "test/data/mergedLibsSorted.bam.bai",
115
153
  "test/data/patological_cases5D.csv",
116
154
  "test/data/primer_3_input_header_test",
117
155
  "test/data/short_primer_design_test.csv",
@@ -122,38 +160,42 @@ Gem::Specification.new do |s|
122
160
  "test/data/test_primer3_error.csv",
123
161
  "test/data/test_primer3_error_contigs.fa",
124
162
  "test/test_bfr.rb",
163
+ "test/test_blast.rb",
125
164
  "test/test_exon_container.rb",
126
165
  "test/test_exonearate.rb",
127
166
  "test/test_snp_parsing.rb",
128
167
  "test/test_wrong_selection.sh"
129
168
  ]
130
- s.homepage = "http://github.com/tgac/bioruby-polyploid-tools"
131
- s.licenses = ["MIT"]
132
- s.rubygems_version = "2.4.7"
133
- s.summary = "Tool to work with polyploids, NGS and molecular biology"
169
+ s.homepage = "http://github.com/tgac/bioruby-polyploid-tools".freeze
170
+ s.licenses = ["MIT".freeze]
171
+ s.rubygems_version = "2.7.4".freeze
172
+ s.summary = "Tool to work with polyploids, NGS and molecular biology".freeze
134
173
 
135
174
  if s.respond_to? :specification_version then
136
175
  s.specification_version = 4
137
176
 
138
177
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
139
- s.add_runtime_dependency(%q<bio>, [">= 1.4.3"])
140
- s.add_runtime_dependency(%q<bio-samtools>, [">= 2.0.4"])
141
- s.add_runtime_dependency(%q<rake>, [">= 0"])
142
- s.add_runtime_dependency(%q<jeweler>, [">= 0"])
143
- s.add_runtime_dependency(%q<systemu>, [">= 2.5.2"])
178
+ s.add_runtime_dependency(%q<bio>.freeze, [">= 1.5.1"])
179
+ s.add_runtime_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
180
+ s.add_runtime_dependency(%q<systemu>.freeze, [">= 2.5.2"])
181
+ s.add_development_dependency(%q<shoulda>.freeze, [">= 2.10"])
182
+ s.add_development_dependency(%q<test-unit>.freeze, [">= 0"])
183
+ s.add_development_dependency(%q<juwelier>.freeze, [">= 0"])
144
184
  else
145
- s.add_dependency(%q<bio>, [">= 1.4.3"])
146
- s.add_dependency(%q<bio-samtools>, [">= 2.0.4"])
147
- s.add_dependency(%q<rake>, [">= 0"])
148
- s.add_dependency(%q<jeweler>, [">= 0"])
149
- s.add_dependency(%q<systemu>, [">= 2.5.2"])
185
+ s.add_dependency(%q<bio>.freeze, [">= 1.5.1"])
186
+ s.add_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
187
+ s.add_dependency(%q<systemu>.freeze, [">= 2.5.2"])
188
+ s.add_dependency(%q<shoulda>.freeze, [">= 2.10"])
189
+ s.add_dependency(%q<test-unit>.freeze, [">= 0"])
190
+ s.add_dependency(%q<juwelier>.freeze, [">= 0"])
150
191
  end
151
192
  else
152
- s.add_dependency(%q<bio>, [">= 1.4.3"])
153
- s.add_dependency(%q<bio-samtools>, [">= 2.0.4"])
154
- s.add_dependency(%q<rake>, [">= 0"])
155
- s.add_dependency(%q<jeweler>, [">= 0"])
156
- s.add_dependency(%q<systemu>, [">= 2.5.2"])
193
+ s.add_dependency(%q<bio>.freeze, [">= 1.5.1"])
194
+ s.add_dependency(%q<bio-samtools>.freeze, [">= 2.6.2"])
195
+ s.add_dependency(%q<systemu>.freeze, [">= 2.5.2"])
196
+ s.add_dependency(%q<shoulda>.freeze, [">= 2.10"])
197
+ s.add_dependency(%q<test-unit>.freeze, [">= 0"])
198
+ s.add_dependency(%q<juwelier>.freeze, [">= 0"])
157
199
  end
158
200
  end
159
201
 
@@ -114,6 +114,7 @@ module Bio::BFRTools
114
114
  self.entry = reg.entry
115
115
  self.start = reg.start
116
116
  self.end = reg.end
117
+ @BFRs = nil
117
118
  opts[:region] = reg
118
119
  @container = opts[:container]
119
120
 
@@ -22,6 +22,7 @@ module Bio::PolyploidTools
22
22
  # puts entry
23
23
  @fasta_db.fetch_sequence(entry.get_full_region)
24
24
  end
25
+
25
26
  #Loads all the chromosome arms in a folder.
26
27
  #The current version requires that all the references end with .fa, and start with XXX_*.fa
27
28
  #Where XXX is the chromosome name
@@ -29,16 +30,11 @@ module Bio::PolyploidTools
29
30
  chromosomeArms = Hash.new
30
31
 
31
32
  Dir.foreach(path_to_contigs) do |filename |
32
-
33
33
  if File.fnmatch("*.fa", filename)
34
34
 
35
35
  parsed = /^(?<arm>\d\w+)/.match(filename)
36
-
37
36
  target="#{path_to_contigs}/#{filename}"
38
-
39
-
40
-
41
- # fasta_file = Bio::DB::Fasta::FastaFile.new(target)
37
+ #fasta_file = Bio::DB::Fasta::FastaFile.new(target)
42
38
  #fasta_file.load_fai_entries
43
39
  arm = ChromosomeArm.new(parsed[:arm], target)
44
40
  chromosomeArms[arm.name] = arm
@@ -19,15 +19,31 @@ module Bio::PolyploidTools
19
19
 
20
20
  def gene_models(path)
21
21
  @gene_models_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path})
22
+ @gene_models_db.index
22
23
  @gene_models_path = path
23
24
  end
24
25
 
25
26
  #Returns the sequence for a region in the gene models (exon)
26
27
  def gene_model_sequence(region)
27
- #puts region
28
- seq=@gene_models_db.fetch_sequence(region)
29
-
28
+ #puts "Region: "
29
+ #puts region.inspect
30
+ target_reg = @gene_models_db.index.region_for_entry(region.entry)
31
+ #puts target_reg.inspect
32
+ region.end = target_reg.length if region.end > target_reg.length
33
+ #entries[region.entry]
30
34
 
35
+ seq=@gene_models_db.fetch_sequence(region)
36
+ #puts "sequence: "
37
+ #This is a patch for an issue that needs to be fixed in bio-samtools:
38
+ #puts seq
39
+ index = seq.index('>')
40
+ if(index )
41
+ index -= 1
42
+ #puts "Index: #{index}"
43
+ seq = seq.slice(0..index)
44
+ end
45
+ #puts seq
46
+ seq
31
47
  end
32
48
 
33
49
  #Sets the reference file for the gene models
@@ -40,10 +56,10 @@ module Bio::PolyploidTools
40
56
  def chromosome_sequence(region)
41
57
  left_pad = 0
42
58
  #TODO: Pad if it goes to the right
43
- if(region.start < 0)
59
+ if(region.start < 1)
44
60
  left_pad = region.start * -1
45
61
  left_pad += 1
46
- region.start = 0
62
+ region.start = 1
47
63
  end
48
64
  str = "-" * left_pad << @chromosomes_db.fetch_sequence(region)
49
65
  #str << "n" * (region.size - str.size + 1) if region.size > str.size
@@ -116,12 +132,17 @@ module Bio::PolyploidTools
116
132
  @snp_map.each do | gene, snp_array|
117
133
  snp_array.each do |snp|
118
134
  #file.puts snp.primer_fasta_string
119
-
135
+ #puts "In print_fast_np_exones"
136
+ #puts snp.inspect
137
+
120
138
  begin
121
139
  file.puts snp.aligned_sequences_fasta
122
140
  rescue Exception=>e
123
141
  @missing_exons << snp.to_s
124
- $stderr.puts e.to_s
142
+ $stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
143
+ $stderr.puts "Local position: #{snp.local_position}"
144
+ $stderr.puts "Local position: #{snp.parental_sequences.to_s}"
145
+ $stderr.puts e.backtrace
125
146
  end
126
147
  end
127
148
  end
@@ -143,8 +164,10 @@ module Bio::PolyploidTools
143
164
  end
144
165
  rescue Exception=>e
145
166
  @missing_exons << snp.to_s
167
+ # $stderr.puts ""
146
168
 
147
- $stderr.puts e.to_s
169
+ $stderr.puts "print_primer_3_exons: #{e.to_s} : snp.to_s"
170
+ $stderr.puts e.backtrace
148
171
  end
149
172
  end
150
173
  end
@@ -0,0 +1,286 @@
1
+
2
+ require_relative "SNP"
3
+ require 'bio-samtools'
4
+ module Bio::PolyploidTools
5
+ class SNPSequenceException < RuntimeError
6
+ end
7
+
8
+ class NoSNPSequence < SNP
9
+
10
+ attr_accessor :sequence_original
11
+ #Format:
12
+ #snp name,chromosome from contig,microarray sequence
13
+ #BS00068396_51,2AS,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
14
+ def self.parse(reg_str)
15
+ reg_str.chomp!
16
+ snp = NoSNPSequence.new
17
+
18
+ arr = reg_str.split(",")
19
+
20
+ if arr.size == 3
21
+ snp.gene, snp.chromosome, snp.sequence_original = reg_str.split(",")
22
+ elsif arr.size == 2
23
+ snp.gene, snp.sequence_original = arr
24
+ else
25
+ throw SNPSequenceException.new "Need two or three fields to parse, and got #{arr.size} in #{reg_str}"
26
+ end
27
+ #snp.position = snp.position.to_i
28
+ #snp.original.upcase!
29
+ #snp.snp.upcase!
30
+ snp.chromosome. strip!
31
+ snp.snp_in = snp.chromosome
32
+ snp.parse_sequence_snp
33
+ snp.exon_list = Hash.new()
34
+ snp
35
+ end
36
+
37
+ def parse_snp
38
+
39
+ end
40
+
41
+ def parse_sequence_snp
42
+ @position = (sequence_original.length / 2).to_i
43
+ @original = sequence_original[@position]
44
+ @snp = @original
45
+ end
46
+
47
+ def to_s
48
+ "#{gene}:#{chromosome}"
49
+ end
50
+
51
+ def sequences_to_align
52
+ @sequences_to_align = surrounding_exon_sequences unless @sequences_to_align
53
+ @sequences_to_align
54
+ end
55
+
56
+ def mask_aligned_chromosomal_snp(chromosome)
57
+ return nil if aligned_sequences.values.size == 0
58
+ names = exon_sequences.keys
59
+
60
+ masked_snps = aligned_sequences[chromosome].downcase if aligned_sequences[chromosome]
61
+
62
+ masked_snps = "-" * aligned_sequences.values[0].size unless aligned_sequences[chromosome]
63
+ #TODO: Make this chromosome specific, even when we have more than one alignment going to the region we want.
64
+ i = 0
65
+ while i < masked_snps.size
66
+ different = 0
67
+ cov = 0
68
+ from_group = 0
69
+ names.each do | chr |
70
+ if aligned_sequences[chr] and aligned_sequences[chr][i] != "-"
71
+ cov += 1
72
+
73
+ from_group += 1 if chr[0] == chromosome_group
74
+ #puts "Comparing #{chromosome_group} and #{chr[0]} as chromosomes"
75
+ if chr != chromosome
76
+ $stderr.puts "WARN: No base for #{masked_snps} : ##{i}" unless masked_snps[i].upcase
77
+ $stderr.puts "WARN: No base for #{aligned_sequences[chr]} : ##{i}" unless masked_snps[i].upcase
78
+ different += 1 if masked_snps[i].upcase != aligned_sequences[chr][i].upcase
79
+ end
80
+ end
81
+ end
82
+ masked_snps[i] = "-" if different == 0
83
+ masked_snps[i] = "-" if cov == 1
84
+ masked_snps[i] = "*" if cov == 0
85
+ expected_snps = names.size - 1
86
+ #puts "Diferences: #{different} to expected: #{ expected_snps } [#{i}] Genome count (#{from_group} == #{genomes_count})"
87
+
88
+ masked_snps[i] = masked_snps[i].upcase if different == expected_snps and from_group == genomes_count
89
+
90
+ i += 1
91
+ end
92
+ masked_snps
93
+ end
94
+
95
+ def count_deletions_around(position,target_chromosome)
96
+ first_aligned = aligned_sequences[target_chromosome]
97
+
98
+ pos_start = position - flanking_size
99
+ pos_end = position + flanking_size
100
+ pos_start = 0 if pos_start < 0
101
+ pos_end = first_aligned.size - 1 if pos_end >= first_aligned.size
102
+ count = 0
103
+ for i in pos_start..pos_end
104
+ has_del = false
105
+
106
+ aligned_sequences.each_pair do |name, val|
107
+ has_del = true if val[i] == '-'
108
+ print "#{val[i]}\t"
109
+ end
110
+ count += 1 if has_del
111
+ print "#{count}\n"
112
+ end
113
+ return count
114
+ end
115
+
116
+ def primer_region(target_chromosome, parental_chr )
117
+ chromosome_seq = aligned_sequences[target_chromosome]
118
+ #chromosome_seq = "-" * parental.size unless chromosome_seq
119
+ if aligned_sequences.size == 0
120
+ #puts aligned_sequences.inspect
121
+ #puts surrounding_exon_sequences.inspect
122
+ #puts self.inspect
123
+ chromosome_seq = surrounding_exon_sequences[target_chromosome]
124
+
125
+ end
126
+ chromosome_seq = chromosome_seq.downcase
127
+
128
+ mask = mask_aligned_chromosomal_snp(target_chromosome)
129
+
130
+ pr = PrimerRegion.new
131
+ pr.homoeologous = false
132
+ position_in_region = 0
133
+ parental = chromosome_seq.clone
134
+ (0..chromosome_seq.size-1).each do |i|
135
+
136
+ if chromosome_seq[i] != '-'
137
+ case
138
+ when mask[i] == '-'
139
+ #When the mask doesn't detect a SNP, we take the parental
140
+ parental[i] = chromosome_seq[i] unless Bio::NucleicAcid::is_unambiguous(parental[i])
141
+ when /[[:upper:]]/.match(mask[i])
142
+ #This is a good candidate for marking a SNP
143
+ #We validate that the consensus from the sam file accepts the variation from the chromosomal sequence
144
+ if parental[i] == '-'
145
+ parental[i] = mask[i]
146
+ pr.crhomosome_specific_intron << position_in_region
147
+ elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
148
+ parental[i] = mask[i]
149
+ pr.chromosome_specific << position_in_region if count_deletions_around(1,target_chromosome) < 3
150
+ pr.chromosome_specific_in_mask << i
151
+ end
152
+
153
+ when /[[:lower:]]/.match(mask[i])
154
+ #This is not as good a candidate, but it still gives specificity
155
+ if parental[i] == '-'
156
+ parental[i] = mask[i]
157
+ pr.almost_crhomosome_specific_intron << position_in_region
158
+ elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
159
+ parental[i] = mask[i].upcase
160
+ pr.almost_chromosome_specific << position_in_region
161
+ pr.almost_chromosome_specific_in_mask << i
162
+ end
163
+ end #Case closes
164
+ pr.position_in_mask_from_template[position_in_region] = i
165
+ position_in_region += 1
166
+ end #Closes region with bases
167
+ end
168
+
169
+ pr.sequence=parental.gsub('-','')
170
+ pr
171
+ end
172
+
173
+ def return_primer_3_string_test(opts={})
174
+
175
+ left = opts[:right_pos]
176
+ right = opts[:right_pos]
177
+ sequence = opts[:sequence]
178
+ orientation = "forward"
179
+ if opts[:right_pos]
180
+ orientation = "forward"
181
+ if left > right
182
+ left = sequence.size - left - 1
183
+ right = sequence.size - right - 1
184
+ sequence = reverse_complement_string(sequence)
185
+ orientation = "reverse"
186
+ end
187
+ if @variation_free_region > 0
188
+ check_str = sequence[right+1, @variation_free_region]
189
+ return nil if check_str != check_str.downcase
190
+ end
191
+
192
+ end
193
+
194
+
195
+ str = "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
196
+ str << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
197
+ str << "SEQUENCE_FORCE_RIGHT_END=#{right}\n" if opts[:right_pos]
198
+ str << "SEQUENCE_TEMPLATE=#{sequence}\n"
199
+ str << "=\n"
200
+
201
+
202
+ #In case that we don't have a right primer, we do both orientations
203
+ unless opts[:right_pos]
204
+ sequence = opts[:sequence]
205
+ left = sequence.size - left - 1
206
+ orientation = "reverse"
207
+ sequence = reverse_complement_string(sequence)
208
+ str << "SEQUENCE_ID=#{opts[:name]} #{orientation}\n"
209
+ str << "SEQUENCE_FORCE_LEFT_END=#{left}\n"
210
+ str << "SEQUENCE_TEMPLATE=#{sequence}\n"
211
+ str << "=\n"
212
+ end
213
+
214
+ str
215
+ end
216
+
217
+ def get_base_in_different_chromosome(position, target_chromosome)
218
+
219
+ aligned_sequences.each_pair do |name, val|
220
+ next if target_chromosome == name
221
+ return val[position]
222
+ end
223
+ end
224
+
225
+ def primer_3_all_strings(target_chromosome, parental)
226
+ pr = primer_region(target_chromosome, parental )
227
+ primer_3_propertes = Array.new
228
+
229
+ seq_original = String.new(pr.sequence)
230
+ #puts seq_original.size.to_s << "-" << primer_3_min_seq_length.to_s
231
+ return primer_3_propertes if seq_original.size < primer_3_min_seq_length
232
+
233
+ if pr.homoeologous
234
+ snp_type = "homoeologous"
235
+ else
236
+ snp_type = "non-homoeologous"
237
+ end
238
+
239
+ pr.chromosome_specific.each do |pos|
240
+
241
+ seq_snp = String.new(pr.sequence)
242
+ orgiginal_base = seq_snp[pos]
243
+ other_chromosome_base = get_base_in_different_chromosome(pos, target_chromosome)
244
+
245
+ args = {
246
+ :name =>"#{gene} A chromosome_specific exon #{snp_type} #{chromosome}",
247
+ :left_pos => pos,
248
+ :sequence=>seq_original
249
+ }
250
+
251
+
252
+ primer_3_propertes << return_primer_3_string(args)
253
+ args[:name] = "#{gene} B chromosome_specific exon #{snp_type} #{chromosome}"
254
+ args[:sequence] = seq_snp
255
+ #TODO: Find base from another chromosome
256
+ seq_snp[pos] = other_chromosome_base.upcase
257
+
258
+ primer_3_propertes << return_primer_3_string(args)
259
+ end
260
+
261
+
262
+ primer_3_propertes
263
+ end
264
+
265
+ def aligned_sequences
266
+
267
+ return @aligned_sequences if @aligned_sequences
268
+ if sequences_to_align.size == 1
269
+ @aligned_sequences = sequences_to_align
270
+ return @aligned_sequences
271
+ end
272
+ options = ['--maxiterate', '1000', '--localpair', '--quiet']
273
+ mafft = Bio::MAFFT.new( "mafft" , options)
274
+ # puts "Before MAFT:#{sequences_to_align.inspect}"
275
+ report = mafft.query_align(sequences_to_align)
276
+ @aligned_sequences = report.alignment
277
+ # puts "MAFFT: #{report.alignment.inspect}"
278
+ @aligned_sequences
279
+ end
280
+
281
+
282
+
283
+
284
+
285
+ end
286
+ end