bio-polyploid-tools 0.9.9 → 0.9.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: 48bdfc00532bec29d3ca2cd3214e22f8907ef45abb3bdf21802be60654fd188a
4
- data.tar.gz: 8ade77f7455c3a38d16f011642ecc69325c4d9bcd650c7f440d524e870c702ba
2
+ SHA1:
3
+ metadata.gz: 73e03b0d4cf0869a9e1a6b7d146bbb0e3ed4c91e
4
+ data.tar.gz: 64e0efc76fc1cd424b3febf1f4f298e202619a94
5
5
  SHA512:
6
- metadata.gz: e1fa868fc0d1f2d249a40c1a4f351d7bc2918d17303868e84342e0cc6a31656160af459efed2697b9b09bf0e47cf987d4ed3404d9ebf101e91f0b3d420f9c717
7
- data.tar.gz: 66839a8ae1794b510a98c8c69b3819f8eebee58518907a4169955f12a7d5502c2438e8a0449dea9b58671ee177ba519384d35327a1d2e7adec957a0bd8f005c9
6
+ metadata.gz: cd81ec303480157a64ac9af123bc331381cc825cfac4e557894c35ba3ab815f15d95d03b8d5e6f7921a8ff950cebefdb9a6182b81a274568d8dca9f1d0aa13db
7
+ data.tar.gz: 4104dc656d8b29604fc2293731f75a906c9d23941138ebb70e7b7a2a84a456a459e30be23530ca4c16d6ebdad03ca9dcbca4bd1e1ea7e068a734dee1c350e1b1
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.9.9
1
+ 0.9.10
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools'
6
+ require 'optparse'
7
+ require 'set'
8
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
9
+ $: << File.expand_path('.')
10
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
11
+ require path
12
+
13
+ options = {}
14
+ options[:min_identity] = 90
15
+ options[:filter_best] = false
16
+ options[:debug] = false
17
+
18
+ OptionParser.new do |opts|
19
+ opts.banner = "Usage: marker_to_vcf.rb [options]"
20
+
21
+ opts.on("-c", "--contigs FILE", "File with contigs to use as database") do |o|
22
+ options[:path_to_contigs] = o
23
+ end
24
+
25
+ opts.on("-m", "--marker_list FILE", "File with the list of markers to search from") do |o|
26
+ options[:marker_list] = o
27
+ end
28
+
29
+ opts.on("-b", "--filter_best", "If set, only keep the best alignment for each chromosome") do
30
+ options[:filter_best] = false
31
+ end
32
+
33
+ opts.on("-D", "--debug", "Validate that the flanking sequences are correct") do
34
+ options[:debug] = true
35
+ end
36
+
37
+ opts.on("-i", "--min_identity INT", "Minimum identity to consider a hit (default 90)") do |o|
38
+ options[:min_identity] = o.to_i
39
+ end
40
+
41
+ opts.on("-o", "--output FOLDER", "Output folder") do |o|
42
+ options[:output_folder] = o
43
+ end
44
+
45
+ opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
46
+ options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
47
+ end
48
+
49
+ opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: blast") do |o|
50
+ raise "Invalid aligner" unless o == "exonerate" or o == "blast"
51
+ options[:aligner] = o.to_sym
52
+ end
53
+
54
+ opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
55
+ options[:database] = o
56
+ end
57
+
58
+ end.parse!
59
+ options[:database] = options[:path_to_contigs] unless options[:database] # fall back to the contigs file only when -d was not given
60
+ p options
61
+ p ARGV
62
+
63
+
64
+ path_to_contigs=options[:path_to_contigs]
65
+
66
+ original_name="A"
67
+ snp_in="B"
68
+
69
+ fasta_reference = nil
70
+ test_file=options[:marker_list]
71
+
72
+ output_folder="#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}"
73
+ output_folder= options[:output_folder] if options[:output_folder]
74
+ Dir.mkdir(output_folder)
75
+ #T
76
+ temp_fasta_query="#{output_folder}/to_align.fa"
77
+ temp_contigs="#{output_folder}/contigs_tmp.fa"
78
+ exonerate_file="#{output_folder}/exonerate_tmp.tab"
79
+ vcf_file="#{output_folder}/snp_positions.vcf"
80
+
81
+ min_identity= options[:min_identity]
82
+
83
+ @status_file="#{output_folder}/status.txt"
84
+
85
+
86
+ def write_status(status)
87
+ f=File.open(@status_file, "a")
88
+ f.puts "#{Time.now.to_s},#{status}"
89
+ f.close
90
+ end
91
+
92
+
93
+ snps = Hash.new
94
+
95
+ fasta_reference_db=nil
96
+
97
+ #if options[:debug]
98
+ write_status "Loading Reference"
99
+ fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path_to_contigs})
100
+ fasta_reference_db.load_fai_entries
101
+ write_status "Fasta reference: #{path_to_contigs}"
102
+ #end
103
+
104
+ #1. Read all the SNP files
105
+ #chromosome = nil
106
+ write_status "Reading SNPs"
107
+
108
+ File.open(test_file) do | f |
109
+ f.each_line do | line |
110
+ snp = Bio::PolyploidTools::SNPSequence.parse(line)
111
+ snp.genomes_count = options[:genomes_count]
112
+ snp.snp_in = snp_in
113
+ snp.original_name = original_name
114
+ if snp.position
115
+ snps[snp.gene] = snp
116
+ else
117
+ $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
118
+ end
119
+ end
120
+ end
121
+
122
+ #2. Generate all the fasta files
123
+ write_status "Writing sequences to align"
124
+ written_seqs = Set.new
125
+ file = File.open(temp_fasta_query, "w")
126
+ snps.each_pair do |k,snp|
127
+ unless written_seqs.include?(snp.gene)
128
+ written_seqs << snp.gene
129
+ file.puts snp.to_fasta
130
+ end
131
+ end
132
+ file.close
133
+
134
+
135
+ #3. Run exonerate on each of the possible chromosomes for the SNP
136
+ #puts chromosome
137
+ #chr_group = chromosome[0]
138
+ write_status "Searching markers in genome"
139
+ exo_f = File.open(exonerate_file, "w")
140
+ contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
141
+ filename=path_to_contigs
142
+ #puts filename
143
+ target=filename
144
+
145
+ fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
146
+ fasta_file.load_fai_entries
147
+ found_contigs = Set.new
148
+
149
+ def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
150
+ if aln.identity > min_identity
151
+ exo_f.puts aln.line
152
+ end
153
+ end
154
+
155
+ Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database]}) do |aln|
156
+ do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
157
+ end
158
+
159
+ exo_f.close()
160
+
161
+ def print_positions(min_identity:90, filter_best:false, exonerate_filename:"test.exo", snps:{}, reference:nil, out:$stdout)
162
+ marker_count=Hash.new { |h, k| h[k] = 1 }
163
+ File.open(exonerate_filename) do |f|
164
+ f.each_line do | line |
165
+ record = Bio::DB::Exonerate::Alignment.parse_custom(line)
166
+ next unless record and record.identity >= min_identity
167
+ snp = snps[record.query_id]
168
+ next unless snp != nil and snp.position.between?( (record.query_start + 1) , record.query_end)
169
+ begin
170
+
171
+ position = record.query_position_on_target(snp.position)
172
+ q_strand = record.query_strand
173
+ t_strand = record.target_strand
174
+ template = snp.template_sequence
175
+
176
+ vulgar = record.exon_on_gene_position(snp.position)
177
+ tr = vulgar.target_region
178
+ qr = vulgar.query_region
179
+ template_pre = template[qr.start - 1 .. snp.position - 1 ]
180
+ tr.orientation == :forward ? tr.end = position : tr.start = position
181
+ region = tr
182
+ target_seq = reference.fetch_sequence(region)
183
+ target_seq[-1] = target_seq[-1].upcase
184
+ ref_base = target_seq[-1]
185
+ ma = ref_base
186
+ alt_base = [snp.snp, snp.original].join(",")
187
+
188
+ if snp.original == ref_base
189
+ alt_base = snp.snp
190
+ elsif snp.snp == ref_base
191
+ alt_base = snp.original
192
+ end
193
+
194
+ if record.target_strand == :reverse
195
+ alt_base = Bio::Sequence::NA.new(alt_base)
196
+ ref_base = Bio::Sequence::NA.new(ref_base)
197
+ alt_base.complement!.upcase!
198
+ ref_base.complement!.upcase!
199
+ end
200
+
201
+ info = ["OR=#{record.target_strand}"]
202
+ info << "SC=#{record.score}"
203
+ info << "PI=#{record.pi}"
204
+ info << "MA=#{ma}"
205
+ info << "TS=#{target_seq}"
206
+ vcf_line="#{record.target_id}\t#{position}\t#{record.query_id}.path#{marker_count[record.query_id]}\t#{ref_base}\t#{alt_base}\t#{record.pi}\t.\t#{info.join(";")}"
207
+ #snp2 = Bio::PolyploidTools::SNP.parseVCF( vcf_line )
208
+ #snp2.setTemplateFromFastaFile(reference)
209
+ #seq2=snp2.to_polymarker_sequence(50)
210
+ #info << "PS=#{seq2}"
211
+ vcf_line="#{record.target_id}\t#{position}\t#{record.query_id}.path#{marker_count[record.query_id]}\t#{ref_base}\t#{alt_base}\t#{record.pi}\t.\t#{info.join(";")}"
212
+ out.puts(vcf_line)
213
+
214
+ marker_count[record.query_id] += 1
215
+ rescue Bio::DB::Exonerate::ExonerateException
216
+ $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
217
+ end
218
+ end
219
+ end
220
+ end
221
+
222
+
223
+ write_status "Printing VCF file"
224
+ #puts snps.inspect
225
+ out = File.open(vcf_file, "w")
226
+ out.puts "##fileformat=VCFv4.2"
227
+ out.puts "##fileDate=#{Time.now.strftime("%Y%m%d")}"
228
+ out.puts "##source=#{$0}"
229
+ out.puts "##reference=file://#{options[:path_to_contigs]}"
230
+ out.puts "##INFO=<ID=OR,Number=1,Type=String,Description=\"Orientation of the alignment of the marker\">"
231
+ out.puts "##INFO=<ID=SC,Number=1,Type=Float,Description=\"Alignment score of the marker\">"
232
+ out.puts "##INFO=<ID=PI,Number=1,Type=Float,Description=\"Percentage of identity of the alignment to the marker\">"
233
+ out.puts "##INFO=<ID=PS,Number=1,Type=String,Description=\"SNP sequence for PolyMarker\">"
234
+ out.puts "##INFO=<ID=MA,Number=1,Type=String,Description=\"Allele based on the original marker sequence\">"
235
+ out.puts "##INFO=<ID=TS,Number=1,Type=String,Description=\"Target sequence before the SNP from the reference\">"
236
+ out.puts "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
237
+ print_positions(exonerate_filename:exonerate_file, min_identity:95, snps:snps, reference: fasta_reference_db, out:out)
238
+ out.close
239
+ write_status "DONE"
240
+
241
+
data/bin/polymarker.rb CHANGED
@@ -124,7 +124,7 @@ OptionParser.new do |opts|
124
124
  options[:scoring] = :het_dels
125
125
  end
126
126
 
127
- opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: exonerate") do |o|
127
+ opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: #{options[:aligner]}") do |o|
128
128
  raise "Invalid aligner" unless o == "exonerate" or o == "blast"
129
129
  options[:aligner] = o.to_sym
130
130
  end
@@ -137,7 +137,7 @@ end.parse!
137
137
 
138
138
  validate_files(options)
139
139
 
140
- options[:database] = options[:path_to_contigs] unless options[:database]
140
+ options[:database] = options[:path_to_contigs] unless options[:database]
141
141
 
142
142
 
143
143
  if options[:primer_3_preferences][:primer_product_size_range]
@@ -169,7 +169,7 @@ test_file=options[:mutant_list] if options[:mutant_list]
169
169
  fasta_reference = options[:reference]
170
170
  output_folder="#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}"
171
171
  output_folder= options[:output_folder] if options[:output_folder]
172
- Dir.mkdir(output_folder)
172
+ Dir.mkdir(output_folder) unless Dir.exist?(output_folder)
173
173
  #TODO Make this tmp files
174
174
  temp_fasta_query="#{output_folder}/to_align.fa"
175
175
  temp_contigs="#{output_folder}/contigs_tmp.fa"
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
+ require 'csv'
6
+ require 'bio'
7
+ require 'bio-samtools'
8
+
9
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
10
+ $: << File.expand_path('.')
11
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
12
+ require path
13
+
14
+ options = {}
15
+ options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene");
16
+
17
+
18
+ OptionParser.new do |opts|
19
+ opts.banner = "Usage: vcfToPolyMarker.rb [options]"
20
+
21
+ opts.on("-c", "--reference FILE", "File with genome reference to use as database") do |o|
22
+ options[:path_to_contigs] = o
23
+ end
24
+
25
+ opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
26
+ tmp_str = o
27
+ arr = o.split(",")
28
+ if arr.size == 2
29
+ options[:arm_selection] = lambda do |contig_name|
30
+ separator, field = arr
31
+ field = field.to_i
32
+ ret = contig_name.split(separator)[field]
33
+ return ret
34
+ end
35
+ else
36
+ options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
37
+ end
38
+ end
39
+
40
+
41
+ end.parse!
42
+
43
+ def parseVCFheader(head_line="")
44
+ ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
45
+
46
+ m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(head_line)
47
+ {:id=>m[1],:number=>m[2],:type=>m[3],:desc=>m[4]}
48
+
49
+ end
50
+
51
+
52
+
53
+
54
+ header_info = Hash.new
55
+ ref=options[:path_to_contigs]
56
+
57
+ fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>ref})
58
+ fasta_reference_db.load_fai_entries
59
+
60
+ $stdin.each do |line|
61
+
62
+ h = nil
63
+ h = parseVCFheader(line) if line.start_with? "##INFO"
64
+
65
+ header_info[h[:id]] = h[:desc] if h
66
+ #puts header_info.inspect
67
+ next if line.start_with? "##"
68
+ if line.start_with? "#CHROM"
69
+ arr = line.split
70
+ arr = arr.drop(9)
71
+ arr2 = arr.map { |s| [s.clone().prepend('Cov'), s.clone().prepend('Hap') ]}
72
+ #header += arr2.join("\t")
73
+ #puts header
74
+ next
75
+ end
76
+ line.chomp!
77
+ #puts line
78
+ snp = Bio::PolyploidTools::SNP.parseVCF( line , chr_arm_parser: options[:arm_selection]) # parseVCF takes the arm parser as a keyword argument
79
+ #puts snp.inspect
80
+ snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size: 100)
81
+ puts [snp.gene, snp.chromosome ,snp.to_polymarker_sequence(100)].join(",")
82
+ end
@@ -2,19 +2,19 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: bio-polyploid-tools 0.9.9 ruby lib
5
+ # stub: bio-polyploid-tools 0.9.10 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "bio-polyploid-tools".freeze
9
- s.version = "0.9.9"
9
+ s.version = "0.9.10"
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib".freeze]
13
13
  s.authors = ["Ricardo H. Ramirez-Gonzalez".freeze]
14
- s.date = "2018-11-21"
14
+ s.date = "2019-03-11"
15
15
  s.description = "Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat".freeze
16
16
  s.email = "ricardo.ramirez-gonzalez@jic.ac.uk".freeze
17
- s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "find_homoeologue_variations.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "markers_in_region.rb".freeze, "mask_triads.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "tag_stats.rb".freeze, "vcfLineToTable.rb".freeze]
17
+ s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "find_homoeologue_variations.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "marker_to_vcf.rb".freeze, "markers_in_region.rb".freeze, "mask_triads.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "tag_stats.rb".freeze, "vcfLineToTable.rb".freeze, "vcfToPolyMarker.rb".freeze]
18
18
  s.extra_rdoc_files = [
19
19
  "README",
20
20
  "README.md"
@@ -41,6 +41,7 @@ Gem::Specification.new do |s|
41
41
  "bin/mafft_triads.rb",
42
42
  "bin/mafft_triads_promoters.rb",
43
43
  "bin/map_markers_to_contigs.rb",
44
+ "bin/marker_to_vcf.rb",
44
45
  "bin/markers_in_region.rb",
45
46
  "bin/mask_triads.rb",
46
47
  "bin/polymarker.rb",
@@ -49,6 +50,7 @@ Gem::Specification.new do |s|
49
50
  "bin/snps_between_bams.rb",
50
51
  "bin/tag_stats.rb",
51
52
  "bin/vcfLineToTable.rb",
53
+ "bin/vcfToPolyMarker.rb",
52
54
  "bio-polyploid-tools.gemspec",
53
55
  "conf/defaults.rb",
54
56
  "conf/primer3_config/dangle.dh",
@@ -183,7 +185,7 @@ Gem::Specification.new do |s|
183
185
  ]
184
186
  s.homepage = "http://github.com/tgac/bioruby-polyploid-tools".freeze
185
187
  s.licenses = ["MIT".freeze]
186
- s.rubygems_version = "2.7.7".freeze
188
+ s.rubygems_version = "2.6.14".freeze
187
189
  s.summary = "Tool to work with polyploids, NGS and molecular biology".freeze
188
190
 
189
191
  if s.respond_to? :specification_version then
@@ -1,3 +1,5 @@
1
+ require 'bio'
2
+
1
3
  class Array
2
4
  def sum
3
5
  inject(0.0) { |result, el| result + el }
@@ -9,7 +11,6 @@ class Array
9
11
  end
10
12
 
11
13
  module Bio::PolyploidTools::Mask
12
-
13
14
  def self.find_end(seqs)
14
15
  size = seqs.values[0].size
15
16
  names = seqs.keys
@@ -112,3 +113,4 @@ module Bio::PolyploidTools::Mask
112
113
  }
113
114
  end
114
115
  end
116
+
@@ -1,11 +1,12 @@
1
1
  require 'bio'
2
2
  module Bio::PolyploidTools
3
3
  class SNPException < RuntimeError
4
- end
4
+ end
5
+
5
6
  class SNP
6
-
7
7
  #GENE,ORIGINAL,POS,SNP
8
8
  attr_accessor :gene, :original, :position, :snp, :snp_in, :original_name
9
+ attr_accessor :contig
9
10
  attr_accessor :exon_list
10
11
  attr_accessor :container
11
12
  attr_accessor :flanking_size, :ideal_min, :ideal_max
@@ -20,6 +21,7 @@ module Bio::PolyploidTools
20
21
  attr_accessor :repetitive
21
22
  attr_accessor :hit_count
22
23
  attr_accessor :snp_type
24
+ attr_accessor :orientation
23
25
 
24
26
  #Format:
25
27
  #Gene_name,Original,SNP_Pos,pos,chromosome
@@ -35,28 +37,57 @@ module Bio::PolyploidTools
35
37
  snp.snp.upcase!
36
38
  snp.snp.strip!
37
39
  snp.chromosome.strip!
38
-
40
+
39
41
  snp.use_reference = false
40
42
  snp
41
43
  end
42
44
 
43
- def setTemplateFromFastaFile(fastaFile ,flanking_size = 100)
45
+ #Format:
46
+ #IWGSC_CSS_1AL_scaff_1455974 127 test_snp C T 135.03 .
47
+ def self.parseVCF(vcf_line, chr_arm_parser: Bio::PolyploidTools::ChromosomeArm.getArmSelection("first_two") )
48
+ snp = SNP.new
49
+ arr = vcf_line.split("\t")
50
+ snp.gene = arr[2]
51
+ snp.original = arr[3]
52
+ snp.position = arr[1]
53
+ snp.snp = arr[4]
54
+ snp.chromosome = chr_arm_parser.call(arr[0])
55
+ snp.contig = arr[0]
56
+ snp.position.strip!
57
+ snp.position = snp.position.to_i
58
+ snp.original.upcase!
59
+ snp.original.strip!
60
+ snp.snp.upcase!
61
+ snp.snp.strip!
62
+ snp.chromosome.strip!
63
+ snp.orientation = :forward
64
+
65
+ info = arr[7]
66
+ if info
67
+ details = info.scan(/(\w+)=([\w|.]+)/).collect { |id, value| { :id => id, :value => value }}
68
+ details.each do |e|
69
+ snp.orientation = :reverse if e[:id] == "OR" and e[:value] == "reverse"
70
+ end
71
+ end
72
+ return snp
73
+ end
74
+
75
+ def setTemplateFromFastaFile(fastaFile ,flanking_size: 100)
44
76
  reg = Bio::DB::Fasta::Region.new
45
77
  reg.entry = gene
46
78
  reg.entry = @contig if @contig
47
- #puts reg.entry
48
- #puts @contig
49
- #puts gene
50
79
  reg.start = position - flanking_size
51
- reg.end = position + flanking_size +1
80
+ reg.end = position + flanking_size + 1
52
81
  reg.orientation = :forward
53
- entry = fastaFile.index.region_for_entry(gene)
82
+ entry = fastaFile.index.region_for_entry(reg.entry)
54
83
  reg.start = 1 if reg.start < 1
55
84
  reg.end = entry.length if reg.end > entry.length
56
85
  amb = Bio::NucleicAcid.to_IUAPC("#{original}#{snp}")
57
86
  @position = @position - reg.start + 1
58
87
  @position = 1 if @position < 1
88
+ #puts "about to fetch"
59
89
  self.template_sequence = fastaFile.fetch_sequence(reg)
90
+ #puts "done fetching"
60
91
  template_sequence[position - 1] = amb
61
92
  end
62
93
 
@@ -83,15 +114,24 @@ module Bio::PolyploidTools
83
114
 
84
115
  def to_polymarker_sequence(flanking_size, total:nil)
85
116
  out = template_sequence.clone
86
- #puts "changing: #{position} #{flanking_size} len: #{total}"
87
- out[position-1] = "[#{original}/#{snp}]"
117
+ snp_seq = "[#{original}/#{snp}]"
118
+ p = position-1
119
+ if orientation == :reverse
120
+ p = out.length - p - 1
121
+ s = Bio::Sequence::NA.new(out)
122
+ s1 = Bio::Sequence::NA.new(original)
123
+ s2 = Bio::Sequence::NA.new(snp)
124
+ out = s.reverse_complement
125
+ snp_seq = "[#{s1.reverse_complement}/#{s2.reverse_complement}]"
126
+
127
+ end
128
+
129
+ out[p] = snp_seq
88
130
  start = position - flanking_size - 1
89
- #puts "Start: #{start}"
90
131
  start = 0 if start < 0
91
132
  total = flanking_size * 2 unless total
92
133
  total += 5
93
- #puts "Total: #{total}"
94
- out[start , total ]
134
+ out[start , total ].upcase
95
135
  end
96
136
 
97
137
  def snp_id_in_seq
@@ -187,7 +227,7 @@ module Bio::PolyploidTools
187
227
  self.position - self.covered_region.start
188
228
  end
189
229
 
190
- def padded_position (pos)
230
+ def padded_position(pos)
191
231
  pos + left_padding
192
232
  end
193
233
 
@@ -18,9 +18,10 @@ module Bio::PolyploidTools
18
18
  arr = reg_str.split(",")
19
19
 
20
20
  if arr.size == 3
21
- snp.gene, snp.chromosome, snp.sequence_original = reg_str.split(",")
21
+ snp.gene, snp.chromosome, snp.sequence_original = arr
22
22
  elsif arr.size == 2
23
23
  snp.gene, snp.sequence_original = arr
24
+ snp.chromosome = ""
24
25
  else
25
26
  throw SNPSequenceException.new "Need two or three fields to parse, and got #{arr.size} in #{reg_str}"
26
27
  end
@@ -51,4 +52,4 @@ module Bio::PolyploidTools
51
52
 
52
53
 
53
54
  end
54
- end
55
+ end
data/lib/bio/db/blast.rb CHANGED
@@ -79,7 +79,7 @@ module Bio::DB::Blast
79
79
  def self.align(opts={})
80
80
  target=opts[:target]
81
81
  query=opts[:query]
82
- max_target_seqs = 15
82
+ max_target_seqs = 6 #TODO: Actually add this as an argument to PolyMarker.
83
83
  max_target_seqs = opts[:max_hits] * 2 if opts[:max_hits]
84
84
  cmdline = "blastn -max_target_seqs #{max_target_seqs} -query #{query} -db #{target} -outfmt '6 qseqid qstart qend qframe sseqid sstart send sframe score pident qlen slen qseq sseq'"
85
85
 
@@ -190,6 +190,26 @@ module Bio::DB::Exonerate
190
190
  nil
191
191
  end
192
192
 
193
+ def query_position_on_target(position, base:0)
194
+ vulgar = exon_on_gene_position(position)
195
+ qr = vulgar.query_region
196
+ tr = vulgar.target_region
197
+
198
+ offset = qr.orientation == :forward ? position - qr.start + 1 : qr.end - position
199
+
200
+ #puts vulgar.to_s
201
+ #puts "SNP position: #{position}"
202
+ #puts vulgar.query_region
203
+ #puts vulgar.query_region.orientation
204
+ #puts "Offset query: #{offset}"
205
+ #puts vulgar.target_region
206
+ #puts vulgar.target_region.orientation
207
+
208
+ new_pos = tr.orientation == :forward ? offset + tr.start - 1 : tr.end - offset + 1
209
+
210
+ return new_pos
211
+ end
212
+
193
213
  def tarpostion_from_query_position(position)
194
214
  ret = nil
195
215
  vulgar_block = exon_on_gene_position(position)
@@ -206,7 +226,6 @@ module Bio::DB::Exonerate
206
226
  end
207
227
  end
208
228
 
209
-
210
229
  class Vulgar
211
230
  attr_reader :label, :query_length, :target_length, :query_start, :query_end, :target_start, :target_end, :record, :snp_in_gap
212
231
  def initialize(label, ql, tl, target_start, target_multiply, query_start, query_multiply, record)
@@ -552,7 +552,18 @@ module Bio::DB::Primer3
552
552
  #CL3339Contig1:T509C AvocetS chromosome_specific exon 4D forward
553
553
  def parse_header
554
554
  #puts "Parsing header: '#{self.sequence_id}'"
555
- @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation = self.sequence_id.split(" ")
555
+ arr = self.sequence_id.split(" ")
556
+
557
+ #if arr.size == 7 This validation can be useful to get the best primers regardless of the chromosome,
558
+ #But it is commented as it will require further testing.
559
+ @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation = arr
560
+ #else
561
+ # if arr.size == 6
562
+ # @snp, @line, @type, @in, @polymorphism, @orientation = arr
563
+ # @chromosome = ""
564
+ # end
565
+ #end
566
+
556
567
  @type = @type.to_sym
557
568
  if @in
558
569
  @in = @in.to_sym == :exon
@@ -9,7 +9,9 @@ require 'json'
9
9
  #require 'bio/db/fasta'
10
10
 
11
11
  #puts "Loading all... #{Dir[File.dirname(__FILE__) + "/bio/*/*.rb"]}"
12
+ module Bio::PolyploidTools
12
13
 
14
+ end
13
15
  Dir[File.dirname(__FILE__) + "/bio/*.rb"].each {|file|
14
16
  # puts file
15
17
  require_relative file }
@@ -22,4 +24,5 @@ require_relative File.dirname(__FILE__) + "/../conf/defaults.rb"
22
24
 
23
25
 
24
26
  #require_relative "bio/BFRTools.rb"
25
- #require_relative "bio/PolyploidTools/ExonContainer.rb"
27
+ #require_relative "bio/PolyploidTools/ExonContainer.rb"
28
+
@@ -36,17 +36,50 @@ class TestSNPparsing < Test::Unit::TestCase
36
36
  assert_equal(snp.contig, "IWGSC_CSS_1AL_scaff_1455974")
37
37
  assert_equal(snp.chromosome, "1A", "The chromosome wasnt parsed: #{snp.chromosome}")
38
38
  assert_equal(snp.position, 127, "The position is not parsed: #{snp.position}")
39
- #snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size = 100)
39
+ #snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size: 100)
40
40
  region = fasta_reference_db.index.region_for_entry(snp.contig).get_full_region
41
41
  snp.full_sequence = fasta_reference_db.fetch_sequence(region)
42
42
 
43
- assert_equal(snp.template_sequence, "actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctcYttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaag")
44
- assert_equal(snp.sequence_original, "actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctc[C/T]ttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaag")
43
+ assert_equal(snp.template_sequence, "actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctcYttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaag".upcase)
44
+ assert_equal(snp.sequence_original, "actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctc[C/T]ttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaag".upcase)
45
45
  assert_equal(snp.position, 101)
46
46
  assert_equal(snp.original, "C")
47
47
  assert_equal(snp.snp, "T")
48
+ end
49
+
50
+ def test_vcf_line
51
+ ref=@data + "/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa"
52
+ fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>ref})
48
53
 
49
-
54
+ fasta_reference_db.load_fai_entries
55
+ vcf="IWGSC_CSS_1AL_scaff_1455974 127 test_snp C T 135.03 . "
56
+
57
+ chr_arm_parser = Bio::PolyploidTools::ChromosomeArm.getArmSelection("embl");
58
+ snp = Bio::PolyploidTools::SNP.parseVCF(vcf, chr_arm_parser: chr_arm_parser)
59
+ assert_equal(snp.gene , "test_snp", "The original name was not parsed: #{snp.gene}")
60
+ assert_equal("1A", snp.chromosome, "The chromosome wasnt parsed: #{snp.chromosome}")
61
+ assert_equal(127, snp.position, "The position is not parsed: #{snp.position}")
62
+ snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size: 100)
63
+ assert_equal("actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctcYttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaaga", snp.template_sequence)
64
+ assert_equal("actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctc[C/T]ttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaag".upcase, snp.to_polymarker_sequence(100))
65
+ assert_equal(101,snp.position)
66
+ assert_equal("C",snp.original)
67
+ assert_equal("T",snp.snp)
68
+
69
+ vcf="IWGSC_CSS_1AL_scaff_1455974\t127\ttest_snp\tC\tT\t135.03\t.\tOR=reverse"
70
+
71
+ chr_arm_parser = Bio::PolyploidTools::ChromosomeArm.getArmSelection("embl");
72
+ snp = Bio::PolyploidTools::SNP.parseVCF(vcf, chr_arm_parser: chr_arm_parser)
73
+ assert_equal(snp.gene , "test_snp", "The original name was not parsed: #{snp.gene}")
74
+ assert_equal("1A", snp.chromosome, "The chromosome wasnt parsed: #{snp.chromosome}")
75
+ assert_equal(127, snp.position, "The position is not parsed: #{snp.position}")
76
+ snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size: 100)
77
+ assert_equal("actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctcYttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaaga", snp.template_sequence)
78
+ assert_equal("TCTTGTACCTACCGAGTGCAGCATGCTACGTACCTTATAGCCAGAAGCCTTGACGTGGTGGATGCGGTCTCCAAAGCGCTTGTCAAGTCCGGGTACGACAA[G/A]GAGACCTGTAAGCAGCGCGTGCTCATACAGTCAGAGGACGCCCCGGTGCTTGCGGCGTTCAAGACGTTCCCCAAGTTCCAGCGGGTGCTGACGATCGAG", snp.to_polymarker_sequence(100))
79
+ assert_equal(101,snp.position)
80
+ assert_equal("C",snp.original)
81
+ assert_equal("T",snp.snp)
82
+
50
83
  end
51
84
 
52
85
  def test_reference_snp
@@ -60,9 +93,9 @@ class TestSNPparsing < Test::Unit::TestCase
60
93
  assert_equal(snp.gene , "IWGSC_CSS_1AL_scaff_1455974", "The original name was not parsed: #{snp.gene}")
61
94
  assert_equal("1A", snp.chromosome, "The chromosome wasnt parsed: #{snp.chromosome}")
62
95
  assert_equal(127, snp.position, "The position is not parsed: #{snp.position}")
63
- snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size = 100)
96
+ snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size: 100)
64
97
  assert_equal("actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctcYttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaaga", snp.template_sequence)
65
- assert_equal("actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctc[C/T]ttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaag", snp.to_polymarker_sequence(100))
98
+ assert_equal("actcgatcgtcagcacccgctggaacttggggaacgtcttgaacgccgcaagcaccggggcgtcctctgactgtatgagcacgcgctgcttacaggtctc[C/T]ttgtcgtacccggacttgacaagcgctttggagaccgcatccaccacgtcaaggcttctggctataaggtacgtagcatgctgcactcggtaggtacaag".upcase, snp.to_polymarker_sequence(100))
66
99
  assert_equal(101,snp.position)
67
100
  assert_equal("C",snp.original)
68
101
  assert_equal("T",snp.snp)
@@ -70,14 +103,14 @@ class TestSNPparsing < Test::Unit::TestCase
70
103
  flanking_size = 3
71
104
 
72
105
  snp = Bio::PolyploidTools::SNP.parse("IWGSC_CSS_1DL_scaff_2258883,A,12498,C,1D")
73
- snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size = flanking_size)
106
+ snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size: flanking_size)
74
107
  assert_equal(4,snp.position)
75
108
  assert_equal("A",snp.original)
76
109
  assert_equal("C",snp.snp)
77
110
  assert_equal("gatM", snp.template_sequence)
78
111
 
79
112
  snp = Bio::PolyploidTools::SNP.parse("IWGSC_CSS_1BL_scaff_3810460,G,1,T,1B")
80
- snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size = flanking_size)
113
+ snp.setTemplateFromFastaFile(fasta_reference_db, flanking_size: flanking_size)
81
114
  assert_equal(1,snp.position)
82
115
  assert_equal("G",snp.original)
83
116
  assert_equal("T",snp.snp)
@@ -85,4 +118,4 @@ class TestSNPparsing < Test::Unit::TestCase
85
118
  end
86
119
 
87
120
 
88
- end
121
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-polyploid-tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.9
4
+ version: 0.9.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ricardo H. Ramirez-Gonzalez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-11-21 00:00:00.000000000 Z
11
+ date: 2019-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -127,6 +127,7 @@ executables:
127
127
  - mafft_triads.rb
128
128
  - mafft_triads_promoters.rb
129
129
  - map_markers_to_contigs.rb
130
+ - marker_to_vcf.rb
130
131
  - markers_in_region.rb
131
132
  - mask_triads.rb
132
133
  - polymarker.rb
@@ -135,6 +136,7 @@ executables:
135
136
  - snps_between_bams.rb
136
137
  - tag_stats.rb
137
138
  - vcfLineToTable.rb
139
+ - vcfToPolyMarker.rb
138
140
  extensions: []
139
141
  extra_rdoc_files:
140
142
  - README
@@ -161,6 +163,7 @@ files:
161
163
  - bin/mafft_triads.rb
162
164
  - bin/mafft_triads_promoters.rb
163
165
  - bin/map_markers_to_contigs.rb
166
+ - bin/marker_to_vcf.rb
164
167
  - bin/markers_in_region.rb
165
168
  - bin/mask_triads.rb
166
169
  - bin/polymarker.rb
@@ -169,6 +172,7 @@ files:
169
172
  - bin/snps_between_bams.rb
170
173
  - bin/tag_stats.rb
171
174
  - bin/vcfLineToTable.rb
175
+ - bin/vcfToPolyMarker.rb
172
176
  - bio-polyploid-tools.gemspec
173
177
  - conf/defaults.rb
174
178
  - conf/primer3_config/dangle.dh
@@ -320,7 +324,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
320
324
  version: '0'
321
325
  requirements: []
322
326
  rubyforge_project:
323
- rubygems_version: 2.7.7
327
+ rubygems_version: 2.6.14
324
328
  signing_key:
325
329
  specification_version: 4
326
330
  summary: Tool to work with polyploids, NGS and molecular biology