bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
# Default settings; every value can be overridden from the command line below.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder] = Dir.mktmpdir
options[:program] = "blastn"
options[:random_sample] = 0
options[:cut_promoter_length] = 0
options[:reverse] = true

OptionParser.new do |opts|
  # The banner names this script (it previously named filter_blat.rb).
  opts.banner = "Usage: blast_triads_promoters.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE", "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be everything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currently only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
    options[:random_sample] = o.to_i
  end

  opts.on("-l", "--cut_promoter_length INT", "Bases to consider") do |o|
    options[:cut_promoter_length] = o.to_i
  end

  # Only the literal values 'T' and 'F' are accepted; anything else aborts.
  opts.on("-v", "--reverse T|F", "Reverse the input bases") do |o|
    case o
    when 'T' then options[:reverse] = true
    when 'F' then options[:reverse] = false
    else
      $stderr.puts "Invalid option for reverse (should be T or F)"
      exit(-1)
    end
  end
end.parse!
66
+
67
+
68
# Aligns two FASTA files with BLAST and summarises the longest HSP.
#
# Runs `program -query path_a -subject path_b -task program -out out_path
# -outfmt 5` and then reads the XML report at +out_path+ through
# Bio::BlastXMLParser::XmlIterator.
#
# @param path_a [String] FASTA file passed as the BLAST query
# @param path_b [String] FASTA file passed as the BLAST subject
# @param out_path [String] file that receives the BLAST XML report
# @param program [String] BLAST executable; also passed as the `-task` value
# @return [Array] `[length, percent_identity]` of the longest HSP found, or
#   `[0, 0.0]` when the report contains no HSP
# @raise [RuntimeError] if the BLAST command cannot be started or fails
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  # Argument-vector form: no shell is involved, so paths with spaces or shell
  # metacharacters are handed to BLAST verbatim.
  cmd = [program, "-query", path_a, "-subject", path_b,
         "-task", program, "-out", out_path, "-outfmt", "5"]
  # A report left at out_path by an earlier call must not be parsed as if it
  # belonged to this pair, so a failed run stops here.
  raise "BLAST command failed: #{cmd.join(' ')}" unless system(*cmd)

  max_length = 0
  max_pident = 0.0
  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iteration|
    iteration.each do |hit|
      hit.each do |hsp|
        # Strictly greater: on ties the first HSP seen is kept.
        next unless hsp.align_len > max_length
        max_length = hsp.align_len
        max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
      end
    end
  end
  [max_length, max_pident]
end
90
+
91
split_token = options[:split_token]

# One FASTA entry per gene name (the part of the entry id before split_token).
# When several entries share a gene name, the one reporting the greatest
# length is kept.
sequences = {}
sequence_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]
    seq = entry.naseq
    seq.reverse_complement! if options[:reverse]
    seq = seq[0, options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
    entry.data = seq
    current = sequences[gene_name]
    sequences[gene_name] = entry if current.nil? || entry.length > current.length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
#FileUtils.mkdir_p(options[:tmp_folder])
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

# Scratch files, overwritten for every triad.
a_tmp   = options[:tmp_folder] + "/A.fa"
b_tmp   = options[:tmp_folder] + "/B.fa"
d_tmp   = options[:tmp_folder] + "/D.fa"
out_tmp = options[:tmp_folder] + "/out.blast"

# Header of the tab-separated table written to stdout.
puts [
  "group_id" , "query" , "subject" ,
  "chr_query", "chr_subject", "aln_type",
  "length" , "pident" , "Ns_query", "Ns_subject", "Ns_total" ].join("\t")

# The triads file is later read with headers: true, so its first line is a
# header and is not counted as a triad.
count_lines = File.foreach(options[:triads]).count - 1

# Chance of running each triad so that roughly random_sample of them are kept.
# 0 means "run everything".
probability = options[:random_sample] / count_lines.to_f
probability = 1 if options[:random_sample] == 0
# Asking for more triads than exist means "run everything" too.
probability = 1 if probability > 1
prng = Random.new
#puts probability
prom_len = options[:cut_promoter_length]
134
# For every sampled triad: write the available A/B/D sequences to the scratch
# FASTA files, keep a copy under blast_alignments_<prom_len>/, BLAST each
# available pair (A->B, A->D, B->D) and print one table row per pair.
CSV.foreach(options[:triads], headers: true) do |row|
  triad = row['group_id'].to_i
  triad_folder = triad / 100 # integer division: groups saved triads in buckets of 100

  # probability >= 1 runs every triad; otherwise the triad is sampled.
  next unless probability >= 1 || probability > prng.rand

  save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"
  FileUtils.mkdir_p save_folder

  # Genomes of this triad for which a sequence was loaded.
  present = {}
  { "A" => a_tmp, "B" => b_tmp, "D" => d_tmp }.each do |genome, tmp_path|
    name = row[genome]
    seq = sequences[name]
    next unless seq

    File.open(tmp_path, 'w') { |f| f.write(seq) }
    FileUtils.cp(tmp_path, save_folder)
    present[genome] = { name: name, path: tmp_path, ns: seq.seq.count('Nn') }
  end

  [%w[A B], %w[A D], %w[B D]].each do |query, subject|
    q = present[query]
    s = present[subject]
    next unless q && s

    # The query/subject columns carry the names of the two genes actually
    # aligned (previously the A and B names were printed for every pair).
    to_print = [triad, q[:name], s[:name], query, subject, "#{query}->#{subject}"]
    to_print << blast_pair_fast(q[:path], s[:path], out_tmp, program: options[:program])
    to_print << q[:ns]
    to_print << s[:ns]
    to_print << (q[:ns] + s[:ns])
    FileUtils.cp(out_tmp, "#{save_folder}/#{query}_#{subject}.xml")
    # Array#join flattens the nested [length, pident] pair into two columns.
    puts to_print.join("\t")
  end
end
@@ -0,0 +1,36 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+ require 'rubygems'
5
+ require 'pathname'
6
+ require 'bio-samtools-wrapper'
7
+
8
+ require 'set'
9
+
10
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
11
+ $: << File.expand_path('.')
12
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
13
+ require path
14
+
15
reference = ARGV[0]
puts reference

fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
fasta_db.load_fai_entries
bam = Bio::DB::Sam.new({:fasta=>reference, :bam=>ARGV[1]})

# For every entry of the FASTA index: print id, region size, ambiguity count
# and ambiguities per 1000 bases, followed by the consensus on the next line.
fasta_db.index.entries.each do |entry|
  full_region = entry.get_full_region
  consensus = bam.consensus_with_ambiguities({:region=>full_region, :case=>true})
  ambiguities = consensus.count_ambiguities
  per_kb = (1000 * ambiguities.to_f ) / full_region.size

  puts "#{entry.id}\t#{full_region.size}\t#{ambiguities}\t#{per_kb}\n#{consensus}"
end
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'optparse'
4
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
5
+ $: << File.expand_path('.')
6
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
7
+ require path
8
module Bio
  class Blat
    # Subclass of Report that hands hits to a block one at a time.
    class StreamedReport < Report

      # Yields a Hit built from each alignment line of +text+.
      #
      # Lines are skipped as header until either a line consisting only of
      # dashes is read (that separator is not yielded) or a line starting with
      # a digit is read (headerless data; that line is yielded as a hit).
      # Every following line is yielded as a hit.
      #
      # +text+ is anything responding to each_line.
      def self.each_hit(text = '')
        in_hits = false

        text.each_line do |line|
          in_hits = true if !in_hits && /^\d/ =~ line

          if in_hits
            yield Hit.new(line)
          elsif /\A\-+\s*\z/ =~ line.chomp
            in_hits = true
          end
        end
      end
    end
  end
end
37
+
38
+
39
+ #blat_file=ARGV[0]
40
+
41
# Defaults: minimum percentage identity and minimum percentage covered.
options = {}
options[:identity] = 95
options[:covered] = 60
OptionParser.new do |opts|

  opts.banner = "Usage: filter_blat_by_target_coverage.rb [options]"

  opts.on("-p", "--psl FILE", "PSL file") do |o|
    # The path is kept exactly as typed; upcasing it breaks the lookup on
    # case-sensitive file systems.
    options[:blat_file] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--covered FLOAT", "Minimum percentage coverage") do |o|
    options[:covered] = o.to_f
  end

end.parse!

blat_file = options[:blat_file]
abort "Missing required option: --psl FILE" unless blat_file

# Print, as tab-separated PSL fields, every hit that reaches both thresholds.
Bio::Blat::StreamedReport.each_hit(Bio::FlatFile.open(blat_file).to_io) do |hit|
  if hit.percentage_covered >= options[:covered] && hit.percent_identity >= options[:identity]
    puts hit.data.join("\t")
  end
end
68
+
69
+
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'optparse'
4
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
5
+ $: << File.expand_path('.')
6
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
7
+ require path
8
+
9
# Defaults: minimum percentage identity and minimum percentage query coverage.
options = {}
options[:identity] = 95
options[:covered] = 90
OptionParser.new do |opts|

  opts.banner = "Usage: filter_exonerate_by_identity.rb [options]"

  opts.on("-e", "--exo FILE", "Exonerate alignment produced by polymarker or with the following ryo: 'RESULT:\\t%S\\t%pi\\t%ql\\t%tl\\t%g\\t%V\\n'") do |o|
    # The path is kept exactly as typed; upcasing it breaks the lookup on
    # case-sensitive file systems.
    options[:exo_file] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--covered FLOAT", "Minimum percentage coverage") do |o|
    options[:covered] = o.to_f
  end

end.parse!

exo_file = options[:exo_file]
abort "Missing required option: --exo FILE" unless exo_file
min_identity = options[:identity]
min_coverage = options[:covered]

# Echo every alignment line that is strictly above both thresholds.
File.foreach(exo_file) do |line|
  aln = Bio::DB::Exonerate::Alignment.parse_custom(line)
  puts aln.line if aln.identity > min_identity && aln.query_coverage > min_coverage
end
38
+
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+
4
# Records, for every query in a BLAT report, the hit with the highest score.
#
# @param blat_filename [String] path of the BLAT (PSL) report to read
# @param best_aln [Hash] updated in place; maps a query id to its best hit so
#   far. Entries already present take part in the comparison, so several
#   files can be folded into the same hash.
# @return [Hash] +best_aln+
def load_blat_alignments(blat_filename, best_aln)
  # Block form so the file handle is closed even if parsing raises.
  File.open(blat_filename) do |io|
    Bio::Blat::Report.new(io).each_hit do |hit|
      best = best_aln[hit.query_id]
      # Strictly greater: on equal scores the hit seen first is kept.
      best_aln[hit.query_id] = hit if best.nil? || hit.score > best.score
    end
  end
  best_aln
end
24
+
25
# Collect the best hit per query from the report named on the command line,
# then print a header line followed by each kept hit as tab-separated fields.
best_aln = {}
load_blat_alignments(ARGV[0], best_aln)

puts "QUERY\tTARGET"
best_aln.each_value do |hit|
  #puts "#{k}\t#{hit.target_id}"
  puts hit.data.join("\t")
end
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+
4
require 'set'

# NOTE(review): this script is a fragment. temp_fasta_query, target, model,
# chunk, total_chunks, min_identity, exo_f, contigs_f and fasta_file are not
# defined anywhere in this file and have to be provided before it can run.
# The original left `:total_chunks=>` without a value (a syntax error); a
# variable named total_chunks is assumed here - TODO confirm.
found_contigs = Set.new
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model, :chunk=>chunk, :total_chunks=>total_chunks}) do |aln|
  if aln.identity > min_identity
    exo_f.puts aln.line
    unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
      found_contigs.add(aln.target_id)
      entry = fasta_file.index.region_for_entry(aln.target_id)
      # The message names the target FASTA; its index is expected next to it as <target>.fai.
      raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{target}.fai was generated properly." if entry.nil?
      region = entry.get_full_region
      seq = fasta_file.fetch_sequence(region)
      contigs_f.puts(">#{aln.target_id}\n#{seq}")
    end
  end
end
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ #$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
6
+ #$: << File.expand_path('.')
7
+ #path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
8
+ #require path
9
+
10
# Defaults. "-" for :blastx means the BLAST report is read from ARGF.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:blastx] = "-"

OptionParser.new do |opts|

  # The banner names this script (it previously named filter_blat.rb).
  opts.banner = "Usage: get_longest_hsp_blastx_triads.rb [options]"

  opts.on("-p", "--blastx FILE", "BLAST XML file") do |o|
    options[:blastx] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

end.parse!

# Expected partner of each gene, taken from the triads table.
valid_pairs_A_B = {}
valid_pairs_A_D = {}
valid_pairs_B_D = {}

CSV.foreach(options[:triads], headers: true) do |row|
  valid_pairs_A_B[row['A']] = row['B']
  valid_pairs_A_D[row['A']] = row['D']
  valid_pairs_B_D[row['B']] = row['D']
end

# IO.open expects a file descriptor, not a path, so a named report is opened
# with File.open.
read_from_file = options[:blastx] != "-"
stream = read_from_file ? File.open(options[:blastx]) : ARGF
puts "Loaded #{valid_pairs_B_D.length} triads"
$stdout.flush

blast_report = Bio::FlatFile.new(Bio::Blast::Report, stream)

blast_report.each_entry do |report|
  puts "Hits for " + report.query_def + " against " + report.db
  $stdout.flush
  report.each do |hit|
    # Gene names are the part of the sequence ids before the first "-".
    query = hit.query_id.split("-")[0]
    target = hit.target_id.split("-")[0]
    next unless valid_pairs_A_B[query] == target || valid_pairs_A_D[query] == target || valid_pairs_B_D[query] == target

    # One tab-separated line (puts with several arguments prints one line per argument).
    puts "#{hit.target_id}\t#{hit.evalue}" if hit.evalue < 0.001
    puts hit.inspect
  end
end

# Only close what this script opened.
stream.close if read_from_file
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools-wrapper'
6
+
7
+ require 'set'
8
+
9
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
10
+ $: << File.expand_path('.')
11
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
12
+ require path
13
+
14
+
15
+ #TODO: Use temporary files somewhere in the file system and add traps to delete them/forward them as a result.
16
+ #TODO: Make all this parameters
17
+ path_to_contigs="/Users/ramirezr/Documents/PHD/201305_Databases/iwgcs"
18
+ #path_to_contigs=path_to_chromosomes
19
+ snp_in="A"
20
+ original_name="B"
21
+ fasta_reference = nil
22
+ #test_file="/Users/ramirezr/Dropbox/JIC/PrimersToTest/test_primers_nick_and_james_1.csv"
23
+ test_file=ARGV[0]
24
+ fasta_reference = ARGV[1] if ARGV[1]
25
+ output_folder="#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}/"
26
+ Dir.mkdir(output_folder)
27
+ #TODO Make this tmp files
28
+ temp_fasta_query="#{output_folder}to_align.fa"
29
+ temp_contigs="#{output_folder}contigs_tmp.fa"
30
+ exonerate_file="#{output_folder}exonerate_tmp.tab"
31
+ primer_3_input="#{output_folder}primer_3_input_temp"
32
+ primer_3_output="#{output_folder}primer_3_output_temp"
33
+ exons_filename="#{output_folder}exons_genes_and_contigs.fa"
34
+ output_primers="#{output_folder}primers.csv"
35
+
36
+ primer_3_config=File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config')
37
+ model="est2genome"
38
+
39
+
40
+ min_identity= 92
41
+ snps = Array.new
42
+
43
+ #0. Load the fasta index
44
+ fasta_reference_db = nil
45
+ if fasta_reference
46
+ fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
47
+ fasta_reference_db.load_fai_entries
48
+ p "Fasta reference: #{fasta_reference}"
49
+ end
50
+
51
+
52
+ #1. Read all the SNP files
53
+ #All the SNPs should be on the same chromosome as the first SNP.
54
# Parse one SNP per line of test_file. With a single command-line argument the
# line is parsed by SNPSequence; with two arguments it is parsed by SNP and
# the template sequence is fetched from the FASTA reference.
# Every SNP must report the same chromosome as the first one.
chromosome = nil
File.open(test_file) do | f |
  f.each_line do | line |
    snp = case ARGV.size
          when 1 then Bio::PolyploidTools::SNPSequence.parse(line) #List with Sequence
          when 2 then Bio::PolyploidTools::SNP.parse(line)         #List and fasta file
          else raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
          end
    # Checked before the SNP is used, so an unparsable line gives this message
    # instead of a NoMethodError on nil.
    raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp.nil?

    if ARGV.size == 2
      entry = fasta_reference_db.index.region_for_entry(snp.gene)
      # presumably region_for_entry returns nil for an unknown name - TODO confirm
      raise Bio::DB::Exonerate::ExonerateException.new "Entry not found! #{snp.gene}. Make sure that the #{fasta_reference}.fai was generated properly." if entry.nil?
      snp.template_sequence = fasta_reference_db.fetch_sequence(entry.get_full_region)
    end

    snp.snp_in = snp_in
    snp.original_name = original_name
    snps << snp
    chromosome ||= snp.chromosome
    raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome" if chromosome != snp.chromosome
  end
end
76
+
77
+ #1.1 Close fasta file
78
+ #fasta_reference_db.close() if fasta_reference_db
79
+ #2. Generate all the fasta files
80
+
81
# Write each gene once to the query FASTA, even when several SNPs share it.
written_seqs = Set.new
File.open(temp_fasta_query, "w") do |query_f|
  snps.each do |snp|
    # Set#add? returns nil when the gene was already written.
    query_f.puts snp.to_fasta if written_seqs.add?(snp.gene)
  end
end

#3. Run exonerate on each of the possible chromosomes for the SNP
puts chromosome
chr_group = chromosome[0]
# Block form so both output files are closed even if exonerate or the index
# lookup raises.
File.open(exonerate_file, "w") do |exo_f|
  File.open(temp_contigs, "w") do |contigs_f|
    Dir.foreach(path_to_contigs) do |filename|
      # Only the FASTA files whose name starts with the chromosome group.
      next unless File.fnmatch("#{chr_group}*.fa", filename)

      puts filename
      target = "#{path_to_contigs}/#{filename}"

      fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
      fasta_file.load_fai_entries
      Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
        if aln.identity > min_identity
          exo_f.puts aln.line
          entry = fasta_file.index.region_for_entry(aln.target_id)
          # presumably region_for_entry returns nil for an unknown name - TODO confirm
          raise Bio::DB::Exonerate::ExonerateException.new "Entry not found! #{aln.target_id}. Make sure that the #{target}.fai was generated properly." if entry.nil?
          seq = fasta_file.fetch_sequence(entry.get_full_region)
          contigs_f.puts(">#{aln.target_id}\n#{seq}")
        end
      end
    end
  end
end
118
+
119
+ #4. Load all the results from exonerate and get the input filename for primer3
120
+ #Custom arm selection function that only uses the first two characters. Maybe
121
+ #we want to make it a bit more clever
122
+ arm_selection = lambda do | contig_name |
123
+ ret = contig_name[0,2]
124
+ return ret
125
+ end
126
+
127
+ container= Bio::PolyploidTools::ExonContainer.new
128
+ container.flanking_size=100
129
+ container.gene_models(temp_fasta_query)
130
+ container.chromosomes(temp_contigs)
131
+ container.add_parental({:name=>snp_in})
132
+ container.add_parental({:name=>original_name})
133
+ snps.each do |snp|
134
+ snp.container = container
135
+ snp.flanking_size = container.flanking_size
136
+ container.add_snp(snp)
137
+ end
138
+ container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection})
139
+
140
+ file = File.open(exons_filename, "w")
141
+ container.print_fasta_snp_exones(file)
142
+ file.close
143
+
144
+ file = File.open(primer_3_input, "w")
145
+ file.puts("PRIMER_PRODUCT_SIZE_RANGE=50-150")
146
+ file.puts("PRIMER_MAX_SIZE=25")
147
+ file.puts("PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS=1")
148
+ file.puts("PRIMER_LIBERAL_BASE=1")
149
+ file.puts("PRIMER_NUM_RETURN=5")
150
+ file.puts("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=#{primer_3_config}/")
151
+ container.print_primer_3_exons(file, chromosome,snp_in)
152
+ file.close
153
+
154
+ Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output})
155
+
156
+ #5. Pick the best primer and make the primer3 output
157
+ kasp_container=Bio::DB::Primer3::KASPContainer.new
158
+ kasp_container.line_1=snp_in
159
+ kasp_container.line_2=original_name
160
+
161
+ snps.each do |snp|
162
+ kasp_container.add_snp(snp)
163
+ end
164
+
165
+ kasp_container.add_primers_file(primer_3_output)
166
+ header = "Marker,SNP,RegionSize,SNP_type,#{snp_in},#{original_name},common,primer_type,orientation,#{snp_in}_TM,#{original_name}_TM,common_TM,selected_from,product_size"
167
+ File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
168
+