bio-polymarker 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
# Default settings for the promoter triad BLAST run. Every value except
# :tmp_folder can be overridden from the command line below.
options = {
  identity: 50,
  min_bases: 200,
  split_token: "-",
  tmp_folder: Dir.mktmpdir, # scratch directory for the per-triad FASTA/BLAST files
  program: "blastn",
  random_sample: 0,         # 0 means "run every triad"
  cut_promoter_length: 0,   # 0 means "do not trim the sequences"
  reverse: true
}

OptionParser.new do |opts|
  # The banner previously named a different script (filter_blat.rb).
  opts.banner = "Usage: blast_triads_promoters.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE", "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be everything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currently only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
    options[:random_sample] = o.to_i
  end

  opts.on("-l", "--cut_promoter_length INT", "Bases to consider") do |o|
    options[:cut_promoter_length] = o.to_i
  end

  opts.on("-v", "--reverse T|F", "Reverse the input bases") do |o|
    case o
    when 'T' then options[:reverse] = true
    when 'F' then options[:reverse] = false
    else
      $stderr.puts "Invalid option for reverse (should be T or F)"
      # Parenthesised so Ruby does not warn about an ambiguous first argument.
      exit(-1)
    end
  end
end.parse!
66
+
67
+
68
# Aligns one FASTA file against another with BLAST and reports the longest HSP.
#
# Runs +program+ with +path_a+ as the query and +path_b+ as the subject,
# writing an XML report (outfmt 5) to +out_path+, then scans every HSP of
# every hit in that report.
#
# @param path_a [String] path of the query FASTA file
# @param path_b [String] path of the subject FASTA file
# @param out_path [String] path the XML report is written to (overwritten)
# @param program [String] BLAST executable to run; also passed as -task
# @return [Array(Integer, Float)] alignment length of the longest HSP and its
#   percentage identity; [0, 0.0] when there is no HSP or BLAST did not run
#   successfully
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  # Argument-list form of system: no shell is involved, so paths containing
  # spaces or shell metacharacters are passed through untouched.
  executed = system(program, "-query", path_a, "-subject", path_b,
                    "-task", program, "-out", out_path, "-outfmt", "5")

  max_length = 0
  max_pident = 0.0

  unless executed
    # out_path may still hold the report of an earlier call; parsing it would
    # attribute that alignment to this pair, so report "no alignment" instead.
    $stderr.puts "#{program} failed for #{path_a} vs #{path_b}"
    return [max_length, max_pident]
  end

  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iteration|
    iteration.each do |hit|
      hit.each do |hsp|
        next unless hsp.align_len > max_length

        max_length = hsp.align_len
        max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
      end
    end
  end

  [max_length, max_pident]
end
90
+
91
split_token = options[:split_token]

# Load the FASTA file, keeping for every gene (the part of the entry id before
# the split token) the longest of its sequences after the optional
# reverse-complement and trimming steps.
sequences = {}
sequence_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]
    seq = entry.naseq
    seq.reverse_complement! if options[:reverse]
    seq = seq[0, options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
    entry.data = seq
    current = sequences[gene_name]
    sequences[gene_name] = entry if current.nil? || entry.length > current.length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

# One scratch FASTA per subgenome plus a shared BLAST report, reused per triad.
tmp_paths = {
  "A" => options[:tmp_folder] + "/A.fa",
  "B" => options[:tmp_folder] + "/B.fa",
  "D" => options[:tmp_folder] + "/D.fa"
}
out_tmp = options[:tmp_folder] + "/out.blast"

puts [
  "group_id", "query", "subject",
  "chr_query", "chr_subject", "aln_type",
  "length", "pident", "Ns_query", "Ns_subject", "Ns_total"
].join("\t")

count_lines = File.foreach(options[:triads]).count

# Chance of running any given triad so that roughly :random_sample of them run.
probability = options[:random_sample] / count_lines.to_f
probability = 1 if options[:random_sample] == 0
prng = Random.new
prom_len = options[:cut_promoter_length]

CSV.foreach(options[:triads], headers: true) do |row|
  triad = row['group_id'].to_i
  triad_folder = triad / 100

  # ">= 1" also covers a sample size larger than the file; previously that
  # case (probability > 1) skipped every triad.
  next unless probability >= 1 || probability > prng.rand

  genes = { "A" => row['A'], "B" => row['B'], "D" => row['D'] }
  save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"
  FileUtils.mkdir_p save_folder

  # Write each available sequence to its scratch file, keep a copy next to
  # the alignments, and count its N bases.
  ns = {}
  genes.each do |genome, gene|
    entry = sequences[gene]
    next unless entry

    File.open(tmp_paths[genome], 'w') { |f| f.write(entry) }
    FileUtils.cp(tmp_paths[genome], save_folder)
    ns[genome] = entry.seq.count('Nn')
  end

  [%w[A B], %w[A D], %w[B D]].each do |query, subject|
    next unless ns.key?(query) && ns.key?(subject)

    # The query/subject columns now name the genes actually aligned; the
    # A->D and B->D rows used to print the A and B gene names regardless.
    to_print = [triad, genes[query], genes[subject], query, subject, "#{query}->#{subject}"]
    to_print.concat(blast_pair_fast(tmp_paths[query], tmp_paths[subject], out_tmp, program: options[:program]))
    to_print << ns[query]
    to_print << ns[subject]
    to_print << ns[query] + ns[subject]
    FileUtils.cp(out_tmp, "#{save_folder}/#{query}_#{subject}.xml")
    puts to_print.join("\t")
  end
end
@@ -0,0 +1,36 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+ require 'rubygems'
5
+ require 'pathname'
6
+ require 'bio-samtools-wrapper'
7
+
8
+ require 'set'
9
+
10
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
11
+ $: << File.expand_path('.')
12
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
13
+ require path
14
+
15
# Usage: count_variations.rb reference.fa alignments.bam
#
# For every entry in the reference index, prints a tab-separated line with the
# entry id, region size, ambiguity count and ambiguities per 1000 bases,
# followed by the consensus itself on the next line.
reference = ARGV[0]
puts reference

reference_db = Bio::DB::Fasta::FastaFile.new({ :fasta => reference })
reference_db.load_fai_entries
alignments = Bio::DB::Sam.new({ :fasta => reference, :bam => ARGV[1] })

reference_db.index.entries.each do |entry|
  region = entry.get_full_region
  consensus = alignments.consensus_with_ambiguities({ :region => region, :case => true })
  ambiguities = consensus.count_ambiguities
  per_kilobase = (1000 * ambiguities.to_f) / region.size

  summary = [entry.id, region.size, ambiguities, per_kilobase].join("\t")
  puts "#{summary}\n#{consensus}"
end
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'optparse'
4
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
5
+ $: << File.expand_path('.')
6
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
7
+ require path
8
module Bio
  class Blat
    # Variant of Report that hands hits to the caller one at a time instead of
    # collecting them.
    class StreamedReport < Report

      # Yields a Hit for every data line of +text+.
      #
      # Lines are treated as header until either a line starting with a digit
      # (headerless data; that line is already a hit) or a line made only of
      # dashes (the separator, which is not itself a hit) is seen.
      #
      # @param text [#each_line] PSL content, e.g. a String or an IO
      # @yieldparam hit [Hit] one parsed alignment line
      def self.each_hit(text = '')
        in_hits = false

        text.each_line do |line|
          unless in_hits
            if /^\d/ =~ line
              # Headerless data: fall through and emit this very line.
              in_hits = true
            else
              in_hits = true if /\A\-+\s*\z/ =~ line.chomp
              next
            end
          end

          yield Hit.new(line)
        end
      end
    end
  end
end
37
+
38
+
39
+ #blat_file=ARGV[0]
40
+
41
# Thresholds (in percent) a hit must reach to be printed.
options = { identity: 95, covered: 60 }

OptionParser.new do |opts|
  opts.banner = "Usage: filter_blat_by_target_coverage.rb [options]"

  opts.on("-p", "--psl FILE", "PSL file") do |o|
    # Keep the path exactly as given: upcasing it pointed at a different
    # (usually non-existent) file on case-sensitive file systems.
    options[:blat_file] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--covered FLOAT", "Minimum percentage coverage") do |o|
    options[:covered] = o.to_f
  end
end.parse!

blat_file = options[:blat_file]
abort "Missing PSL file (use --psl FILE)" unless blat_file

# Print, tab separated, every hit that reaches both thresholds.
Bio::Blat::StreamedReport.each_hit(Bio::FlatFile.open(blat_file).to_io) do |hit|
  if hit.percentage_covered >= options[:covered] && hit.percent_identity >= options[:identity]
    puts hit.data.join("\t")
  end
end
68
+
69
+
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'optparse'
4
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
5
+ $: << File.expand_path('.')
6
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
7
+ require path
8
+
9
# Thresholds (in percent) an alignment must exceed to be printed.
options = { identity: 95, covered: 90 }

OptionParser.new do |opts|
  opts.banner = "Usage: filter_exonerate_by_identity.rb [options]"

  opts.on("-e", "--exo FILE", "Exonerate alignment produced by polymarker or with the following ryo: 'RESULT:\\t%S\\t%pi\\t%ql\\t%tl\\t%g\\t%V\\n'") do |o|
    # Keep the path exactly as given: upcasing it pointed at a different
    # (usually non-existent) file on case-sensitive file systems.
    options[:exo_file] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--covered FLOAT", "Minimum percentage coverage") do |o|
    options[:covered] = o.to_f
  end
end.parse!

exo_file = options[:exo_file]
abort "Missing exonerate file (use --exo FILE)" unless exo_file

min_identity = options[:identity]
min_coverage = options[:covered]

# Both comparisons are strict: an alignment exactly at a threshold is dropped.
File.foreach(exo_file) do |line|
  aln = Bio::DB::Exonerate::Alignment.parse_custom(line)
  puts aln.line if aln.identity > min_identity && aln.query_coverage > min_coverage
end
38
+
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+
4
# Reads a BLAT report and records the best-scoring hit of every query.
#
# +best_aln+ is updated in place; entries it already holds take part in the
# comparison, so the method can be called for several files in turn. On equal
# scores the hit seen first is kept.
#
# @param blat_filename [String] path of the BLAT (PSL) file
# @param best_aln [Hash] query id => best hit seen so far (mutated)
def load_blat_alignments(blat_filename, best_aln)
  blat_aln = Bio::Blat::Report.new(Bio::FlatFile.open(blat_filename).to_io)
  blat_aln.each_hit do |hit|
    query = hit.query_id
    best = best_aln[query]
    best_aln[query] = hit if best.nil? || hit.score > best.score
  end
end
24
+
25
# Usage: find_best_blat_hit.rb alignments.psl
#
# Prints a header line followed by the tab-separated fields of the
# best-scoring hit of every query.
best_aln = {}
load_blat_alignments(ARGV[0], best_aln)

puts "QUERY\tTARGET"
best_aln.each_value { |hit| puts hit.data.join("\t") }
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+
4
require 'set'

# NOTE(review): this script is a fragment. temp_fasta_query, target, model,
# chunk, total_chunks, min_identity, exo_f, contigs_f and fasta_file are not
# defined anywhere in it and must be supplied before it can run.
# The :total_chunks value was left blank in the original (a syntax error);
# a total_chunks variable alongside chunk is assumed here - TODO confirm.
found_contigs = Set.new
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model, :chunk=>chunk, :total_chunks=>total_chunks}) do |aln|
  next unless aln.identity > min_identity

  exo_f.puts aln.line

  # Each contig is written only once, which keeps the output file small.
  next unless found_contigs.add?(aln.target_id)

  entry = fasta_file.index.region_for_entry(aln.target_id)
  # The message names the target file; the original interpolated an
  # undefined target_id variable.
  raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{target}.fai was generated properly." if entry.nil?

  region = entry.get_full_region
  seq = fasta_file.fetch_sequence(region)
  contigs_f.puts(">#{aln.target_id}\n#{seq}")
end
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ #$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
6
+ #$: << File.expand_path('.')
7
+ #path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
8
+ #require path
9
+
10
# Defaults; :blastx set to "-" means "read the BLAST report from ARGF".
options = { identity: 50, min_bases: 200, blastx: "-" }

OptionParser.new do |opts|
  # The banner previously named a different script (filter_blat.rb).
  opts.banner = "Usage: get_longest_hsp_blastx_triads.rb [options]"

  opts.on("-p", "--blastx FILE", "BLAST XML file") do |o|
    options[:blastx] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end
  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end
end.parse!

# Expected partner of each gene, one lookup table per pair of columns.
pairs_a_b = {}
pairs_a_d = {}
pairs_b_d = {}

CSV.foreach(options[:triads], headers: true) do |row|
  pairs_a_b[row['A']] = row['B']
  pairs_a_d[row['A']] = row['D']
  pairs_b_d[row['B']] = row['D']
end

from_file = options[:blastx] != "-"
# File.open takes a path; IO.open expects a numeric file descriptor.
stream = from_file ? File.open(options[:blastx]) : ARGF
puts "Loaded #{pairs_b_d.length} triads"
$stdout.flush

blast_report = Bio::FlatFile.new(Bio::Blast::Report, stream)

blast_report.each_entry do |report|
  puts "Hits for " + report.query_def + " against " + report.db
  $stdout.flush
  report.each do |hit|
    query = hit.query_id.split("-")[0]
    target = hit.target_id.split("-")[0]
    next unless pairs_a_b[query] == target || pairs_a_d[query] == target || pairs_b_d[query] == target

    # One tab-separated line; passing the pieces as separate puts arguments
    # printed each of them on its own line.
    puts "#{hit.target_id}\t#{hit.evalue}" if hit.evalue < 0.001
    puts hit.inspect
  end
end

# Only close what this script opened (the original tested the unrelated
# :blat_file key, so the check never matched).
stream.close if from_file
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools-wrapper'
6
+
7
+ require 'set'
8
+
9
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
10
+ $: << File.expand_path('.')
11
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
12
+ require path
13
+
14
+
15
+ #TODO: Use temporary files somewhere in the file system and add traps to delete them/forward them as a result.
16
+ #TODO: Make all this parameters
17
# Directory scanned for the per-chromosome contig FASTA files (hard coded).
path_to_contigs = "/Users/ramirezr/Documents/PHD/201305_Databases/iwgcs"
snp_in = "A"
original_name = "B"

# ARGV[0]: SNP list; ARGV[1] (optional): FASTA file with the gene sequences.
test_file = ARGV[0]
fasta_reference = ARGV[1]

# Every intermediate and result file goes into a fresh, timestamped folder.
output_folder = "#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}/"
Dir.mkdir(output_folder)

temp_fasta_query = output_folder + "to_align.fa"
temp_contigs     = output_folder + "contigs_tmp.fa"
exonerate_file   = output_folder + "exonerate_tmp.tab"
primer_3_input   = output_folder + "primer_3_input_temp"
primer_3_output  = output_folder + "primer_3_output_temp"
exons_filename   = output_folder + "exons_genes_and_contigs.fa"
output_primers   = output_folder + "primers.csv"

primer_3_config = File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config')
model = "est2genome"

min_identity = 92
snps = []

#0. Load the fasta index (only when a reference was given)
fasta_reference_db = nil
if fasta_reference
  fasta_reference_db = Bio::DB::Fasta::FastaFile.new({ :fasta => fasta_reference })
  fasta_reference_db.load_fai_entries
  p "Fasta reference: #{fasta_reference}"
end
50
+
51
+
52
+ #1. Read all the SNP files
53
+ #All the SNPs should be on the same chromosome as the first SNP.
54
#1. Read all the SNPs; they must all be on the chromosome of the first one.
# The argument count does not change while reading, so validate it once
# instead of once per line.
unless [1, 2].include?(ARGV.size)
  raise Bio::DB::Exonerate::ExonerateException, "Wrong number of arguments. "
end
with_reference = ARGV.size == 2

chromosome = nil
File.foreach(test_file) do |line|
  snp = nil
  if with_reference
    # List plus FASTA file: the template sequence comes from the reference.
    snp = Bio::PolyploidTools::SNP.parse(line)
    if snp
      entry = fasta_reference_db.index.region_for_entry(snp.gene)
      # Fail with the gene name rather than a NoMethodError on nil.
      raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{snp.gene}" if entry.nil?

      snp.template_sequence = fasta_reference_db.fetch_sequence(entry.get_full_region)
    end
  else
    # List with the sequence embedded in each line.
    snp = Bio::PolyploidTools::SNPSequence.parse(line)
  end
  raise Bio::DB::Exonerate::ExonerateException, "No SNP for line '#{line}'" if snp.nil?

  snp.snp_in = snp_in
  snp.original_name = original_name
  snps << snp
  chromosome ||= snp.chromosome
  raise Bio::DB::Exonerate::ExonerateException, "All the snps should come from the same chromosome" if chromosome != snp.chromosome
end
76
+
77
+ #1.1 Close fasta file
78
+ #fasta_reference_db.close() if fasta_reference_db
79
+ #2. Generate all the fasta files
80
+
81
#2. Write the query FASTA, one record per gene even when several SNPs share it.
# Block-form File.open closes the files even if an exception is raised.
written_seqs = Set.new
File.open(temp_fasta_query, "w") do |file|
  snps.each do |snp|
    file.puts snp.to_fasta if written_seqs.add?(snp.gene)
  end
end

#3. Run exonerate against every contig file whose name starts with the first
# character of the chromosome name.
puts chromosome
chr_group = chromosome[0]
File.open(exonerate_file, "w") do |exo_f|
  File.open(temp_contigs, "w") do |contigs_f|
    Dir.foreach(path_to_contigs) do |filename|
      next unless File.fnmatch("#{chr_group}*.fa", filename)

      puts filename
      target = "#{path_to_contigs}/#{filename}"

      fasta_file = Bio::DB::Fasta::FastaFile.new({ :fasta => target })
      fasta_file.load_fai_entries
      Bio::DB::Exonerate.align({ :query => temp_fasta_query, :target => target, :model => model }) do |aln|
        next unless aln.identity > min_identity

        # Keep the alignment line and the full sequence of the matched contig.
        exo_f.puts aln.line
        region = fasta_file.index.region_for_entry(aln.target_id).get_full_region
        seq = fasta_file.fetch_sequence(region)
        contigs_f.puts(">#{aln.target_id}\n#{seq}")
      end
    end
  end
end
118
+
119
+ #4. Load all the results from exonerate and get the input filename for primer3
120
+ #Custom arm selection function that only uses the first two characters. Maybe
121
+ #we want to make it a bit more cleaver
122
#4. Load the exonerate results and build the primer3 input.
# The arm of a contig is taken to be the first two characters of its name.
arm_selection = ->(contig_name) { contig_name[0, 2] }

container = Bio::PolyploidTools::ExonContainer.new
container.flanking_size = 100
container.gene_models(temp_fasta_query)
container.chromosomes(temp_contigs)
container.add_parental({ :name => snp_in })
container.add_parental({ :name => original_name })
snps.each do |snp|
  snp.container = container
  snp.flanking_size = container.flanking_size
  container.add_snp(snp)
end
container.add_alignments({ :exonerate_file => exonerate_file, :arm_selection => arm_selection })

# Block-form File.open closes the files even if printing raises.
File.open(exons_filename, "w") do |file|
  container.print_fasta_snp_exones(file)
end

File.open(primer_3_input, "w") do |file|
  file.puts("PRIMER_PRODUCT_SIZE_RANGE=50-150")
  file.puts("PRIMER_MAX_SIZE=25")
  file.puts("PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS=1")
  file.puts("PRIMER_LIBERAL_BASE=1")
  file.puts("PRIMER_NUM_RETURN=5")
  file.puts("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=#{primer_3_config}/")
  container.print_primer_3_exons(file, chromosome, snp_in)
end

Bio::DB::Primer3.run({ :in => primer_3_input, :out => primer_3_output })

#5. Collect the primer3 results per SNP and write the primers table.
kasp_container = Bio::DB::Primer3::KASPContainer.new
kasp_container.line_1 = snp_in
kasp_container.line_2 = original_name
snps.each { |snp| kasp_container.add_snp(snp) }
kasp_container.add_primers_file(primer_3_output)

header = "Marker,SNP,RegionSize,SNP_type,#{snp_in},#{original_name},common,primer_type,orientation,#{snp_in}_TM,#{original_name}_TM,common_TM,selected_from,product_size"
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
168
+