bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'optparse'
4
+
5
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
6
+ $: << File.expand_path('.')
7
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
8
+ require path
9
+
10
+
11
# Prints +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) and a colon.
def log(msg)
  $stdout.puts "#{Time.now.strftime('%Y-%m-%d %H:%M:%S.%L')}: #{msg}"
end
15
+
16
# Command line options. All three are read later in the script:
#   :chromosome - chromosome name, upper-cased (e.g. 1A, 3B)
#   :reference  - FASTA file with the contigs
#   :map        - CSV file with the map and the marker sequences
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: map_markers_to_contigs.rb [options]"

  opts.on("-c", "--chromosome CHR", "chromosome (1A, 3B, etc)") do |o|
    options[:chromosome] = o.upcase
  end
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end
  opts.on("-m", "--map CSV", "File with the map and sequence \n Header: INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE") do |o|
    options[:map] = o
  end
end.parse!

# The reference is mandatory: stop with a clear error before any work starts.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
37
+
38
# Build the arm map for the requested chromosome on top of the reference.
map = Bio::PolyploidTools::ArmMap.new
map.chromosome = options[:chromosome]
map.global_reference(reference)

# Keep only the markers whose chromosome matches the one requested.
log "Reading markers file"
Bio::PolyploidTools::Marker.parse(options[:map]) do |marker|
  next unless options[:chromosome] == marker.chr
  map.markers[marker.snp_name] = marker
end

# Output files, all named after the chromosome and written to the current directory.
chromosome = options[:chromosome]
fasta_tmp = "markers_#{chromosome}.fa"
contigs_tmp = "contigs_#{chromosome}.fa"
aln_tmp = "align_#{chromosome}.psl"
contigs_map = "contigs_map_#{chromosome}.fa"
map_with_contigs = "contigs_map_#{chromosome}.csv"

# Each step logs its message and then calls the matching method on the map
# with the output path, in this fixed order.
[
  ["Writing markers: #{fasta_tmp}", :print_fasta_markers, fasta_tmp],
  ["Writing contigs: #{contigs_tmp}", :print_fasta_contigs_from_reference, contigs_tmp],
  ["Aligning markers #{aln_tmp}", :align_markers, aln_tmp],
  ["printing contigs with markers #{contigs_map}", :print_fasta_contigs_for_markers, contigs_map],
  ["printing map with contigs #{map_with_contigs}", :print_map_with_contigs, map_with_contigs]
].each do |message, step, path|
  log message
  map.public_send(step, path)
end
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools-wrapper'
6
+ require 'optparse'
7
+ require 'set'
8
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
9
+ $: << File.expand_path('.')
10
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
11
+ require path
12
+
13
# Command line options with their defaults.
options = {
  min_identity: 90,
  filter_best: false,
  debug: false
}

OptionParser.new do |opts|
  opts.banner = "Usage: marker_to_vcf.rb [options]"

  opts.on("-c", "--contigs FILE", "File with contigs to use as database") do |o|
    options[:path_to_contigs] = o
  end

  opts.on("-m", "--marker_list FILE", "File with the list of markers to search from") do |o|
    options[:marker_list] = o
  end

  # The flag turns the filter ON; the default above is already false.
  opts.on("-b", "--filter_best", "If set, only keep the best alignment for each chromosome") do
    options[:filter_best] = true
  end

  opts.on("-D", "--debug", "Validate that the flanking sequences are correct") do
    options[:debug] = true
  end

  opts.on("-i", "--min_identity INT", "Minimum identity to consider a hit (default 90)") do |o|
    options[:min_identity] = o.to_i
  end

  opts.on("-o", "--output FOLDER", "Output folder") do |o|
    options[:output_folder] = o
  end

  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
  end

  opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: blast") do |o|
    raise "Invalid aligner" unless o == "exonerate" || o == "blast"
    options[:aligner] = o.to_sym
  end

  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
    options[:database] = o
  end
end.parse!

# Fall back to the contigs path only when -d was not given, so that an
# explicit database is no longer overwritten.
options[:database] ||= options[:path_to_contigs]
p options
p ARGV
62
+
63
+
64
# Inputs taken from the command line.
path_to_contigs = options[:path_to_contigs]
test_file = options[:marker_list]
min_identity = options[:min_identity]

# Labels assigned to every parsed SNP further down.
original_name = "A"
snp_in = "B"

fasta_reference = nil

# Use the requested output folder, or a time-stamped one next to the marker list.
output_folder = options[:output_folder] || "#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}"
Dir.mkdir(output_folder)

# Files written inside the output folder.
temp_fasta_query = "#{output_folder}/to_align.fa"
temp_contigs = "#{output_folder}/contigs_tmp.fa"
exonerate_file = "#{output_folder}/exonerate_tmp.tab"
vcf_file = "#{output_folder}/snp_positions.vcf"
@status_file = "#{output_folder}/status.txt"
84
+
85
+
86
# Appends a "<timestamp>,<status>" line to the file named by @status_file.
# The block form of File.open closes the handle even if the write fails.
def write_status(status)
  File.open(@status_file, "a") do |f|
    f.puts "#{Time.now},#{status}"
  end
end
91
+
92
+
93
# Markers read from the marker list, filled in by the next step.
snps = Hash.new

# The indexed reference is always loaded (the original debug-only guard is
# commented out in the source), because it is needed to write the VCF.
write_status "Loading Reference"
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path_to_contigs})
fasta_reference_db.load_fai_entries
# Report the file that was actually loaded.
write_status "Fasta reference: #{path_to_contigs}"
103
+
104
#1. Read the marker list: one SNP per line, stored under its gene name.
write_status "Reading SNPs"

File.foreach(test_file) do |line|
  snp = Bio::PolyploidTools::SNPSequence.parse(line)
  snp.genomes_count = options[:genomes_count]
  snp.snp_in = snp_in
  snp.original_name = original_name

  # Lines without a SNP position are reported and skipped.
  unless snp.position
    $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
    next
  end

  snps[snp.gene] = snp
end
121
+
122
#2. Write the query FASTA: one entry per gene name, each written only once.
write_status "Writing sequences to align"
written_seqs = Set.new
# Block form so the file is closed even if a SNP fails to serialise.
File.open(temp_fasta_query, "w") do |file|
  snps.each_pair do |_name, snp|
    # Set#add? returns nil when the gene was already written.
    file.puts snp.to_fasta if written_seqs.add?(snp.gene)
  end
end
133
+
134
+
135
#3. Prepare the files and the indexed target used by the marker search.
write_status "Searching markers in genome"
exo_f = File.open(exonerate_file, "w")
# Only opened when :extract_found_contigs is set in the options.
contigs_f = options[:extract_found_contigs] ? File.open(temp_contigs, "w") : nil

target = filename = path_to_contigs

fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
fasta_file.load_fai_entries
found_contigs = Set.new
148
+
149
# Writes the raw line of +aln+ to +exo_f+ when the alignment identity is
# strictly greater than +min_identity+. The remaining arguments
# (+found_contigs+, +fasta_file+, +options+) are accepted but not used.
def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
  exo_f.puts aln.line if aln.identity > min_identity
end
154
+
155
# Align the query markers against the database and keep the hits accepted by
# do_align. The output handles are closed in an ensure clause so they are
# released (and flushed) even when the aligner raises.
begin
  Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database]}) do |aln|
    do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
  end
ensure
  exo_f.close
  contigs_f.close if contigs_f
end
160
+
161
# Converts the alignments stored in +exonerate_filename+ into VCF data lines.
#
# Every line of the file is parsed with
# Bio::DB::Exonerate::Alignment.parse_custom. A record produces output only
# when its identity is at least +min_identity+, its query id is a key of
# +snps+, and the SNP position falls between query_start + 1 and query_end.
# Records for which the position cannot be placed
# (Bio::DB::Exonerate::ExonerateException) are reported on $stderr and skipped.
#
# min_identity::       minimum identity (inclusive) a record must reach.
# filter_best::        accepted for compatibility; not used by this method.
# exonerate_filename:: path of the alignment file to read.
# snps::               hash from query id to the SNP object of that marker.
# reference::          object answering +fetch_sequence(region)+.
# out::                IO-like object that receives one line per placed marker.
def print_positions(min_identity:90, filter_best:false, exonerate_filename:"test.exo", snps:{}, reference:nil, out:$stdout)
  # Counts the hits of each marker; used as the ".pathN" suffix of the VCF id.
  marker_count = Hash.new { |h, k| h[k] = 1 }
  File.open(exonerate_filename) do |f|
    f.each_line do |line|
      record = Bio::DB::Exonerate::Alignment.parse_custom(line)
      next unless record && record.identity >= min_identity
      snp = snps[record.query_id]
      next unless snp && snp.position.between?(record.query_start + 1, record.query_end)
      begin
        position = record.query_position_on_target(snp.position)
        tr = record.exon_on_gene_position(snp.position).target_region

        # Move the boundary of the target region onto the SNP position, so the
        # last base of the fetched sequence is taken as the reference base.
        if tr.orientation == :forward
          tr.end = position
        else
          tr.start = position
        end
        target_seq = reference.fetch_sequence(tr)
        target_seq[-1] = target_seq[-1].upcase
        ref_base = target_seq[-1]
        # MA keeps the base as fetched, before any strand complement below.
        ma = ref_base

        # When the reference matches one allele of the marker the other one is
        # the alternative; otherwise both alleles are listed.
        alt_base = if snp.original == ref_base
                     snp.snp
                   elsif snp.snp == ref_base
                     snp.original
                   else
                     [snp.snp, snp.original].join(",")
                   end

        if record.target_strand == :reverse
          alt_base = Bio::Sequence::NA.new(alt_base)
          ref_base = Bio::Sequence::NA.new(ref_base)
          alt_base.complement!.upcase!
          ref_base.complement!.upcase!
        end

        info = ["OR=#{record.target_strand}"]
        info << "SC=#{record.score}"
        info << "PI=#{record.pi}"
        info << "MA=#{ma}"
        info << "TS=#{target_seq}"
        vcf_line = "#{record.target_id}\t#{position}\t#{record.query_id}.path#{marker_count[record.query_id]}\t#{ref_base}\t#{alt_base}\t#{record.pi}\t.\t#{info.join(";")}"
        out.puts(vcf_line)

        marker_count[record.query_id] += 1
      rescue Bio::DB::Exonerate::ExonerateException
        $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
      end
    end
  end
end
221
+
222
+
223
# Write the VCF header followed by one line per placed marker. The block form
# of File.open closes the file even if print_positions raises.
write_status "Printing VCF file"
File.open(vcf_file, "w") do |out|
  out.puts "##fileformat=VCFv4.2"
  out.puts "##fileDate=#{Time.now.strftime("%Y%m%d")}"
  out.puts "##source=#{$0}"
  out.puts "##reference=file://#{options[:path_to_contigs]}"
  out.puts "##INFO=<ID=OR,Number=1,Type=String,Description=\"Orientation of the alignment of the marker\">"
  out.puts "##INFO=<ID=SC,Number=1,Type=Float,Description=\"Alignment score of the marker\">"
  out.puts "##INFO=<ID=PI,Number=1,Type=Float,Description=\"Percentage of identity of the alignment to the marker\">"
  out.puts "##INFO=<ID=PS,Number=1,Type=String,Description=\"SNP sequence for PolyMarker\">"
  out.puts "##INFO=<ID=MA,Number=1,Type=String,Description=\"Allele based on the original marker sequence\">"
  out.puts "##INFO=<ID=TS,Number=1,Type=String,Description=\"Target sequence before the SNP from the reference\">"
  out.puts "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
  # NOTE(review): the identity threshold here is fixed at 95 and does not
  # follow --min_identity; confirm whether that is intended.
  print_positions(exonerate_filename: exonerate_file, min_identity: 95, snps: snps, reference: fasta_reference_db, out: out)
end
write_status "DONE"
240
+
241
+
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #This uses the map output from map_markers_to_contigs.rb
4
+ #You need a reference with the name of the contigs, containing the chromosome
5
+ #arm and a list of sequences to map. The algorithm creates a smaller reference
6
+ #file, so the search only spans across the contigs in the region. This should
7
+ #allow to use a refined mapping algorithm.
8
+ require 'bio'
9
+ require 'optparse'
10
+
11
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
12
+ $: << File.expand_path('.')
13
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
14
+ require path
15
+
16
+
17
# Prints +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) and a colon.
def log(msg)
  $stdout.puts "#{Time.now.strftime('%Y-%m-%d %H:%M:%S.%L')}: #{msg}"
end
21
+
22
# Command line options:
#   :chromosome - chromosome name, upper-cased (e.g. 1A, 3B)
#   :reference  - FASTA file with the contigs
#   :map        - CSV file with the map and the marker sequences
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: markers_in_region.rb [options]"

  opts.on("-c", "--chromosome CHR", "chromosome (1A, 3B, etc)") do |o|
    options[:chromosome] = o.upcase
  end
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end
  opts.on("-m", "--map CSV", "File with the map and sequence \n Header: INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE") do |o|
    options[:map] = o
  end
end.parse!

# The reference is mandatory: stop with a clear error when it is missing.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+
4
+ require 'csv'
5
+ require 'fileutils'
6
+ require 'tmpdir'
7
+ require 'bio-samtools-wrapper'
8
+ require 'bio'
9
+
10
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
11
+ $: << File.expand_path('.')
12
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
13
+ require path
14
# Defaults for the command line options. Only :triads, :fasta, :split_token,
# :output_folder and :tmp_folder are read by the rest of this script.
opts = {}
opts[:identity] = 50
opts[:min_bases] = 200
opts[:split_token] = "."
opts[:tmp_folder] = Dir.mktmpdir
opts[:random_sample] = 0
opts[:output_folder] = "."

# The parser and the option values use distinct names so that the inner block
# parameter no longer shadows the parser.
OptionParser.new do |parser|
  parser.banner = "Usage: mask_triads.rb [options]"

  parser.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |value|
    opts[:triads] = value
  end

  parser.on("-f", "--fasta FILE" , "FASTA file containing all the possible peptide sequences. ") do |value|
    opts[:fasta] = value
  end

  parser.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be everything before this token on the name of the sequences") do |value|
    opts[:split_token] = value
  end

  parser.on("-o", "--output_folder DIR", "Location to save the alignment masks. If the alignment exists, it is recycled to avoid calling MAFFT again") do |value|
    opts[:output_folder] = value
  end
end.parse!

# Both input files are required; fail here with a message naming the option
# instead of a TypeError further down.
%i[triads fasta].each do |required|
  raise OptionParser::MissingArgument, "--#{required}" unless opts[required]
end
42
+
43
+
44
split_token = opts[:split_token]
output_folder = opts[:output_folder]
reference_name = File.basename(opts[:fasta])

# Indexed access to the sequences of the FASTA reference.
@fasta_reference_db = Bio::DB::Fasta::FastaFile.new(fasta: opts[:fasta])
@fasta_reference_db.load_fai_entries

# For every gene (the entry id up to the split token) keep the longest entry;
# on a tie the entry seen first stays.
@cannonical = {}
@fasta_reference_db.index.entries.each do |entry|
  gene = entry.id.split(split_token)[0]
  longest = @cannonical[gene]
  @cannonical[gene] = entry if longest.nil? || entry.length > longest.length
end

$stderr.puts "#Loaded #{@cannonical.length} canonical sequences from #{@fasta_reference_db.index.size} in reference"

$stderr.puts "TMP dir: #{opts[:tmp_folder]}"
60
+
61
# Writes +sequences+ to +filename+ in FASTA format, replacing any existing
# file. +sequences+ must respond to +each_pair+, yielding the entry name and
# its sequence. The block form of File.open closes the file even when a
# write fails. Returns nil.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |name, sequence|
      out.puts ">#{name}\n#{sequence}\n"
    end
  end
  nil
end
68
+
69
# Aligns the canonical sequences of the three genes +a+, +b+ and +d+ with
# MAFFT and returns the resulting alignment.
#
# The MAFFT wrapper is created here on first use and memoised in @mafft: the
# script-level local +mafft+ is not visible inside a method body, so
# referring to it raised NameError.
def mafft_align(a, b, d)
  @mafft ||= Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet'])
  to_align = Bio::Alignment::SequenceHash.new
  [a, b, d].each do |gene|
    to_align[gene] = @fasta_reference_db.fetch_sequence(@cannonical[gene].get_full_region)
  end
  @mafft.query_alignment(to_align).alignment
end
81
+
82
# Loads a FASTA file from +path+ into a Bio::Alignment::SequenceHash keyed by
# entry id. Only the first three entries of the file are stored.
def read_alignment(path)
  aln = Bio::Alignment::SequenceHash.new
  Bio::FlatFile.open(Bio::FastaFormat, path) do |fasta_file|
    kept = 0
    fasta_file.each do |entry|
      next if kept >= 3
      aln[entry.entry_id] = entry.seq
      kept += 1
    end
  end
  aln
end
93
+
94
+
95
# Zlib is used below; require it explicitly instead of relying on another
# library having loaded it.
require 'zlib'

mafft_opts = ['--maxiterate', '1000', '--localpair', '--quiet']
mafft = Bio::MAFFT.new( "mafft" , mafft_opts)
header_printed = false

# Per-gene statistics, one CSV row per gene of every triad.
stats = File.open("#{output_folder}/#{reference_name}.identity_stats.csv", "w")
# Gzip-compressed table of distances; binary mode keeps the compressed bytes
# untouched on every platform.
distances = File.open("#{output_folder}/#{reference_name}.distance_between_snps.csv.gz", "wb")
gz = Zlib::GzipWriter.new(distances)
gz.write "triad,gene,genome,reference,type,distance\n"
103
+
104
# Writes one CSV row per element of +distances+ to +out+. Every row repeats
# the triad, gene, genome, reference and type columns and ends with the
# distance value. Returns +distances+.
def write_distances(distances, triad, gene, genome, reference, type, out)
  prefix = "#{triad},#{gene},#{genome},#{reference},#{type}"
  distances.each do |distance|
    out.write "#{prefix},#{distance}\n"
  end
end
107
+
108
# Process every triad of the CSV: align (or reuse) the three sequences, add
# one mask per genome, write the distances and statistics, and save the
# alignment with its masks.
i = 0
mask = Bio::PolyploidTools::Mask
CSV.foreach(opts[:triads], headers: true) do |row|
  # Only 1:1:1 triads flagged as HC-only are used.
  next if row["cardinality_abs"] != "1:1:1" || row["HC.LC"] != "HC-only"

  a = row['A']
  b = row['B']
  d = row['D']
  triad = row['group_id']
  genomes = { "A" => a, "B" => b, "D" => d }

  # Alignments are grouped in folders of one hundred triads.
  cent_triad = triad.to_i / 100
  folder = "#{output_folder}/alignments/#{reference_name}/#{cent_triad}/"
  save_cds = "#{folder}/#{triad}.fa"
  aligned = File.file?(save_cds)
  if aligned
    # Reuse the stored alignment; the result goes to a separate folder.
    aln = read_alignment(save_cds)
    folder = "#{output_folder}/alignments_new/#{reference_name}/#{cent_triad}/"
  else
    aln = mafft_align(a, b, d)
  end
  FileUtils.mkdir_p folder
  save_cds = "#{folder}/#{triad}.fa"

  aln2 = Bio::Alignment.new aln
  seq_start = mask.find_start(aln)
  seq_end = mask.find_end(aln)

  # One mask per genome, stored in the alignment under the genome letter.
  genomes.each do |genome, gene|
    aln2.add_seq(mask.get(aln, seq_start: seq_start, seq_end: seq_end, target: gene), genome)
  end

  triad_stats = genomes.map do |genome, gene|
    [genome, mask.stats(aln2[genome], triad, gene, genome, reference_name)]
  end

  # Distances are written for all genomes of one type before the next type.
  %w[specific semispecific].each do |type|
    triad_stats.each do |genome, values|
      write_distances(values[type.to_sym], triad, genomes[genome], genome, reference_name, type, gz)
    end
  end

  # The distance lists are dropped from the statistics row and the length of
  # the canonical sequence is added.
  triad_stats.each do |genome, values|
    values.delete(:semispecific)
    values.delete(:specific)
    values[:length] = @cannonical[genomes[genome]].length
  end

  stats.puts triad_stats.first.last.keys.join(",") unless header_printed
  triad_stats.each { |_genome, values| stats.puts values.values.join(",") }
  header_printed = true

  write_fasta_from_hash(aln2, save_cds)
  i += 1
end
gz.close
distances.close
stats.close