bio-polymarker 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'optparse'
4
+
5
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
6
+ $: << File.expand_path('.')
7
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
8
+ require path
9
+
10
+
11
# Writes +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) and a colon.
def log(msg)
  stamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  $stdout.puts("#{stamp}: #{msg}")
end
15
+
16
# Command-line options collected into +options+:
#   :chromosome  chromosome name, upper-cased (-c)
#   :reference   FASTA file with the contigs (-r, required)
#   :map         CSV file with the map and the marker sequences (-m)
options = {}
OptionParser.new do |opts|
  # Derive the script name from the running program so the banner cannot drift
  opts.banner = "Usage: #{File.basename($PROGRAM_NAME)} [options]"

  opts.on("-c", "--chromosome CHR", "chromosome (1A, 3B, etc)") do |o|
    options[:chromosome] = o.upcase
  end
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end
  opts.on("-m", "--map CSV", "File with the map and sequence \n Header: INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE") do |o|
    options[:map] = o
  end
end.parse!

# The reference is mandatory: stop with a descriptive error when it is missing.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
37
+
38
# Build the arm map for the requested chromosome on top of the reference.
chromosome = options[:chromosome]
map = Bio::PolyploidTools::ArmMap.new
map.chromosome = chromosome
map.global_reference(reference)

# Keep only the markers whose chromosome matches the requested one.
log "Reading markers file"
Bio::PolyploidTools::Marker.parse(options[:map]) do |marker|
  map.markers[marker.snp_name] = marker if chromosome == marker.chr
end

# Output files, all named after the chromosome and written to the working directory.
fasta_tmp        = "markers_#{chromosome}.fa"
contigs_tmp      = "contigs_#{chromosome}.fa"
aln_tmp          = "align_#{chromosome}.psl"
contigs_map      = "contigs_map_#{chromosome}.fa"
map_with_contigs = "contigs_map_#{chromosome}.csv"

# Run the steps in order, announcing each one before it starts.
log "Writing markers: #{fasta_tmp}"
map.print_fasta_markers(fasta_tmp)

log "Writing contigs: #{contigs_tmp}"
map.print_fasta_contigs_from_reference(contigs_tmp)

log "Aligning markers #{aln_tmp}"
map.align_markers(aln_tmp)

log "printing contigs with markers #{contigs_map}"
map.print_fasta_contigs_for_markers(contigs_map)

log "printing map with contigs #{map_with_contigs}"
map.print_map_with_contigs(map_with_contigs)
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools-wrapper'
6
+ require 'optparse'
7
+ require 'set'
8
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
9
+ $: << File.expand_path('.')
10
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
11
+ require path
12
+
13
# Default settings; every entry can be overridden from the command line.
options = {
  min_identity: 90,
  filter_best: false,
  debug: false
}

OptionParser.new do |opts|
  opts.banner = "Usage: marker_to_vcf.rb [options]"

  opts.on("-c", "--contigs FILE", "File with contigs to use as database") do |o|
    options[:path_to_contigs] = o
  end

  opts.on("-m", "--marker_list FILE", "File with the list of markers to search from") do |o|
    options[:marker_list] = o
  end

  # Passing the flag must switch the filter ON (it used to store false).
  opts.on("-b", "--filter_best", "If set, only keep the best alignment for each chromosome") do
    options[:filter_best] = true
  end

  opts.on("-D", "--debug", "Validate that the flanking sequences are correct") do
    options[:debug] = true
  end

  opts.on("-i", "--min_identity INT", "Minimum identity to consider a hit (default 90)") do |o|
    options[:min_identity] = o.to_i
  end

  opts.on("-o", "--output FOLDER", "Output folder") do |o|
    options[:output_folder] = o
  end

  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromosome arm") do |o|
    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
  end

  opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: blast") do |o|
    raise "Invalid aligner" unless o == "exonerate" || o == "blast"
    options[:aligner] = o.to_sym
  end

  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
    options[:database] = o
  end
end.parse!

# Fall back to the contigs path only when --database was not given, so an
# explicit database is no longer overwritten.
options[:database] ||= options[:path_to_contigs]

# Echo the effective configuration and the remaining arguments.
p options
p ARGV
62
+
63
+
64
# Inputs taken from the command line.
path_to_contigs = options[:path_to_contigs]
test_file       = options[:marker_list]
min_identity    = options[:min_identity]

# Fixed labels used for the original_name / snp_in fields of the markers.
original_name = "A"
snp_in        = "B"

fasta_reference = nil

# Use the folder given with --output; otherwise a timestamped folder named
# after the marker list. Dir.mkdir raises if the folder already exists.
output_folder = options[:output_folder] ||
                "#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}"
Dir.mkdir(output_folder)

# Working files, all created inside the output folder.
temp_fasta_query = "#{output_folder}/to_align.fa"
temp_contigs     = "#{output_folder}/contigs_tmp.fa"
exonerate_file   = "#{output_folder}/exonerate_tmp.tab"
vcf_file         = "#{output_folder}/snp_positions.vcf"

# Read by write_status, hence an instance variable of the top-level object.
@status_file = "#{output_folder}/status.txt"
85
+
86
# Appends one line of the form "<current time>,<status>" to the file named by
# @status_file.
#
# The block form of File.open is used so the handle is closed even when the
# write raises. Returns nil.
def write_status(status)
  File.open(@status_file, "a") do |f|
    f.puts "#{Time.now},#{status}"
  end
end
91
+
92
+
93
# Markers that contain a SNP, keyed by snp.gene.
snps = {}

# Open the contigs FASTA and load its index entries.
write_status "Loading Reference"
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path_to_contigs})
fasta_reference_db.load_fai_entries
# Report the file that was actually loaded (the old message interpolated a
# variable that is always nil).
write_status "Fasta reference: #{path_to_contigs}"

#1. Read all the SNP files
write_status "Reading SNPs"

File.foreach(test_file) do |line|
  snp = Bio::PolyploidTools::SNPSequence.parse(line)
  snp.genomes_count = options[:genomes_count]
  snp.snp_in = snp_in
  snp.original_name = original_name
  if snp.position
    snps[snp.gene] = snp
  else
    # Lines without a SNP position are reported and skipped.
    $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
  end
end
121
+
122
+ #2. Generate all the fasta files
123
write_status "Writing sequences to align"
written_seqs = Set.new
# Block form: the query file is flushed and closed even if a marker fails to
# serialise.
File.open(temp_fasta_query, "w") do |file|
  snps.each_value do |snp|
    # Set#add? returns nil for a gene that was already written, so each gene
    # is emitted once.
    file.puts snp.to_fasta if written_seqs.add?(snp.gene)
  end
end
133
+
134
+
135
+ #3. Run exonerate on each of the possible chromosomes for the SNP
136
+ #puts chromosome
137
+ #chr_group = chromosome[0]
138
write_status "Searching markers in genome"

# Alignment lines that pass the identity filter are collected in this file.
exo_f = File.open(exonerate_file, "w")
# Only opened when :extract_found_contigs is present in the options.
contigs_f = options[:extract_found_contigs] ? File.open(temp_contigs, "w") : nil

# The contigs file is both the search target and the indexed FASTA.
filename = path_to_contigs
target = filename

fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
fasta_file.load_fai_entries
found_contigs = Set.new
148
+
149
# Writes the raw line of +aln+ to +exo_f+ when its identity is strictly
# greater than +min_identity+. The remaining parameters are accepted but not
# used here.
def do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
  return unless aln.identity > min_identity

  exo_f.puts(aln.line)
end
154
+
155
# Align the query sequences against the database and pass every reported
# alignment to do_align, then close the alignment file.
blast_args = { :query => temp_fasta_query, :target => options[:database] }
Bio::DB::Blast.align(blast_args) do |hit|
  do_align(hit, exo_f, found_contigs, min_identity, fasta_file, options)
end

exo_f.close
160
+
161
# Converts marker alignments into VCF data lines.
#
# Every line of +exonerate_filename+ is parsed with
# Bio::DB::Exonerate::Alignment.parse_custom. For each alignment whose identity
# is at least +min_identity+ and whose query range covers the SNP position of
# the matching marker, one tab-separated record is written to +out+.
#
# min_identity::       minimum identity (inclusive) for an alignment to be used
# filter_best::        accepted for interface compatibility; not used
# exonerate_filename:: path of the alignment table to read
# snps::               Hash from the alignment query id to its SNP object
# reference::          object answering fetch_sequence(region)
# out::                IO-like object that receives the records
#
# Alignments raising Bio::DB::Exonerate::ExonerateException are reported on
# standard error and skipped.
def print_positions(min_identity:90, filter_best:false, exonerate_filename:"test.exo", snps:{}, reference:nil, out:$stdout)
  # Per-marker counter used to number the ".pathN" suffix of the record id.
  marker_count = Hash.new { |h, k| h[k] = 1 }
  File.open(exonerate_filename) do |f|
    f.each_line do |line|
      record = Bio::DB::Exonerate::Alignment.parse_custom(line)
      next unless record && record.identity >= min_identity
      snp = snps[record.query_id]
      next unless snp && snp.position.between?(record.query_start + 1, record.query_end)
      begin
        position = record.query_position_on_target(snp.position)

        # Move the region boundary onto the SNP (end for forward regions,
        # start otherwise); the last base of the fetched sequence is then
        # taken as the reference base.
        region = record.exon_on_gene_position(snp.position).target_region
        if region.orientation == :forward
          region.end = position
        else
          region.start = position
        end
        target_seq = reference.fetch_sequence(region)
        target_seq[-1] = target_seq[-1].upcase
        ref_base = target_seq[-1]
        ma = ref_base

        # When the reference base matches one marker allele the other allele
        # is the alternative; otherwise both alleles are reported.
        alt_base = [snp.snp, snp.original].join(",")
        if snp.original == ref_base
          alt_base = snp.snp
        elsif snp.snp == ref_base
          alt_base = snp.original
        end

        # Hits on the reverse strand are reported as complemented bases.
        if record.target_strand == :reverse
          alt_base = Bio::Sequence::NA.new(alt_base)
          ref_base = Bio::Sequence::NA.new(ref_base)
          alt_base.complement!.upcase!
          ref_base.complement!.upcase!
        end

        info = ["OR=#{record.target_strand}"]
        info << "SC=#{record.score}"
        info << "PI=#{record.pi}"
        info << "MA=#{ma}"
        info << "TS=#{target_seq}"
        vcf_line="#{record.target_id}\t#{position}\t#{record.query_id}.path#{marker_count[record.query_id]}\t#{ref_base}\t#{alt_base}\t#{record.pi}\t.\t#{info.join(";")}"
        out.puts(vcf_line)

        marker_count[record.query_id] += 1
      rescue Bio::DB::Exonerate::ExonerateException
        $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
      end
    end
  end
end
221
+
222
+
223
write_status "Printing VCF file"
# Block form: the VCF handle is closed even if writing a record raises.
File.open(vcf_file, "w") do |out|
  out.puts "##fileformat=VCFv4.2"
  out.puts "##fileDate=#{Time.now.strftime("%Y%m%d")}"
  out.puts "##source=#{$0}"
  out.puts "##reference=file://#{options[:path_to_contigs]}"
  out.puts "##INFO=<ID=OR,Number=1,Type=String,Description=\"Orientation of the alignment of the marker\">"
  out.puts "##INFO=<ID=SC,Number=1,Type=Float,Description=\"Alignment score of the marker\">"
  out.puts "##INFO=<ID=PI,Number=1,Type=Float,Description=\"Percentage of identity of the alignment to the marker\">"
  # NOTE(review): PS is declared here but print_positions does not emit it.
  out.puts "##INFO=<ID=PS,Number=1,Type=String,Description=\"SNP sequence for PolyMarker\">"
  out.puts "##INFO=<ID=MA,Number=1,Type=String,Description=\"Allele based on the original marker sequence\">"
  out.puts "##INFO=<ID=TS,Number=1,Type=String,Description=\"Target sequence before the SNP from the reference\">"
  out.puts "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
  # NOTE(review): the threshold is fixed at 95 and ignores --min_identity;
  # confirm whether the option should be honoured here.
  print_positions(exonerate_filename:exonerate_file, min_identity:95, snps:snps, reference: fasta_reference_db, out:out)
end
write_status "DONE"
240
+
241
+
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #This uses the map output from map_markers_to_contigs.rb
4
+ #You need a reference with the name of the contigs, containing the chromosome
5
+ #arm and a list of sequences to map. The algorithm creates a smaller reference
6
+ #file, so the search only spans the contigs in the region. This should
7
+ #allow the use of a refined mapping algorithm.
8
+ require 'bio'
9
+ require 'optparse'
10
+
11
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
12
+ $: << File.expand_path('.')
13
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
14
+ require path
15
+
16
+
17
# Writes +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) and a colon.
def log(msg)
  stamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  $stdout.puts("#{stamp}: #{msg}")
end
21
+
22
# Command-line options collected into +options+:
#   :chromosome  chromosome name, upper-cased (-c)
#   :reference   FASTA file with the contigs (-r, required)
#   :map         CSV file with the map and the marker sequences (-m)
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: markers_in_region.rb [options]"

  opts.on("-c", "--chromosome CHR", "chromosome (1A, 3B, etc)") do |o|
    options[:chromosome] = o.upcase
  end
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end
  opts.on("-m", "--map CSV", "File with the map and sequence \n Header: INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE") do |o|
    options[:map] = o
  end
end.parse!

# The reference is mandatory: stop with a descriptive error when it is missing.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+
4
+ require 'csv'
5
+ require 'fileutils'
6
+ require 'tmpdir'
7
+ require 'bio-samtools-wrapper'
8
+ require 'bio'
9
+
10
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
11
+ $: << File.expand_path('.')
12
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
13
+ require path
14
# Default settings; the ones exposed below can be overridden on the command line.
opts = {}
opts[:identity] = 50
opts[:min_bases] = 200
opts[:split_token] = "."
opts[:tmp_folder] = Dir.mktmpdir
opts[:random_sample] = 0
opts[:output_folder] = "."

# The parser and the option values use distinct names so that the value no
# longer shadows the parser inside the option blocks.
OptionParser.new do |parser|
  parser.banner = "Usage: mask_triads.rb [options]"

  parser.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |value|
    opts[:triads] = value
  end

  parser.on("-f", "--fasta FILE" , "FASTA file containing all the possible peptide sequences. ") do |value|
    opts[:fasta] = value
  end

  parser.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be everything before this token on the name of the sequences") do |value|
    opts[:split_token] = value
  end

  parser.on("-o", "--output_folder DIR", "Location to save the alignment masks. If the alignment exists, it is recycled to avoid calling MAFFT again") do |value|
    opts[:output_folder] = value
  end
end.parse!

# Both input files are needed further down; fail early with a clear message
# instead of a TypeError raised from deep inside the script.
raise OptionParser::MissingArgument, "--fasta FILE is required" unless opts[:fasta]
raise OptionParser::MissingArgument, "--triads FILE is required" unless opts[:triads]
42
+
43
+
44
split_token    = opts[:split_token]
reference_name = File.basename(opts[:fasta])
output_folder  = opts[:output_folder]

# Open the FASTA file and load its index entries.
@fasta_reference_db = Bio::DB::Fasta::FastaFile.new(fasta: opts[:fasta])
@fasta_reference_db.load_fai_entries

# For every gene (the part of the entry id before the split token) keep the
# longest entry; on equal length the first one seen is kept.
@cannonical = {}
@fasta_reference_db.index.entries.each do |entry|
  gene = entry.id.split(split_token).first
  current = @cannonical[gene]
  @cannonical[gene] = entry if current.nil? || entry.length > current.length
end

$stderr.puts "#Loaded #{@cannonical.length} canonical sequences from #{@fasta_reference_db.index.size} in reference"

$stderr.puts "TMP dir: #{opts[:tmp_folder]}"
60
+
61
# Writes every name/sequence pair of +sequences+ to +filename+ as a FASTA
# record (">name" followed by the sequence on the next line), replacing any
# existing file.
#
# sequences:: object answering each_pair, yielding (name, sequence)
# filename::  path of the file to write
#
# The block form of File.open closes the handle even when a write fails.
# Returns nil.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |name, seq|
      out.puts ">#{name}\n#{seq}\n"
    end
  end
  nil
end
68
+
69
# Aligns the canonical sequences of the three genes with MAFFT and returns the
# resulting alignment.
#
# a, b, d:: gene names; each must be a key of @cannonical
#
# Reads @fasta_reference_db and @cannonical. The MAFFT wrapper is created on
# first use and kept in @mafft: a local variable defined at the top level of
# the script is not visible inside a method body, so the previous reference to
# a bare +mafft+ raised NameError.
def mafft_align(a, b, d)
  @mafft ||= Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet'])

  to_align = Bio::Alignment::SequenceHash.new
  [a, b, d].each do |gene|
    to_align[gene] = @fasta_reference_db.fetch_sequence(@cannonical[gene].get_full_region)
  end

  @mafft.query_alignment(to_align).alignment
end
81
+
82
# Reads the FASTA file at +path+ and returns a Bio::Alignment::SequenceHash
# holding its first three entries, keyed by entry id.
def read_alignment(path)
  kept = Bio::Alignment::SequenceHash.new
  Bio::FlatFile.open(Bio::FastaFormat, path) do |records|
    records.each_with_index do |record, idx|
      # Entries after the third are read but not stored.
      next if idx >= 3

      kept[record.entry_id] = record.seq
    end
  end
  kept
end
93
+
94
+
95
# Zlib::GzipWriter is used below; load it explicitly instead of relying on
# another library having required it.
require 'zlib'

mafft_opts = ['--maxiterate', '1000', '--localpair', '--quiet']
mafft = Bio::MAFFT.new( "mafft" , mafft_opts)
header_printed = false

# Per-gene statistics, one CSV row per gene.
stats = File.open("#{output_folder}/#{reference_name}.identity_stats.csv", "w")
# Gzip output is binary data, so the target is opened in binary mode.
distances = File.open("#{output_folder}/#{reference_name}.distance_between_snps.csv.gz", "wb")
gz = Zlib::GzipWriter.new(distances)
gz.write "triad,gene,genome,reference,type,distance\n"
102
+ #gz.close
103
+
104
# Writes one CSV row per element of +distances+ to +out+. Each row carries the
# triad, gene, genome, reference and type columns followed by the element.
# Returns +distances+.
def write_distances(distances, triad, gene, genome, reference, type, out)
  prefix = "#{triad},#{gene},#{genome},#{reference},#{type}"
  distances.each do |distance|
    out.write "#{prefix},#{distance}\n"
  end
end
107
+
108
# Process every high-confidence 1:1:1 triad listed in the CSV file.
CSV.foreach(opts[:triads], headers: true) do |row|
  next unless row["cardinality_abs"] == "1:1:1" && row["HC.LC"] == "HC-only"

  triad = row['group_id']
  # Genome label => gene name, in the order A, B, D used for all outputs.
  members = { "A" => row['A'], "B" => row['B'], "D" => row['D'] }
  cent_triad = triad.to_i / 100

  # Reuse a stored alignment when there is one; otherwise align the triad now.
  folder = "#{output_folder}/alignments/#{reference_name}/#{cent_triad}/"
  save_cds = "#{folder}/#{triad}.fa"
  aligned = File.file?(save_cds)
  aln = if aligned
          read_alignment(save_cds)
        else
          mafft_align(members["A"], members["B"], members["D"])
        end

  # Results derived from a stored alignment are written under alignments_new.
  folder = "#{output_folder}/alignments_new/#{reference_name}/#{cent_triad}/" if aligned
  FileUtils.mkdir_p folder
  save_cds = "#{folder}/#{triad}.fa"

  aln2 = Bio::Alignment.new aln
  seq_start = Bio::PolyploidTools::Mask.find_start(aln)
  seq_end = Bio::PolyploidTools::Mask.find_end(aln)

  # Add one mask row per genome to the alignment.
  members.each do |genome, gene|
    mask = Bio::PolyploidTools::Mask.get(aln, seq_start: seq_start, seq_end: seq_end, target: gene)
    aln2.add_seq(mask, genome)
  end

  # Statistics of each mask, keyed by genome.
  genome_stats = {}
  members.each do |genome, gene|
    genome_stats[genome] = Bio::PolyploidTools::Mask.stats(aln2[genome], triad, gene, genome, reference_name)
  end

  # Distances are written first for the specific and then for the
  # semispecific positions, each in genome order.
  [:specific, :semispecific].each do |kind|
    members.each do |genome, gene|
      write_distances(genome_stats[genome][kind], triad, gene, genome, reference_name, kind.to_s, gz)
    end
  end

  # The distance lists are removed before the summary row is written.
  genome_stats.each_value do |summary|
    summary.delete(:semispecific)
    summary.delete(:specific)
  end
  members.each do |genome, gene|
    genome_stats[genome][:length] = @cannonical[gene].length
  end

  stats.puts genome_stats["A"].keys.join(",") unless header_printed
  genome_stats.each_value { |summary| stats.puts summary.values.join(",") }
  header_printed = true

  write_fasta_from_hash(aln2, save_cds)
end

gz.close
distances.close
stats.close