bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools-wrapper'
6
+
7
+ require 'set'
8
+
9
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
10
+ $: << File.expand_path('.')
11
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
12
+ require path
13
+
14
+
15
+ #@snp_map=Hash.new
16
+
17
# ExonContainer specialisation used by this script: it loads the SNP list a
# second time, attaches every SNP to the container and emits the primer3
# records for them.
class HomokaryotContainer < Bio::PolyploidTools::ExonContainer

  # Parses every line of +filename+ into a SNP and stores it in @snp_map,
  # keyed by +snp.gene+.
  #
  # The parser is chosen from the number of command-line arguments:
  # one argument   -> SNPSequence (the line carries its own sequence),
  # two arguments  -> SNP (the sequence comes from the reference).
  # Only SNPs whose position is greater than 0 are stored.
  #
  # filename      - path of the SNP list.
  # chromosome    - value assigned to every stored SNP's +chromosome+.
  # snp_in        - value assigned to every stored SNP's +snp_in+.
  # original_name - value assigned to every stored SNP's +original_name+.
  #
  # Raises Bio::DB::Exonerate::ExonerateException when the script was not
  # started with one or two arguments (previously this surfaced as a
  # NoMethodError on nil).
  def add_snp_file(filename, chromosome, snp_in, original_name)
    flanking_size = 100
    File.open(filename) do |f|
      f.each_line do |line|
        case ARGV.size
        when 1 # List with Sequence
          snp = Bio::PolyploidTools::SNPSequence.parse(line)
          snp.use_reference = false
        when 2 # List and fasta file
          snp = Bio::PolyploidTools::SNP.parse(line)
          snp.use_reference = true
        else
          raise Bio::DB::Exonerate::ExonerateException, "Wrong number of arguments. "
        end

        snp.flanking_size = flanking_size
        next unless snp.position > 0

        snp.container = self
        snp.chromosome = chromosome
        snp.snp_in = snp_in
        snp.original_name = original_name

        @snp_map[snp.gene] ||= []
        @snp_map[snp.gene] << snp
      end
    end
  end

  # Writes the primer3 record of every stored SNP to +file+, skipping SNPs
  # whose record is empty.
  #
  # file              - object responding to +puts+.
  # target_chromosome - accepted for interface compatibility; the chromosome
  #                     stored on each SNP is used instead.
  # parental          - forwarded to +primer_3_string+.
  def print_primer_3_exons(file, target_chromosome, parental)
    @snp_map.each do |_gene, snp_array|
      snp_array.each do |snp|
        string = snp.primer_3_string(snp.chromosome, parental)
        file.puts string if string.size > 0
      end
    end
  end
end
60
+
61
# Script-local overrides for SNP: no alignment is computed here, so the
# "aligned" view is built directly from the parental sequences.
class Bio::PolyploidTools::SNP

  # NOTE(review): this sets an instance variable on the class object itself,
  # not on SNP instances - confirm that is what was intended.
  @aligned = false

  # The SNP position inside the aligned sequences is simply the local position.
  def aligned_snp_position
    local_position
  end

  # Returns the parental sequences with the original base written into
  # entry "A" and the SNP base written into entry "B" at the local position.
  # The result is also stored in @aligned_sequences.
  def aligned_sequences
    sequences = parental_sequences
    @aligned_sequences = sequences
    sequences["A"][local_position] = original
    sequences["B"][local_position] = snp
    sequences
  end
end
78
+
79
+
80
+
81
+
82
+
83
# Script body: reads the SNP list given as first argument (optionally backed
# by an indexed FASTA reference given as second argument), writes the
# sequences and a primer3 input file into a timestamped output folder, runs
# primer3 and finally writes primers.csv.
snp_file = ARGV[0]
reference_file = ARGV[1]

# Validate the argument count before any file is touched, so a missing
# argument or an empty SNP list cannot bypass the check.
unless [1, 2].include?(ARGV.size)
  raise Bio::DB::Exonerate::ExonerateException, "Wrong number of arguments. "
end
with_reference = ARGV.size == 2

snp_in = "A"
original_name = "B"
target_chromosome = "PST130"
snps = []

#0. Load the fasta index
fasta_reference_db = nil
if reference_file
  fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference_file})
  fasta_reference_db.load_fai_entries
  p "Fasta reference: #{reference_file}"
end

#1. Read all the SNP files
#All the SNPs should be on the same chromosome as the first SNP.
chromosome = nil
File.open(snp_file) do |f|
  f.each_line do |line|
    snp = if with_reference #List and fasta file
            Bio::PolyploidTools::SNP.parse(line)
          else #List with Sequence
            Bio::PolyploidTools::SNPSequence.parse(line)
          end
    # Checked before the SNP is used, so an unparsable line is reported as such.
    raise Bio::DB::Exonerate::ExonerateException, "No SNP for line '#{line}'" if snp.nil?

    if with_reference
      region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
      snp.template_sequence = fasta_reference_db.fetch_sequence(region)
    end

    snp.snp_in = snp_in
    snp.original_name = original_name
    snps << snp
    chromosome ||= snp.chromosome
    if chromosome != snp.chromosome
      raise Bio::DB::Exonerate::ExonerateException, "All the snps should come from the same chromosome"
    end
  end
end

output_folder = "#{snp_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}/"
Dir.mkdir(output_folder)
seqs_file = output_folder + "sequences.fa"
written_seqs = Set.new
# Without an explicit reference the sequences written below act as reference.
reference_file ||= seqs_file

# One FASTA record per gene; the block form closes the file even on error.
File.open(seqs_file, "w") do |file|
  snps.each do |snp|
    next if written_seqs.include?(snp.gene)
    written_seqs << snp.gene
    file.puts snp.to_fasta
  end
end

container = HomokaryotContainer.new
container.add_parental({:name=>snp_in})
container.add_parental({:name=>original_name})
container.gene_models(reference_file)

primer_3_input = "#{output_folder}primer_3_input_temp"
primer_3_output = "#{output_folder}primer_3_output_temp"
container.add_snp_file(snp_file, target_chromosome, snp_in, original_name)
primer_3_config = File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config')
output_primers = "#{output_folder}primers.csv"

File.open(primer_3_input, "w") do |file|
  file.puts("PRIMER_PRODUCT_SIZE_RANGE=50-150")
  file.puts("PRIMER_MAX_SIZE=25")
  file.puts("PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS=1")
  file.puts("PRIMER_LIBERAL_BASE=1")
  file.puts("PRIMER_NUM_RETURN=5")
  file.puts("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=#{primer_3_config}/")

  container.print_primer_3_exons(file, target_chromosome, snp_in)
end

Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output})

#2. Pick the best primer and make the primer3 output
kasp_container = Bio::DB::Primer3::KASPContainer.new
kasp_container.line_1 = original_name
kasp_container.line_2 = snp_in

snps.each { |snp| kasp_container.add_snp(snp) }

kasp_container.add_primers_file(primer_3_output)
header = "Marker,SNP,RegionSize,SNP_type,#{snp_in},#{original_name},common,primer_type,orientation,#{snp_in}_TM,#{original_name}_TM,common_TM,selected_from,product_size"
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
@@ -0,0 +1,120 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
# Default settings; command-line switches below override them.
options = {
  identity: 50,
  min_bases: 200,
  split_token: "-",
  tmp_folder: Dir.mktmpdir,
  program: "blastn",
  random_sample: 0
}

# Command-line definition. "-s" used to be registered for both --cds and
# --split_token; it now belongs to --split_token only (the definition that
# was effectively winning) and --cds gets its own short flag "-c".
parser = OptionParser.new do |opts|

  opts.banner = "Usage: mafft_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--pep FILE" , "FASTA file containing all the possible peptide sequences. ") do |o|
    options[:pep] = o
  end

  opts.on("-c", "--cds FILE" , "FASTA file containing all the possible CDS sequences. ") do |o|
    options[:cds] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

end
parser.parse!
43
+
44
+
45
# Aligns the given sequences with the external "mafft" program and returns
# the alignment contained in its report.
#
# sequences_to_align - the sequences handed to Bio::MAFFT#query_align.
def peptide_alignment(sequences_to_align)
  mafft_flags = ['--maxiterate', '1000', '--localpair', '--quiet']
  aligner = Bio::MAFFT.new("mafft", mafft_flags)
  aligner.query_align(sequences_to_align).alignment
end
51
+
52
+
53
split_token = options[:split_token]

# Reads a FASTA file and keeps, for every gene, the longest entry found.
# The gene name is the part of the entry id that precedes +split_token+.
#
# fasta_path  - path of the FASTA file.
# split_token - separator between the gene name and the rest of the id.
#
# Returns [entries_by_gene, number_of_entries_read].
def longest_entry_per_gene(fasta_path, split_token)
  longest = {}
  entries_read = 0
  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |entry|
      gene_name = entry.entry_id.split(split_token)[0]
      known = longest[gene_name]
      longest[gene_name] = entry if !known || entry.length > known.length
      entries_read += 1
    end
  end
  [longest, entries_read]
end

# Peptide and CDS files are loaded by the same helper (this used to be two
# copies of the same loop).
pep_seq, pep_seq_count = longest_entry_per_gene(options[:pep], split_token)
$stderr.puts "#Loaded #{pep_seq.length} genes from #{pep_seq_count} pep_seq"

cds_seq, cds_seq_count = longest_entry_per_gene(options[:cds], split_token)
$stderr.puts "#Loaded #{cds_seq.length} genes from #{cds_seq_count} cds_seq"


$stderr.puts "TMP dir: #{options[:tmp_folder]}"
81
+
82
# Writes every key/value pair of +sequences+ to +filename+ as a FASTA record
# (">key" line followed by the value). An existing file is overwritten.
#
# sequences - object responding to +each_pair+ (name, sequence).
# filename  - path of the file to create.
#
# Returns nil. The block form of File.open guarantees the handle is closed
# even when iterating or writing raises.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |chromosome, exon_seq|
      out.puts ">#{chromosome}\n#{exon_seq}\n"
    end
  end
  nil
end
90
+
91
+
92
# For every triad row: align the three peptide sequences with MAFFT and save
# the peptide alignment plus the (unaligned) CDS sequences under
# alignments/<group_id / 100>/.
# NOTE(review): a gene missing from the CDS file makes +to_biosequence+ fail
# on nil - confirm the inputs always contain every triad member.
CSV.foreach(options[:triads], headers: true) do |row|
  members = %w[A B D].map { |column| row[column] }
  triad = row['group_id']

  to_align = Bio::Alignment::SequenceHash.new
  members.each { |gene| to_align[gene] = pep_seq[gene] }

  cds_seqs = Bio::Alignment::SequenceHash.new
  members.each { |gene| cds_seqs[gene] = cds_seq[gene].to_biosequence }

  # Triads are bucketed by hundreds to keep the folders small.
  folder = "alignments/#{triad.to_i / 100}/"
  FileUtils.mkdir_p(folder)

  pep_aln = peptide_alignment(to_align)
  write_fasta_from_hash(pep_aln, "#{folder}/#{triad}.pep.fa")
  write_fasta_from_hash(cds_seqs, "#{folder}/#{triad}.cds.fa")
  #break
end
@@ -0,0 +1,403 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
# Default settings; command-line switches below override them.
options = {
  identity: 50,
  min_bases: 200,
  split_token: "-",
  output_folder: ".",
  program: "blastn",
  random_sample: 0
}

# Command-line definition. The banner previously named an unrelated script
# (filter_blat.rb); it now names this one.
parser = OptionParser.new do |opts|

  opts.banner = "Usage: mafft_triads_promoters.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-o", "--output_folder DIR", "Folder to save the output") do |o|
    options[:output_folder] = o
  end

end
parser.parse!
51
+
52
# Extra alignment statistics added to BioRuby alignments: column iteration,
# slicing, sum-of-pairs / identity scores and sliding-window summaries.
#
# The alignment length and the two score totals are cached in instance
# variables on first use and are not refreshed if the alignment changes
# afterwards.
module Bio::Alignment::EnumerableExtension

  # Returns a new SequenceHash with the same properties in which every
  # sequence is replaced by its slice +[start, length]+. A nil sequence is
  # stored as an empty string.
  def cut_alignment(start, length)
    piece = Bio::Alignment::SequenceHash.new
    piece.set_all_property(get_all_property)
    each_pair do |key, str|
      seq = ""
      seq = str[start, length] unless str.nil?
      piece.store(key, seq)
    end
    piece
  end

  # Finds the stretch of columns with the highest accumulated sum-of-pairs
  # score; the running score restarts whenever it drops below zero.
  #
  # Returns [start, length, len - start - length, len - start].
  def best_block
    best_start = 0
    best_score = 0
    best_length = 0
    current_start = 0
    current_score = 0
    current_length = 0

    each_base_alignment_with_index do |bases, i|
      current_start = i if current_length == 0
      current_length += 1
      current_score += sum_of_pair(bases)
      if current_score > best_score
        best_score = current_score
        best_length = current_length
        best_start = current_start
      end

      if current_score < 0
        current_length = 0
        current_score = 0
      end
    end

    [best_start, best_length, len - best_start - best_length, len - best_start]
  end

  # Yields every column (one element per sequence, in key order) together
  # with its index.
  def each_base_alignment_with_index
    names = keys
    len.times do |i|
      yield names.map { |chr| self[chr][i] }, i
    end
  end

  # Yields every column (one element per sequence, in key order).
  # (An earlier duplicate definition of this method was shadowed by this one
  # and has been removed.)
  def each_base_alignment
    each_base_alignment_with_index do |bases, _i|
      yield bases
    end
  end

  # Sum of +sum_of_pair+ over all columns (cached).
  def sum_of_all_pairs
    return @sum_of_all_pairs if @sum_of_all_pairs
    @sum_of_all_pairs = 0
    each_base_alignment do |bases|
      @sum_of_all_pairs += sum_of_pair(bases)
    end
    @sum_of_all_pairs
  end

  # Sum of +s_o_i+ over all columns (cached).
  def sum_of_identities
    return @sum_of_identities if @sum_of_identities
    @sum_of_identities = 0
    each_base_alignment do |bases|
      @sum_of_identities += s_o_i(bases)
    end
    @sum_of_identities
  end

  # Length of the first sequence, or 0 when there is none (cached).
  def len
    return @len if @len
    first_name = keys[0]
    @len = 0
    @len = self[first_name].length if first_name && !self[first_name].nil?
    @len
  end

  # Number of distinct sequence pairs: n * (n - 1) / 2.
  def pairwise_comparaisons
    n = keys.size
    n * (n - 1) / 2
  end

  # Fraction of identical pairs over all columns and sequence pairs.
  def identity
    max_score = len * pairwise_comparaisons
    sum_of_identities.to_f / max_score
  end

  # Sum-of-pairs score divided by +len * pairwise_comparaisons+.
  def normalized_sum_of_all_pairs
    max_score = len * pairwise_comparaisons
    sum_of_all_pairs.to_f / max_score
  end

  # Scores one column by comparing every pair of its elements:
  #   gap/gap, "N"/"N", "n"/"n" ->  0
  #   gap against anything else -> -2
  #   identical                 -> +1
  #   different                 -> -1
  def sum_of_pair(bases)
    total = 0
    (0...bases.length).to_a.combination(2).each do |i, j|
      first = bases[i]
      second = bases[j]
      if (first == "-" && second == "-") ||
         (first == "N" && second == "N") ||
         (first == "n" && second == "n")
        next
      elsif first == "-" || second == "-"
        total -= 2
      elsif first == second
        total += 1
      else
        total -= 1
      end
    end
    total
  end

  # Number of identical pairs in one column.
  def s_o_i(bases)
    (0...bases.length).to_a.combination(2).count { |i, j| bases[i] == bases[j] }
  end

  # Scores windows of +window_size+ columns. Window ends are the multiples of
  # +offset+ in 0..len, each shifted by +len % offset+, visited from the
  # largest to the smallest; each window starts +window_size+ before its end.
  #
  # Returns one row per window:
  #   [i * offset, i * offset + window_size, sum_of_all_pairs,
  #    normalized_sum_of_all_pairs, sum_of_identities, identity]
  # where i is the position of the window in that visiting order.
  def window_identities(window_size=100, offset=25)
    window_ends = (0..len).step(offset).to_a.map { |a| a + len % offset }.reverse
    window_ends.each_with_index.map do |window_end, i|
      window = cut_alignment(window_end - window_size, window_size)
      [
        i * offset,
        i * offset + window_size,
        window.sum_of_all_pairs,
        window.normalized_sum_of_all_pairs,
        window.sum_of_identities,
        window.identity
      ]
    end
  end
end
223
+
224
# Aligns the given sequences with the external "mafft" program.
#
# When any of the values is nil nothing is aligned and the argument itself is
# returned unchanged. The MAFFT wrapper is created once and reused across
# calls through @mafft.
def promoter_alignment(sequences_to_align)
  all_present = true
  sequences_to_align.each_value { |sequence| all_present = false if sequence == nil }
  return sequences_to_align unless all_present

  #options = ['--maxiterate', '1000', '--ep', '0', '--genafpair', '--quiet']
  mafft_flags = ['--maxiterate', '1000', '--localpair', '--quiet']
  @mafft ||= Bio::MAFFT.new("mafft", mafft_flags)
  @mafft.query_align(sequences_to_align).alignment
end
234
+
235
# Writes every key/value pair of +sequences+ to +filename+ as a FASTA record
# (">key" line followed by the value). An existing file is overwritten.
#
# sequences - object responding to +each_pair+ (name, sequence).
# filename  - path of the file to create.
#
# Returns nil. The block form of File.open guarantees the handle is closed
# even when iterating or writing raises.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |chromosome, exon_seq|
      out.puts ">#{chromosome}\n#{exon_seq}\n"
    end
  end
  nil
end
242
+
243
# Finds the longest region of the alignment in which all sequences are
# present, tolerating runs of up to +max_gap+ consecutive columns where at
# least one sequence has a gap ("-").
#
# aln     - alignment responding to +keys+ and +[]+; every sequence must
#           respond to +length+ and +[index]+.
# max_gap - longest run of gapped columns allowed inside a region.
#
# Returns [start, length, len - start - length, len - start], where +length+
# is the number of fully covered columns plus the gapped columns lying inside
# the region. When no column is fully covered, start is -1 and length is 0.
#
# Gapped columns are only counted while a region is open. Previously gapped
# columns seen before the region started (leading gaps, or the tail of an
# over-long gap) were added to the reported length, which overshot the real
# region and could even exceed the alignment.
def get_longest_aln(aln, max_gap: 10)
  names = aln.keys
  len = 0
  len = aln[names[0]].length if names[0] && !aln[names[0]].nil?
  total_alignments = names.size

  longest_start = -1
  longest_length = 0
  longest_gaps = 0
  current_start = -1
  current_length = 0
  current_gap = 0
  gaps = 0

  len.times do |i|
    covered = names.count { |chr| aln[chr][i] != "-" }

    if covered == total_alignments
      current_start = i if current_length == 0
      current_length += 1
      current_gap = 0
    else
      current_gap += 1
      # Only gaps inside an open region belong to that region.
      gaps += 1 if current_length > 0
    end

    if current_length > longest_length
      longest_length = current_length
      longest_start = current_start
      longest_gaps = gaps - current_gap
    end

    # Too many consecutive gapped columns: close the current region.
    if current_gap > max_gap
      current_length = 0
      gaps = 0
    end
  end

  longest_length += longest_gaps
  [longest_start, longest_length, len - longest_start - longest_length, len - longest_start]
end
288
+
289
# Separator between the gene name and the rest of a FASTA entry id.
split_token = options[:split_token]

# Reads a FASTA file and keeps, for every gene, the longest entry found.
# The gene name is the part of the entry id that precedes +split_token+.
#
# fasta_path  - path of the FASTA file.
# split_token - separator between the gene name and the rest of the id.
#
# Returns [entries_by_gene, number_of_entries_read].
def read_alignments(fasta_path, split_token)
  longest_by_gene = {}
  entries_read = 0
  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |entry|
      #puts entry
      gene_name = entry.entry_id.split(split_token).first
      known = longest_by_gene[gene_name]
      longest_by_gene[gene_name] = entry if !known || entry.length > known.length
      entries_read += 1
    end
  end
  [longest_by_gene, entries_read]
end
305
+
306
# Script body: for every triad, align (or reload) the three promoter
# sequences, compute whole-alignment and sliding-window statistics, and save
# the alignment plus its "longest" and "best" sub-regions as FASTA.
sequences, sequence_count = read_alignments(options[:fasta], split_token)

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
output_folder = options[:output_folder]

FileUtils.mkdir_p output_folder
summary_file = "#{output_folder}/identities.txt"
long_table_file = "#{output_folder}/sliding_window_identities.txt"

# The nested arrays are flattened by Array#join when the header is printed.
header = ["triad", "total_aln_length"]
header << ["longest_start", "longest_length", "longest_start_from_CDS","longest_end_from_CDS", "longest_sum_of_all_pairs","longest_norm_sum_of_all_pairs","longest_sum_of_identities", "longest_identity"]
header << ["best_start", "best_length" , "best_start_from_CDS","best_end_from_CDS", "best_sum_of_all_pairs","best_norm_sum_of_all_pairs","best_sum_of_identities", "best_identity"]
window_header = ["triad", "type", "start_from_CDS", "end_from_cds" , "sum_of_all_pairs","norm_sum_of_all_pairs","sum_of_identities", "identity"]

# Block form: both report files are closed even if a triad fails half-way.
File.open(summary_file, "w") do |out|
  File.open(long_table_file, "w") do |long_table|
    out.puts header.join("\t")
    long_table.puts window_header.join("\t")

    CSV.foreach(options[:triads], headers: true) do |row|
      a = row['A']
      b = row['B']
      d = row['D']
      triad = row['group_id']

      # Triads are bucketed by hundreds to keep the folders small.
      cent_triad = triad.to_i / 100
      folder = "#{output_folder}/prom_aln/#{cent_triad}/"
      save_prom = "#{folder}/#{triad}.prom.fa"

      to_align = Bio::Alignment::SequenceHash.new
      to_align[a] = sequences[a]
      to_align[b] = sequences[b]
      to_align[d] = sequences[d]

      # Reuse a previously saved alignment when there is one.
      prom_aln = if File.file?(save_prom)
                   cached, = read_alignments(save_prom, split_token)
                   Bio::Alignment.new(cached)
                 else
                   promoter_alignment(to_align)
                 end

      print_arr = [triad, prom_aln.len]
      aln_stats = get_longest_aln prom_aln
      print_arr << aln_stats
      cut_seqs = prom_aln.cut_alignment aln_stats[0], aln_stats[1]

      print_arr << cut_seqs.sum_of_all_pairs
      print_arr << cut_seqs.normalized_sum_of_all_pairs
      print_arr << cut_seqs.sum_of_identities
      print_arr << cut_seqs.identity

      best_aln_stats = prom_aln.best_block
      best_aln_cut = prom_aln.cut_alignment best_aln_stats[0], best_aln_stats[1]

      print_arr << best_aln_stats
      print_arr << best_aln_cut.sum_of_all_pairs
      print_arr << best_aln_cut.normalized_sum_of_all_pairs
      print_arr << best_aln_cut.sum_of_identities
      print_arr << best_aln_cut.identity

      # One sliding-window table per region, in this fixed order.
      {
        "cut_longest_region" => cut_seqs,
        "cut_best_region" => best_aln_cut,
        "full_promoter" => prom_aln
      }.each do |label, alignment|
        alignment.window_identities.each do |window|
          long_table.puts [triad, label, window].flatten.join("\t")
        end
      end

      out.puts print_arr.join("\t")

      FileUtils.mkdir_p folder

      write_fasta_from_hash(prom_aln, save_prom) unless File.file?(save_prom)

      # The guard used to test save_prom, which has just been written above,
      # so the cut alignment was never saved. Test the cut file itself.
      save_prom_cut = "#{folder}/#{triad}.prom.cut.fa"
      write_fasta_from_hash(cut_seqs, save_prom_cut) unless File.file?(save_prom_cut)

      save_prom_cut_best = "#{folder}/#{triad}.prom.cut.best.fa"
      write_fasta_from_hash(best_aln_cut, save_prom_cut_best)
    end
  end
end