bio-polymarker 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools-wrapper'
6
+
7
+ require 'set'
8
+
9
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
10
+ $: << File.expand_path('.')
11
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
12
+ require path
13
+
14
+
15
+ #@snp_map=Hash.new
16
+
17
# Exon container used by this script: every line of the SNP list becomes a
# marker that is stored, grouped by gene, in @snp_map.
# NOTE(review): @snp_map is not initialised in this class; presumably the
# parent Bio::PolyploidTools::ExonContainer creates it -- confirm.
class HomokaryotContainer < Bio::PolyploidTools::ExonContainer

  # Parses each line of +filename+ into a SNP and registers it in @snp_map.
  #
  # The parser depends on the number of command-line arguments:
  # one argument  -> Bio::PolyploidTools::SNPSequence (use_reference = false),
  # two arguments -> Bio::PolyploidTools::SNP (use_reference = true).
  # Only SNPs whose position is greater than zero are stored.
  #
  # filename      - path of the SNP list.
  # chromosome    - value assigned to snp.chromosome.
  # snp_in        - value assigned to snp.snp_in.
  # original_name - value assigned to snp.original_name.
  #
  # Raises Bio::DB::Exonerate::ExonerateException when the argument count is
  # neither 1 nor 2, or when a line cannot be parsed into a SNP (these cases
  # previously surfaced as NoMethodError on nil).
  def add_snp_file(filename, chromosome, snp_in, original_name)
    flanking_size = 100
    with_sequence = ARGV.size == 1
    with_reference = ARGV.size == 2
    unless with_sequence || with_reference
      raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
    end

    File.open(filename) do |f|
      f.each_line do |line|
        snp = if with_reference
                Bio::PolyploidTools::SNP.parse(line)
              else
                Bio::PolyploidTools::SNPSequence.parse(line)
              end
        raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp.nil?

        snp.use_reference = with_reference
        snp.flanking_size = flanking_size
        next unless snp.position > 0

        snp.container = self
        snp.chromosome = chromosome
        snp.snp_in = snp_in
        snp.original_name = original_name
        (@snp_map[snp.gene] ||= []) << snp
      end
    end
  end

  # Writes the primer3 record of every stored SNP to +file+.
  #
  # file              - IO-like object receiving the records via #puts.
  # target_chromosome - accepted for interface compatibility; the chromosome
  #                     stored on each SNP is what is actually used.
  # parental          - passed through to snp.primer_3_string.
  def print_primer_3_exons(file, target_chromosome, parental)
    @snp_map.each_value do |snp_array|
      snp_array.each do |snp|
        string = snp.primer_3_string(snp.chromosome, parental)
        # Empty records are not written.
        file.puts string if string.size > 0
      end
    end
  end
end
60
+
61
# Script-local overrides for SNP: no alignment is computed here, so the
# "aligned" accessors are answered from the SNP's own position and from the
# parental sequences.
class Bio::PolyploidTools::SNP

  # Set on the class object itself (not on instances).
  @aligned = false

  # The aligned position is simply the local position.
  def aligned_snp_position
    local_position
  end

  # Takes the parental sequences and writes the original base into "A" and
  # the SNP base into "B" at the local position; the result is kept in
  # @aligned_sequences and returned.
  def aligned_sequences
    @aligned_sequences = parental_sequences
    sequence_a = @aligned_sequences["A"]
    sequence_a[local_position] = original
    sequence_b = @aligned_sequences["B"]
    sequence_b[local_position] = snp
    @aligned_sequences
  end
end
78
+
79
+
80
+
81
+
82
+
83
# Main script.
# Usage: homokaryot_primers.rb SNP_LIST [REFERENCE_FASTA]
# With one argument each SNP line carries its own sequence; with two, the
# template sequence is fetched from the indexed reference FASTA.
snp_file = ARGV[0]
reference_file = ARGV[1]

# Validate the argument count before touching any file (it used to be checked
# only while reading the first line of the SNP list).
unless [1, 2].include?(ARGV.size)
  raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
end
with_reference = ARGV.size == 2

snp_in = "A"
original_name = "B"
target_chromosome = "PST130"
snps = []

#0. Load the fasta index
fasta_reference_db = nil
if reference_file
  fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference_file})
  fasta_reference_db.load_fai_entries
  p "Fasta reference: #{reference_file}"
end

#1. Read all the SNP files
#All the SNPs should be on the same chromosome as the first SNP.
chromosome = nil
File.open(snp_file) do |f|
  f.each_line do |line|
    snp = if with_reference
            Bio::PolyploidTools::SNP.parse(line)
          else
            Bio::PolyploidTools::SNPSequence.parse(line)
          end
    # Check for a failed parse before the SNP is used to look up its template.
    raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp.nil?

    if with_reference
      region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
      snp.template_sequence = fasta_reference_db.fetch_sequence(region)
    end
    snp.snp_in = snp_in
    snp.original_name = original_name
    snps << snp
    chromosome ||= snp.chromosome
    if chromosome != snp.chromosome
      raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome"
    end
  end
end

# Every run writes into a fresh, time-stamped folder next to the SNP list.
output_folder = "#{snp_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}/"
Dir.mkdir(output_folder)
seqs_file = output_folder + "sequences.fa"
written_seqs = Set.new
# Without a reference, the sequences written below act as the reference.
reference_file ||= seqs_file

# One FASTA record per gene; Set#add? returns nil for genes already written.
File.open(seqs_file, "w") do |file|
  snps.each do |snp|
    file.puts snp.to_fasta if written_seqs.add?(snp.gene)
  end
end

container = HomokaryotContainer.new
container.add_parental({:name=>snp_in})
container.add_parental({:name=>original_name})
container.gene_models(reference_file)

primer_3_input = "#{output_folder}primer_3_input_temp"
primer_3_output = "#{output_folder}primer_3_output_temp"
container.add_snp_file(snp_file, target_chromosome, snp_in, original_name)
primer_3_config = File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config')
output_primers = "#{output_folder}primers.csv"

# Global primer3 settings followed by one record per SNP.
File.open(primer_3_input, "w") do |file|
  file.puts("PRIMER_PRODUCT_SIZE_RANGE=50-150")
  file.puts("PRIMER_MAX_SIZE=25")
  file.puts("PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS=1")
  file.puts("PRIMER_LIBERAL_BASE=1")
  file.puts("PRIMER_NUM_RETURN=5")
  file.puts("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=#{primer_3_config}/")

  container.print_primer_3_exons(file, target_chromosome, snp_in)
end

Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output})

#2. Pick the best primer and make the primer3 output
kasp_container = Bio::DB::Primer3::KASPContainer.new
kasp_container.line_1 = original_name
kasp_container.line_2 = snp_in

snps.each { |snp| kasp_container.add_snp(snp) }

kasp_container.add_primers_file(primer_3_output)
header = "Marker,SNP,RegionSize,SNP_type,#{snp_in},#{original_name},common,primer_type,orientation,#{snp_in}_TM,#{original_name}_TM,common_TM,selected_from,product_size"
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
@@ -0,0 +1,120 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
# Default settings; command-line switches below override them.
# :tmp_folder creates a fresh temporary directory on every run.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder] = Dir.mktmpdir
options[:program] = "blastn"
options[:random_sample] = 0

parser = OptionParser.new do |opts|

  opts.banner = "Usage: mafft_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--pep FILE" , "FASTA file containing all the possible peptide sequences. ") do |o|
    options[:pep] = o
  end

  # The CDS file used to share the short switch "-s" with --split_token; the
  # later definition won, so "-s FILE" silently changed the split token.
  # "-s" keeps its effective meaning (split token) and CDS gets "-c".
  opts.on("-c", "--cds FILE" , "FASTA file containing all the possible CDS sequences. ") do |o|
    options[:cds] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

end
parser.parse!
43
+
44
+
45
# Aligns the given sequences with the external "mafft" program
# (--localpair, up to 1000 refinement iterations, quiet) and returns the
# alignment taken from the resulting report.
def peptide_alignment(sequences_to_align)
  mafft_flags = %w[--maxiterate 1000 --localpair --quiet]
  aligner = Bio::MAFFT.new("mafft", mafft_flags)
  aligner.query_align(sequences_to_align).alignment
end
51
+
52
+
53
split_token = options[:split_token]

# Reads a FASTA file and keeps one entry per gene: the longest one.
# The gene name is the part of the entry id before the first +split_token+.
#
# Returns [entries_by_gene, number_of_entries_read].
def load_longest_entries(fasta_path, split_token)
  longest = {}
  total = 0
  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |entry|
      gene_name = entry.entry_id.split(split_token)[0]
      known = longest[gene_name]
      longest[gene_name] = entry if known.nil? || entry.length > known.length
      total += 1
    end
  end
  [longest, total]
end

# Both FASTA files are needed below; stop with a readable message instead of
# failing inside the FASTA reader when one was not given.
[:pep, :cds].each do |required|
  abort "Missing required option --#{required} FILE" unless options[required]
end

pep_seq, pep_seq_count = load_longest_entries(options[:pep], split_token)
$stderr.puts "#Loaded #{pep_seq.length} genes from #{pep_seq_count} pep_seq"

cds_seq, cds_seq_count = load_longest_entries(options[:cds], split_token)
$stderr.puts "#Loaded #{cds_seq.length} genes from #{cds_seq_count} cds_seq"


$stderr.puts "TMP dir: #{options[:tmp_folder]}"
81
+
82
# Writes +sequences+ to +filename+ in FASTA format, one record per pair.
#
# sequences - object responding to #each_pair that yields (name, sequence).
# filename  - path of the file to create or overwrite.
#
# The block form of File.open closes the handle even when writing raises.
# Returns nil.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |name, sequence|
      out.puts ">#{name}\n#{sequence}\n"
    end
  end
  nil
end
90
+
91
+
92
# For every triad (columns 'A', 'B', 'D' and 'group_id' of the triads CSV):
# align the three peptides with MAFFT and save both the peptide alignment and
# the unaligned CDS sequences under alignments/<group_id / 100>/.
CSV.foreach(options[:triads], headers: true) do |row|
  genes = [row['A'], row['B'], row['D']]
  triad = row['group_id']

  # A gene absent from either FASTA used to abort the whole run with a
  # NoMethodError on nil; report the triad and continue with the next one.
  missing = genes.reject { |gene| pep_seq[gene] && cds_seq[gene] }
  unless missing.empty?
    $stderr.puts "WARN: skipping triad #{triad}: no pep/cds sequence for #{missing.join(', ')}"
    next
  end

  to_align = Bio::Alignment::SequenceHash.new
  cds_seqs = Bio::Alignment::SequenceHash.new
  genes.each do |gene|
    to_align[gene] = pep_seq[gene]
    cds_seqs[gene] = cds_seq[gene].to_biosequence
  end

  # Triads are grouped in folders of 100 to keep directories small.
  cent_triad = triad.to_i / 100
  folder = "alignments/#{cent_triad}/"
  FileUtils.mkdir_p folder

  pep_aln = peptide_alignment(to_align)

  write_fasta_from_hash(pep_aln, File.join(folder, "#{triad}.pep.fa"))
  write_fasta_from_hash(cds_seqs, File.join(folder, "#{triad}.cds.fa"))
end
@@ -0,0 +1,403 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
# Default settings; command-line switches below override them.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:output_folder] = "."
options[:program] = "blastn"
options[:random_sample] = 0

parser = OptionParser.new do |opts|

  # The banner used to name a different script (filter_blat.rb).
  opts.banner = "Usage: mafft_triads_promoters.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-o", "--output_folder DIR", "Folder to save the output") do |o|
    options[:output_folder] = o
  end

end
parser.parse!
51
+
52
module Bio
  module Alignment
    # Column-wise scoring helpers added to BioRuby alignments.
    #
    # The receiver must behave like a hash of name => sequence (#keys, #[],
    # #each_pair) and provide #get_all_property. All sequences are assumed to
    # have the length of the first one.
    # #len, #sum_of_all_pairs and #sum_of_identities cache their result in
    # instance variables and are not recomputed if the alignment changes.
    module EnumerableExtension
      # Returns a new Bio::Alignment::SequenceHash holding, for every
      # sequence, the slice str[start, length] (or "" for a nil sequence).
      def cut_alignment(start, length)
        cut = Bio::Alignment::SequenceHash.new
        cut.set_all_property(get_all_property)
        each_pair do |key, str|
          cut.store(key, str.nil? ? "" : str[start, length])
        end
        cut
      end

      # Finds the contiguous block of columns with the highest cumulative
      # sum-of-pairs score; the running block restarts whenever its score
      # drops below zero.
      #
      # Returns [start, length, len - start - length, len - start].
      def best_block
        best_start = 0
        best_score = 0
        best_length = 0
        current_start = 0
        current_score = 0
        current_length = 0

        each_base_alignment_with_index do |bases, i|
          current_start = i if current_length == 0
          current_length += 1
          current_score += sum_of_pair(bases)
          if current_score > best_score
            best_score = current_score
            best_length = current_length
            best_start = current_start
          end

          if current_score < 0
            current_length = 0
            current_score = 0
          end
        end

        [best_start, best_length, len - best_start - best_length, len - best_start]
      end

      # Yields (bases, index) for every column, where bases holds the
      # character of each sequence at that column, in key order.
      def each_base_alignment_with_index
        names = keys
        i = 0
        while i < len
          yield names.map { |chr| self[chr][i] }, i
          i += 1
        end
      end

      # Yields the bases of every column.
      def each_base_alignment
        each_base_alignment_with_index { |bases, _i| yield bases }
      end

      # Sum-of-pairs score over all columns (cached).
      def sum_of_all_pairs
        return @sum_of_all_pairs if @sum_of_all_pairs
        @sum_of_all_pairs = 0
        each_base_alignment { |bases| @sum_of_all_pairs += sum_of_pair(bases) }
        @sum_of_all_pairs
      end

      # Number of identical pairs over all columns (cached).
      def sum_of_identities
        return @sum_of_identities if @sum_of_identities
        @sum_of_identities = 0
        each_base_alignment { |bases| @sum_of_identities += s_o_i(bases) }
        @sum_of_identities
      end

      # Length of the first sequence, or 0 when there is none (cached).
      def len
        return @len if @len
        first_name = keys[0]
        first_seq = first_name ? self[first_name] : nil
        @len = first_seq.nil? ? 0 : first_seq.length
      end

      # Number of unordered sequence pairs: n * (n - 1) / 2.
      def pairwise_comparaisons
        n = keys.size
        n * (n - 1) / 2
      end

      # Fraction of identical pairs over all columns and pairs.
      # NaN when the alignment is empty (0.0 / 0).
      def identity
        max_score = len * pairwise_comparaisons
        sum_of_identities.to_f / max_score
      end

      # Sum-of-pairs score divided by the number of compared pairs.
      def normalized_sum_of_all_pairs
        max_score = len * pairwise_comparaisons
        sum_of_all_pairs.to_f / max_score
      end

      # Scores one column: for every unordered pair, two gaps / two "N" /
      # two "n" score 0, a gap against anything else -2, a match +1 and a
      # mismatch -1.
      def sum_of_pair(bases)
        bases.combination(2).reduce(0) do |total, (x, y)|
          score = if x == y
                    ["-", "N", "n"].include?(x) ? 0 : 1
                  elsif x == "-" || y == "-"
                    -2
                  else
                    -1
                  end
          total + score
        end
      end

      # Counts the unordered pairs of one column that are identical.
      def s_o_i(bases)
        bases.combination(2).count { |x, y| x == y }
      end

      # Scores windows of +window_size+ columns whose ends are +offset+
      # columns apart, starting with the window that ends at the last column.
      #
      # Each row is [i * offset, i * offset + window_size, sum_of_all_pairs,
      # normalized_sum_of_all_pairs, sum_of_identities, identity], so the
      # first two values count columns back from the end of the alignment.
      #
      # A window that would start before column 0 is truncated at column 0.
      # Previously the negative start was handed to String#[], which counts
      # from the END of the string, so those rows were computed from the
      # wrong region.
      def window_identities(window_size=100, offset=25)
        window_ends = (0..len).step(offset).to_a.map { |a| a + len % offset }.reverse
        window_ends.each_with_index.map do |window_end, i|
          start = window_end - window_size
          length = window_size
          if start < 0
            length += start
            start = 0
          end
          tmp_aln = cut_alignment(start, length)
          [
            i * offset,
            i * offset + window_size,
            tmp_aln.sum_of_all_pairs,
            tmp_aln.normalized_sum_of_all_pairs,
            tmp_aln.sum_of_identities,
            tmp_aln.identity
          ]
        end
      end
    end
  end
end
223
+
224
# Aligns the given sequences with the external "mafft" program and returns
# the alignment from its report. When any sequence is nil the input is handed
# back untouched and mafft is not run. The MAFFT wrapper is created once and
# reused through @mafft.
def promoter_alignment(sequences_to_align)
  sequences_to_align.each_value do |sequence|
    return sequences_to_align if sequence.nil?
  end
  #options = ['--maxiterate', '1000', '--ep', '0', '--genafpair', '--quiet']
  mafft_flags = %w[--maxiterate 1000 --localpair --quiet]
  @mafft ||= Bio::MAFFT.new("mafft", mafft_flags)
  @mafft.query_align(sequences_to_align).alignment
end
234
+
235
# Writes +sequences+ to +filename+ in FASTA format, one record per pair.
#
# sequences - object responding to #each_pair that yields (name, sequence).
# filename  - path of the file to create or overwrite.
#
# The block form of File.open closes the handle even when writing raises.
# Returns nil.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |name, sequence|
      out.puts ">#{name}\n#{sequence}\n"
    end
  end
  nil
end
242
+
243
# Finds the longest region of +aln+ in which every sequence is covered
# (no "-" in the column), tolerating runs of up to +max_gap+ gap columns
# inside the region.
#
# aln     - hash-like object of name => aligned sequence; the length of the
#           first sequence is used as the alignment length.
# max_gap - longest run of gap columns allowed inside a region.
#
# Returns [start, length, len - start - length, len - start], where length
# spans the covered columns plus the gap columns between them. start is -1
# when no covered column exists.
def get_longest_aln(aln, max_gap: 10)
  names = aln.keys
  first_seq = names[0] ? aln[names[0]] : nil
  len = first_seq.nil? ? 0 : first_seq.length

  longest_start = -1
  longest_length = 0 # covered columns in the best region
  longest_gaps = 0   # gap columns inside the best region
  current_start = -1
  current_length = 0
  current_gap = 0    # length of the gap run we are currently in
  gaps = 0           # gap columns seen since the current region started

  len.times do |i|
    if names.all? { |chr| aln[chr][i] != "-" }
      if current_length == 0
        current_start = i
        # Gap columns seen before the region starts are not part of it.
        # They used to be counted, which inflated the reported length (and
        # made the third result negative) for alignments starting with gaps.
        gaps = 0
      end
      current_length += 1
      current_gap = 0
    else
      gaps += 1
      current_gap += 1
    end

    if current_length > longest_length
      longest_length = current_length
      longest_start = current_start
      # Exclude the gap run in progress: it may turn out to end the region.
      longest_gaps = gaps - current_gap
    end

    # Too long a gap run closes the current region.
    if current_gap > max_gap
      current_length = 0
      gaps = 0
    end
  end

  longest_length += longest_gaps
  [longest_start, longest_length, len - longest_start - longest_length, len - longest_start]
end
288
+
289
+ split_token = options[:split_token] # Separator passed to read_alignments, which keeps the part of each entry id before it as the gene name.
290
+
291
# Reads the FASTA file at +fasta_path+ and keeps a single entry per gene:
# the longest one (the first wins on ties). The gene name is whatever
# precedes the first +split_token+ in the entry id.
#
# Returns [entries_by_gene, number_of_entries_read].
def read_alignments(fasta_path, split_token)
  longest_by_gene = {}
  total_entries = 0
  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |entry|
      total_entries += 1
      gene_name = entry.entry_id.split(split_token)[0]
      known = longest_by_gene[gene_name]
      next unless known.nil? || entry.length > known.length
      longest_by_gene[gene_name] = entry
    end
  end
  [longest_by_gene, total_entries]
end
305
+
306
# Main script: for every triad, align the three promoter sequences (or reuse
# a previously saved alignment), score the longest gap-tolerant region, the
# best-scoring block and the full alignment, and write a per-triad summary
# plus a sliding-window table.
sequences, sequence_count = read_alignments(options[:fasta], split_token)

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
output_folder = options[:output_folder]

FileUtils.mkdir_p output_folder
summary_file = "#{output_folder}/identities.txt"
long_table_file = "#{output_folder}/sliding_window_identities.txt"

# The same eight statistics are reported for the "longest" and "best" regions.
region_columns = ["start", "length", "start_from_CDS", "end_from_CDS", "sum_of_all_pairs", "norm_sum_of_all_pairs", "sum_of_identities", "identity"]
header = ["triad", "total_aln_length"]
["longest", "best"].each do |prefix|
  header.concat(region_columns.map { |column| "#{prefix}_#{column}" })
end

# Block form so both tables are closed even if a triad fails.
File.open(summary_file, "w") do |out|
  File.open(long_table_file, "w") do |long_table|
    out.puts header.join("\t")
    long_table.puts ["triad", "type", "start_from_CDS", "end_from_cds" , "sum_of_all_pairs","norm_sum_of_all_pairs","sum_of_identities", "identity"].join("\t")

    CSV.foreach(options[:triads], headers: true) do |row|
      a = row['A']
      b = row['B']
      d = row['D']
      triad = row['group_id']

      # Triads are grouped in folders of 100.
      cent_triad = triad.to_i / 100
      folder = "#{output_folder}/prom_aln/#{cent_triad}/"
      save_prom = "#{folder}/#{triad}.prom.fa"

      to_align = Bio::Alignment::SequenceHash.new
      to_align[a] = sequences[a]
      to_align[b] = sequences[b]
      to_align[d] = sequences[d]

      # Reuse the saved alignment when there is one; otherwise run MAFFT.
      prom_aln = if File.file?(save_prom)
                   saved_entries, _saved_count = read_alignments(save_prom, split_token)
                   Bio::Alignment.new(saved_entries)
                 else
                   promoter_alignment(to_align)
                 end

      print_arr = [triad, prom_aln.len]
      aln_stats = get_longest_aln(prom_aln)
      print_arr << aln_stats
      cut_seqs = prom_aln.cut_alignment(aln_stats[0], aln_stats[1])

      print_arr << cut_seqs.sum_of_all_pairs
      print_arr << cut_seqs.normalized_sum_of_all_pairs
      print_arr << cut_seqs.sum_of_identities
      print_arr << cut_seqs.identity

      best_aln_stats = prom_aln.best_block
      best_aln_cut = prom_aln.cut_alignment(best_aln_stats[0], best_aln_stats[1])

      print_arr << best_aln_stats
      print_arr << best_aln_cut.sum_of_all_pairs
      print_arr << best_aln_cut.normalized_sum_of_all_pairs
      print_arr << best_aln_cut.sum_of_identities
      print_arr << best_aln_cut.identity

      # Sliding-window rows, in the same order as before.
      windowed = {
        "cut_longest_region" => cut_seqs,
        "cut_best_region" => best_aln_cut,
        "full_promoter" => prom_aln
      }
      windowed.each do |type, alignment|
        alignment.window_identities.each do |window|
          long_table.puts [triad, type, window].flatten.join("\t")
        end
      end

      # print_arr holds nested arrays; flatten them into one row.
      out.puts print_arr.flatten.join("\t")

      FileUtils.mkdir_p folder

      write_fasta_from_hash(prom_aln, save_prom) unless File.file?(save_prom)

      # This used to test save_prom, which always exists by now, so the cut
      # alignment was never written. Test the file that is being written.
      save_prom_cut = "#{folder}/#{triad}.prom.cut.fa"
      write_fasta_from_hash(cut_seqs, save_prom_cut) unless File.file?(save_prom_cut)

      save_prom_cut_best = "#{folder}/#{triad}.prom.cut.best.fa"
      write_fasta_from_hash(best_aln_cut, save_prom_cut_best)
    end
  end
end