bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,153 @@
1
+
2
+
3
class Bio::Blat
  # Runs the external +blat+ program and parses the report it writes.
  #
  # database - target sequences, interpolated into the command line.
  # query    - query sequences, interpolated into the command line.
  # output   - path where blat writes its report; it is read back afterwards.
  #
  # When a block is given every hit is yielded and nil is returned;
  # otherwise an Array with all the hits is returned.
  #
  # Raises Exception when blat finishes with a non-zero exit status.
  def self.align(database, query, output)
    # NOTE(review): the arguments are interpolated unescaped into a shell
    # command; callers must not pass untrusted paths.
    cmdline = "blat #{database} #{query} #{output}"
    # Log the command on stderr only. The previous `puts $stderr.puts ...`
    # also emitted an empty line on stdout.
    $stderr.puts cmdline
    status, _stdout, stderr = systemu cmdline
    unless status.exitstatus == 0
      raise Exception, "Error running blat. Command line was '#{cmdline}'\nBlat STDERR was:\n#{stderr}"
    end

    blat_aln = Bio::Blat::Report.new(Bio::FlatFile.open(output).to_io)
    if block_given?
      blat_aln.each_hit { |hit| yield hit }
      return nil
    end

    alns = []
    blat_aln.each_hit { |hit| alns << hit }
    alns
  end
end
25
+
26
class Bio::Blat::Report::Hit

  # Third '_'-separated field of the target id, e.g. "1AL" for
  # "IWGSC_CSS_1AL_scaff_110". The value is cached after the first call;
  # nil when the target id has fewer than three fields.
  def wheat_chr_arm
    @wheat_chr_arm ||= target_id.split('_')[2]
  end

  # First two characters of the arm field, e.g. "1A".
  def wheat_chr
    wheat_chr_arm[0, 2]
  end

  # First character of the arm field, e.g. "1".
  #
  # Raises Exception when the target id carries no arm field. The check is
  # made on wheat_chr_arm itself so that it is reached before any indexing
  # on nil.
  def wheat_chr_group
    raise Exception, "No wheat group for #{target_id} #{self.inspect}" unless wheat_chr_arm
    wheat_chr_arm[0]
  end

  # Second character of the arm field, e.g. "A".
  def wheat_genome
    wheat_chr_arm[1]
  end

  # Third character of the arm field, e.g. "L".
  def wheat_arm
    wheat_chr_arm[2]
  end

  # Matches plus mismatches as a percentage (Float) of the query length.
  def percentage_covered
    (match + mismatch) * 100.0 / query_len.to_f
  end

end
56
+
57
+
58
class Hash
  # Flattens the hash into a single String.
  #
  # keyvaldelim - separator placed between each key and its value.
  # entrydelim  - separator placed between consecutive entries.
  #
  # Both default to the global output field separator ($,).
  def join(keyvaldelim=$,, entrydelim=$,)
    joined_entries = []
    each_pair do |key, value|
      joined_entries << [key, value].join(keyvaldelim)
    end
    joined_entries.join(entrydelim)
  end
end
63
+
64
+
65
class Bio::NucleicAcid

  # Two-way lookup table: an ambiguity code maps to the (sorted) bases it
  # stands for, and a sorted string of bases maps back to its code.
  # Everything is lower case.
  IUPAC_CODES ||= {

    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',

    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',

    'n' => 'acgt',

    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u',

    'ct' => 'y',
    'ag' => 'r',
    'at' => 'w',
    'cg' => 's',
    'gt' => 'k',
    'ac' => 'm',

    'cgt' => 'b',
    'agt' => 'd',
    'act' => 'h',
    'acg' => 'v',

    'acgt' => 'n'
  }


  # Truthy when +base+ appears literally in "acgtACGT".
  #
  # A literal containment test is used instead of String#match, which
  # compiled +base+ as a regular expression: "." was then reported as
  # unambiguous and "[" raised a RegexpError.
  def self.is_unambiguous(base)
    "acgtACGT".include?(base.to_s)
  end

  # Returns the upper-case IUPAC code for a set of bases, e.g. "AG" -> "R".
  # The input is lower-cased, sorted and de-duplicated before the lookup.
  # Unknown combinations are reported and mapped to "N".
  def self.to_IUAPC(bases)
    base = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
    if base.nil?
      # Report the offending input; the lookup result is always nil here.
      p "Invalid base! #{bases}"
      base = 'n' #This is a patch... as one of the scripts failed here.
    end
    base.upcase
  end

  # True when +base+ is one of the characters listed for +code+ in
  # IUPAC_CODES (case-insensitive). A code missing from the table is
  # treated as matching nothing instead of raising NoMethodError.
  def self.is_valid(code, base)
    IUPAC_CODES.fetch(code.downcase, '').chars.include?(base.downcase)
  end

end
123
+
124
+ #Monkey patching to Bio::Sequence to find snps between sequences. It assumes the
125
+ #sequences are already aligned and doesn't check if a base on the first sequence is
126
+ #valid on the second.
127
class Bio::Sequence
  # Counts the positions, over the length of +seq1+, at which the two
  # sequences hold different elements. No alignment is performed.
  def self.snps_between(seq1, seq2)
    (0...seq1.size).count { |position| seq1[position] != seq2[position] }
  end
end
136
+
137
class String
  # Number of characters for which Bio::NucleicAcid.is_unambiguous
  # returns a falsy value.
  def count_ambiguities
    each_char.count { |base| !Bio::NucleicAcid.is_unambiguous(base) }
  end

  # Length of the leading run of characters that are NOT in A-Z, i.e. the
  # offset of the first upper-case letter (the whole length when there is
  # none).
  # NOTE(review): the method name suggests a count of upper-case
  # characters; confirm with callers which meaning is intended.
  def upper_case_count
    self[/[^A-Z]*/].size
  end
end
@@ -0,0 +1,63 @@
1
module Bio::PolyploidTools
  # Registry of lambdas that extract a chromosome (arm) label from a
  # contig name. Each lambda takes the contig name and returns the label.
  class ChromosomeArm

    @@arm_selection_functions = Hash.new

    # Example format: chr2A -> "2A" (two characters starting at offset 3).
    @@arm_selection_functions[:nrgene] = lambda do |contig_name|
      contig_name[3, 2]
    end

    # First two characters once every "chr" occurrence is removed.
    # Works on a copy: the previous gsub! modified the caller's string and
    # raised on frozen strings.
    @@arm_selection_functions[:first_two] = lambda do |contig_name|
      contig_name.gsub(/chr/, "")[0, 2]
    end

    # Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
    # Or the first two characters in the contig name, to deal with
    # pseudomolecules that start with headers like: "1A"
    # And with the cases when 3B is named with the prefix: v443
    # Anything else yields "U".
    @@arm_selection_functions[:embl] = lambda do |contig_name|
      fields = contig_name.split('_')
      if fields.size >= 3
        fields[2][0, 2]
      elsif fields.size == 2 && fields[0] == "v443"
        "3B"
      elsif fields.size == 1
        fields[0][0, 2]
      else
        "U"
      end
    end

    # Second '_'-separated field of the text that precedes the first ':'.
    @@arm_selection_functions[:morex] = lambda do |contig_name|
      contig_name.split(':')[0].split("_")[1]
    end

    # The contig name itself is the label.
    @@arm_selection_functions[:scaffold] = lambda do |contig_name|
      contig_name
    end

    # Returns the selection lambda registered under +name+, or nil when
    # there is none.
    #
    # A name of the form "<sep>,<index>" (exactly two comma-separated
    # parts) builds a lambda that splits the contig name on <sep> and
    # returns the field at <index>; that lambda is also stored in the
    # registry under the given name.
    def self.getArmSelection(name)
      parts = name.split(",")
      if parts.size == 2
        separator = parts[0]
        field = parts[1].to_i # parsed once instead of on every call
        @@arm_selection_functions[name.to_sym] = lambda do |contig_name|
          contig_name.split(separator)[field]
        end
      end
      @@arm_selection_functions[name.to_sym]
    end

    # Names accepted by getArmSelection, as Strings, preceded by the
    # placeholder that describes the custom "<sep>,<index>" form.
    def self.getValidFunctions
      names = @@arm_selection_functions.keys.map { |key| key.to_s }
      names.unshift "<sep>,<index>"
      names
    end

  end
end
@@ -0,0 +1,245 @@
1
+ #puts "Loading ExonCointainer..."
2
module Bio::PolyploidTools
  # Keeps the SNPs of a run grouped by gene (snp_map), together with the
  # FASTA/BAM resources used to print their aligned sequences and their
  # Primer3 input.
  class ExonContainer
    attr_reader :parental_1_sam, :parental_2_sam
    attr_reader :parental_1_name, :parental_2_name, :gene_models_db
    attr_reader :snp_map
    attr_reader :parents
    attr_accessor :flanking_size , :primer_3_min_seq_length, :max_hits

    BASES = [:A, :C, :G, :T]

    def initialize
      @parents = Hash.new
      @snp_map = Hash.new
      @primer_3_min_seq_length = 50
      @max_hits = 10
    end

    #Sets the reference file for the gene models and indexes it.
    def gene_models(path)
      @gene_models_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
      @gene_models_db.index
      @gene_models_path = path
    end

    #Returns the sequence for a region in the gene models (exon).
    #The end of +region+ is clipped (in place) to the length of its entry.
    def gene_model_sequence(region)
      target_reg = @gene_models_db.index.region_for_entry(region.entry)
      region.end = target_reg.length if region.end > target_reg.length

      seq = @gene_models_db.fetch_sequence(region)
      #This is a patch that we need to fix in biosamtools: drop everything
      #from the first '>' onwards. Slicing by length also covers a '>' at
      #index 0, where the old 0..-1 range returned the whole string.
      header_at = seq.index('>')
      seq = seq.slice(0, header_at) if header_at
      seq
    end

    #With a path: sets the reference file for the chromosomes.
    #Without arguments: returns the Hash filled by add_chromosome_arm (nil
    #until the first arm is added). The no-argument form restores the
    #reader that this method used to shadow.
    def chromosomes(path = nil)
      return @chromosomes if path.nil?
      @chromosomes_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
      @chromosomes_path = path
    end

    #Returns the sequence for a region in the chromosomes reference.
    #When the region starts before position 1, the start is moved to 1 (in
    #place) and the result is left-padded with one '-' per missing base.
    def chromosome_sequence(region)
      left_pad = 0
      #TODO: Padd if it goes to the right
      if region.start < 1
        left_pad = 1 - region.start
        region.start = 1
      end
      ("-" * left_pad) << @chromosomes_db.fetch_sequence(region)
    end

    #Registers a FASTA file under opts[:name].
    #opts[:alig_path] is used when present; opts[:reference_path] is only
    #a fallback (it used to be read and then unconditionally overwritten).
    def add_chromosome_arm(opts)
      @chromosomes ||= Hash.new
      path = opts[:alig_path] || opts[:reference_path]
      #Write to the ivar directly: the chromosomes method is also a setter.
      @chromosomes[opts[:name]] = Bio::DB::Fasta::FastaFile.new(fasta: path)
    end

    #Adds a SNP to the map, under its gene, after giving it the
    #container's max_hits.
    def add_snp(snp)
      snp.max_hits = self.max_hits
      @snp_map[snp.gene] = Array.new unless @snp_map[snp.gene]
      @snp_map[snp.gene] << snp
    end

    #Parses one SNP per line of +filename+ and stores those with a
    #positive position, tagged with the given chromosome, snp_in and
    #original_name.
    def add_snp_file(filename, chromosome, snp_in, original_name)
      File.open(filename) do |f|
        f.each_line do |line|
          snp = SNP.parse(line)
          snp.flanking_size = flanking_size
          next unless snp.position > 0

          snp.container = self
          snp.chromosome = chromosome
          snp.snp_in = snp_in
          snp.original_name = original_name
          @snp_map[snp.gene] = Array.new unless @snp_map[snp.gene]
          @snp_map[snp.gene] << snp
        end
      end
    end

    #Builds a FASTA-formatted String for a SNP: one record per parent with
    #its consensus over the covered region, then one record per entry of
    #snp.exon_list, left-padded with '-' up to the exon offset. The base
    #at the local SNP position is upper-cased in every record.
    def fasta_string_for_snp(snp)
      gene_region = snp.covered_region
      local_pos_in_gene = snp.local_position
      ret_str = ""
      @parents.each do |name, bam|
        ret_str << ">#{gene_region.id}_SNP-#{snp.position}_#{name} Overlapping_exons:#{gene_region.to_s} localSNPpo:#{local_pos_in_gene+1}\n"
        to_print = bam.consensus_with_ambiguities(region: gene_region).to_s
        to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
        ret_str << to_print << "\n"
      end

      snp.exon_list.each do |chromosome, exon|
        target_region = exon.target_region
        exon_start_offset = exon.query_region.start - gene_region.start
        chr_local_pos = local_pos_in_gene + target_region.start + 1
        ret_str << ">#{chromosome}_SNP-#{chr_local_pos} #{exon.to_s} #{target_region.orientation}\n"
        to_print = "-" * exon_start_offset
        to_print << chromosome_sequence(exon.target_region).to_s
        to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
        ret_str << to_print
      end
      ret_str
    end

    #Writes the aligned sequences of every SNP to +file+. A SNP that fails
    #is recorded in @missing_exons and reported on stderr; the loop goes on.
    def print_fasta_snp_exones(file)
      @missing_exons ||= Set.new
      @snp_map.each do |_gene, snp_array|
        snp_array.each do |snp|
          begin
            file.puts snp.aligned_sequences_fasta
          rescue Exception => e
            #Best-effort per SNP, but never swallow Ctrl-C or exit requests.
            raise if e.is_a?(SystemExit) || e.is_a?(SignalException)
            @missing_exons << snp.to_s
            $stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
            $stderr.puts "Local position: #{snp.local_position}"
            $stderr.puts "Parental sequences: #{snp.parental_sequences.to_s}"
            $stderr.puts e.backtrace
          end
        end
      end
    end

    #Writes the Primer3 input of every SNP to +file+ and returns how many
    #non-empty records were written. Each SNP's own chromosome is passed
    #to primer_3_string; +target_chromosome+ is accepted but not read.
    #A SNP that fails is recorded in @missing_exons and reported on stderr.
    def print_primer_3_exons(file, target_chromosome, parental, max_specific_primers: 20)
      #May be the first print method called, so the set may not exist yet.
      @missing_exons ||= Set.new
      added = 0

      @snp_map.each do |_gene, snp_array|
        snp_array.each do |snp|
          begin
            string = snp.primer_3_string(snp.chromosome, parental, max_specific_primers: max_specific_primers)
            #TODO: add tan error to the SNP this snp has more than max_hits.
            #Or maybe inside the SNP file.
            if string.size > 0
              file.puts string
              added += 1
            end
          rescue Exception => e
            #Best-effort per SNP, but never swallow Ctrl-C or exit requests.
            raise if e.is_a?(SystemExit) || e.is_a?(SignalException)
            @missing_exons << snp.to_s
            $stderr.puts "print_primer_3_exons: #{e.to_s} : #{snp.to_s}"
            $stderr.puts e.backtrace
          end
        end
      end
      added
    end

    #Reads custom exonerate lines from opts[:exonerate_file] and, for every
    #record with identity >= opts[:min_identity] (default 90), attaches the
    #exon covering each SNP of the record's query to that SNP.
    #opts[:arm_selection] maps a target id to a chromosome label; it
    #defaults to the first three characters of the id. opts[:filter_best]
    #(default false) is forwarded to SNP#add_exon.
    #Finishes by calling remove_alignments_over_max_hits.
    def add_alignments(opts=Hash.new)
      opts = { :min_identity=>90, filter_best:false }.merge!(opts)
      exonerate_filename = opts[:exonerate_file]
      filter_best = opts[:filter_best]
      arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0, 3] }

      File.open(exonerate_filename) do |f|
        f.each_line do |line|
          record = Bio::DB::Exonerate::Alignment.parse_custom(line)
          next unless record && record.identity >= opts[:min_identity]

          snp_array = @snp_map[record.query_id]
          next if snp_array.nil?

          snp_array.each do |snp|
            next if snp.nil?
            next unless snp.position.between?(record.query_start + 1, record.query_end)
            begin
              exon = record.exon_on_gene_position(snp.position)
              snp.add_exon(exon, arm_selection.call(record.target_id), filter_best:filter_best)
            rescue Bio::DB::Exonerate::ExonerateException
              $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
            end
          end
        end
      end
      remove_alignments_over_max_hits
    end

    #Stores on each SNP the total number of hits in its exon_list; SNPs
    #with more than max_hits hits get an empty exon_list, are flagged as
    #repetitive and receive an error message.
    def remove_alignments_over_max_hits
      @snp_map.each_pair do |_gene, snp_array|
        snp_array.each do |snp|
          total_hits = snp.exon_list.map { |e| e[1].size }.reduce(0, :+)
          snp.hit_count = total_hits
          next unless total_hits > max_hits

          snp.exon_list = {}
          snp.repetitive = true
          snp.errors << "The marker is in a repetitive region (#{total_hits} hits to reference)"
        end
      end
    end

    #Registers a parent. opts[:path] is the BAM file (optional) and
    #opts[:name] the key; without a name the key is the BAM basename, or
    #"Unknown" when there is no path either. Returns the Bio::DB::Sam
    #object (nil without a path).
    def add_parental(opts=Hash.new)
      sam = nil
      name = opts[:name] ? opts[:name] : "Unknown"
      if opts[:path]
        path = opts[:path]
        unless opts[:name]
          #String has no #basename; keep Pathname behaviour when available.
          name = path.respond_to?(:basename) ? path.basename(".bam") : File.basename(path, ".bam")
        end
        sam = Bio::DB::Sam.new({:fasta=>@gene_models_path, :bam=>opts[:path]})
      end
      @parents[name] = sam
    end
  end

end
@@ -0,0 +1,175 @@
1
+ module Bio::PolyploidTools
2
class Marker
  include Comparable
  #include Virgola
  attr_reader :template_sequence, :original, :snp
  attr_accessor :best_hit
  attr_accessor :index_90k
  attr_accessor :snp_id
  attr_accessor :snp_name
  attr_accessor :chr
  attr_accessor :coordinates_chr
  attr_accessor :map_order
  attr_accessor :chr_arm
  attr_accessor :distance_cm
  attr_accessor :sequence
  attr_writer :contig

  #after_map :parse_sequence_snp

  # FASTA record: the SNP name as header, the template sequence as body.
  def to_fasta
    ">#{self.snp_name}\n#{self.template_sequence}"
  end

  # Target id of the best hit when there is one (it replaces any stored
  # value); otherwise whatever was parsed or assigned.
  def contig
    @contig = best_hit.target_id.chomp if best_hit
    @contig
  end

  # The marker as one CSV line, in the same column order that #initialize
  # reads, with the contig as last column.
  def to_csv
    "#{index_90k},#{snp_id},#{snp_name},#{chr},#{coordinates_chr},#{map_order},#{chr_arm},#{distance_cm},#{sequence},#{contig}"
  end

  # Markers with the same name are equal; otherwise they are ordered by
  # chromosome arm, then by coordinate, with the name breaking ties. The
  # fields are compared as stored (Strings when parsed from a line).
  def <=>(other)
    return 0 if other.snp_name == @snp_name
    return @chr_arm <=> other.chr_arm if other.chr_arm != @chr_arm
    return @snp_name <=> other.snp_name if other.coordinates_chr == @coordinates_chr
    @coordinates_chr <=> other.coordinates_chr
  end

  # Builds a marker from one comma-separated line with the columns:
  # INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE
  # plus an optional contig column. The line is not modified (it used to
  # be chomped in place, which failed on frozen strings).
  def initialize(line)
    fields = line.chomp.split(',')
    @template_sequence = nil
    @index_90k, @snp_id, @snp_name, @chr, @coordinates_chr, @map_order, @chr_arm, @distance_cm, @sequence, @contig = fields
    parse_sequence_snp
  end

  # Yields a Marker for every line of +filename+ whose sequence holds a
  # SNP in the "[A/G]" notation. Without a block an Enumerator over the
  # same markers is returned. The file is closed once reading finishes.
  def self.parse(filename)
    return enum_for(:parse, filename) unless block_given?

    File.foreach(filename) do |line|
      marker = Marker.new(line)
      yield marker if marker.template_sequence
    end
  end

  protected

  # Upper-cases the chromosome and, when the sequence looks like
  # "<pre>[X/Y]<post>", stores the two alleles, the 1-based SNP position
  # and the template sequence with the IUPAC code in place of the
  # brackets. Returns the template sequence, or nil when there is no SNP.
  def parse_sequence_snp
    # Blank or short lines leave @chr nil; upper-case only when present.
    @chr = @chr.upcase if @chr
    match_data = sequence && /(?<pre>\w*)\[(?<org>[ACGT])\/(?<snp>[ACGT])\](?<pos>\w*)/.match(sequence)
    return nil unless match_data

    @position = match_data[:pre].size + 1
    @original = match_data[:org]
    @snp = match_data[:snp]
    amb_base = Bio::NucleicAcid.to_IUAPC("#{@original}#{@snp}")
    @template_sequence = "#{match_data[:pre]}#{amb_base}#{match_data[:pos]}"
  end
end
75
+
76
+
77
+ #The map has to come sorted.
78
class ArmMap
  attr_reader :markers
  attr_accessor :chromosome

  def initialize
    @markers = Hash.new
  end

  # Aligns the markers FASTA (written by print_fasta_markers) against the
  # current reference with blat, writing the report to +output+, and keeps
  # for each marker the hit with the highest score.
  def align_markers(output)
    Bio::Blat.align(@reference.fasta_path, @fasta_markers, output) do |hit|
      marker = markers[hit.query_id]
      unless marker
        # A hit whose query id is not a known marker used to crash on nil.
        $stderr.puts "align_markers: no marker named #{hit.query_id}"
        next
      end
      current = marker.best_hit
      marker.best_hit = hit if current.nil? || hit.score > current.score
    end
  end

  # Writes to +contigs_file+ one FASTA record per distinct contig that is
  # the best hit of some marker, fetched in full from the current
  # reference.
  def print_fasta_contigs_for_markers(contigs_file)
    contigs = Set.new
    markers.each do |_name, marker|
      contigs << marker.best_hit.target_id if marker.best_hit
    end

    # Block form: the file is closed even when a fetch raises.
    File.open(contigs_file, "w") do |fasta|
      contigs.each do |contig_id|
        reg = @reference.index.region_for_entry(contig_id)
        fasta.puts ">#{contig_id}\n#{@reference.fetch_sequence(reg.get_full_region)}"
      end
    end
  end

  # Writes every marker as FASTA to +filename+ and remembers the path for
  # align_markers.
  def print_fasta_markers(filename)
    @fasta_markers = filename
    File.open(filename, "w") do |fasta|
      markers.each do |_name, marker|
        fasta.puts marker.to_fasta
      end
    end
  end

  # With a path: opens it as the global reference and loads its fai
  # entries. Without arguments: returns the current global reference (the
  # reader this method used to shadow).
  def global_reference(reference = nil)
    return @global_reference if reference.nil?
    @global_reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
    @global_reference.load_fai_entries
  end

  # With a path: opens it as the reference and loads its fai entries.
  # Without arguments: returns the current reference (the reader this
  # method used to shadow).
  def reference(reference = nil)
    return @reference if reference.nil?
    @reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
    @reference.load_fai_entries
  end

  # Makes +filename+ the reference. When the file does not exist yet it is
  # first created with the global-reference entries whose name maps (see
  # arm_selection_embl) to this map's chromosome.
  def print_fasta_contigs_from_reference(filename)
    if File.exist?(filename)
      reference(filename)
      return
    end

    File.open(filename, "w") do |fasta|
      Bio::FlatFile.auto(@global_reference.fasta_path) do |ff|
        ff.each do |f|
          chr_reg = arm_selection_embl(f.entry_id)
          fasta.puts f.entry if chr_reg && chr_reg == chromosome
        end
      end
    end
    reference(filename)
  end

  # Writes the markers as CSV lines to +filename+, sorted by map_order as
  # stored (Strings when the markers were parsed from text).
  def print_map_with_contigs(filename)
    File.open(filename, "w") do |file|
      markers.values.sort { |x, y| x.map_order <=> y.map_order }.each do |marker|
        file.puts marker.to_csv
      end
    end
  end

  protected

  # First two characters of the third '_'-separated field of the contig
  # name ("IWGSC_CSS_1AL_scaff_110" -> "1A"); nil when the name has fewer
  # than three fields.
  def arm_selection_embl(contig_name)
    arm_field = contig_name.split('_')[2]
    arm_field && arm_field[0, 2]
  end
end
175
+ end