bio-polymarker 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,153 @@
1
+
2
+
3
# Extension of Bio::Blat that shells out to the external `blat` executable.
class Bio::Blat
  # Runs `blat <database> <query> <output>` and parses the report written to
  # +output+.
  #
  # database - value placed in the command line as blat's first argument.
  # query    - value placed in the command line as blat's second argument.
  # output   - path blat writes to; it is re-opened here to read the hits.
  #
  # With a block, every hit is yielded and nil is returned. Without a block,
  # an Array with all the hits is returned.
  #
  # Raises Exception if the command exits with a non-zero status.
  def self.align(database, query, output)
    # NOTE(review): the arguments are interpolated unescaped into a shell
    # command line; paths containing spaces or shell metacharacters will
    # break (or alter) the command. Confirm callers only pass trusted paths.
    cmdline = "blat #{database} #{query} #{output}"
    # Log the command to stderr only; stdout is left clean for results.
    $stderr.puts cmdline
    status, _stdout, stderr = systemu cmdline

    unless status.exitstatus == 0
      raise Exception.new(), "Error running blat. Command line was '#{cmdline}'\nBlat STDERR was:\n#{stderr}"
    end

    alns = block_given? ? nil : []
    blat_aln = Bio::Blat::Report.new(Bio::FlatFile.open(output).to_io)
    blat_aln.each_hit do |hit|
      if block_given?
        yield hit
      else
        alns << hit
      end
    end
    alns
  end
end
25
+
26
# Helpers added to BLAT hits to read wheat chromosome information out of the
# target identifier.
class Bio::Blat::Report::Hit

  # Third underscore-separated field of the target id, e.g. "1AL" for
  # "IWGSC_CSS_1AL_scaff_110". The value is cached after the first lookup.
  # Returns nil when the target id has fewer than three fields.
  def wheat_chr_arm
    @wheat_chr_arm ||= target_id.split('_')[2]
  end

  # First two characters of the arm field (e.g. "1A"), or nil when the arm
  # field is missing so that callers can test for its presence.
  def wheat_chr
    arm = wheat_chr_arm
    arm ? arm[0, 2] : nil
  end

  # First character of the arm field (e.g. "1").
  #
  # Raises Exception when no chromosome can be read from the target id.
  def wheat_chr_group
    raise Exception.new(), "No wheat group for #{target_id} #{self.inspect}" unless wheat_chr
    wheat_chr_arm[0]
  end

  # Second character of the arm field (e.g. "A").
  def wheat_genome
    wheat_chr_arm[1]
  end

  # Third character of the arm field (e.g. "L").
  def wheat_arm
    wheat_chr_arm[2]
  end

  # Matches plus mismatches as a percentage of the query length.
  def percentage_covered
    (match + mismatch) * 100.0 / query_len.to_f
  end

end
56
+
57
+
58
class Hash
  # Flattens the hash into a single string.
  #
  # keyvaldelim - separator placed between each key and its value.
  # entrydelim  - separator placed between consecutive entries.
  #
  # Both default to the global output field separator ($,).
  def join(keyvaldelim = $,, entrydelim = $,)
    pairs = each_pair.map { |key, value| [key, value].join(keyvaldelim) }
    pairs.join(entrydelim)
  end
end
63
+
64
+
65
# IUPAC ambiguity-code helpers added to Bio::NucleicAcid.
class Bio::NucleicAcid

  # Two-way lookup table: an ambiguity code maps to the (sorted, lowercase)
  # bases it stands for, and a sorted lowercase set of bases maps back to its
  # ambiguity code.
  IUPAC_CODES ||= {

    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',

    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',

    'n' => 'acgt',

    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u',

    'ct' => 'y',
    'ag' => 'r',
    'at' => 'w',
    'cg' => 's',
    'gt' => 'k',
    'ac' => 'm',

    'cgt' => 'b',
    'agt' => 'd',
    'act' => 'h',
    'acg' => 'v',

    'acgt' => 'n'
  }

  # Returns a truthy MatchData when +base+ occurs in "acgtACGT", nil
  # otherwise. The base is escaped so that characters such as ".", "*" or "["
  # are compared literally instead of being interpreted as a regular
  # expression.
  def self.is_unambiguous(base)
    "acgtACGT".match(Regexp.escape(base))
  end

  # Returns the uppercase IUPAC code for the set of bases in +bases+
  # (e.g. "AG" -> "R"). Case, order and duplicates are ignored.
  # Unknown combinations are reported on stderr and mapped to "N".
  def self.to_IUAPC(bases)
    base = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
    if base.nil?
      # Report the offending input (the lookup result is nil at this point).
      $stderr.puts "Invalid base! #{bases}"
      base = 'n' #This is a patch... as one of the scripts failed here.
    end
    base.upcase
  end

  # True when +base+ is one of the characters listed for +code+ in
  # IUPAC_CODES. Returns false for codes that are not in the table.
  def self.is_valid(code, base)
    expansion = IUPAC_CODES[code.downcase]
    return false unless expansion
    expansion.chars.include? base.downcase
  end

end
123
+
124
# Extension of Bio::Sequence to count SNPs between two sequences. The
# sequences are assumed to be already aligned; no check is made that a base in
# the first sequence is valid in the second.
class Bio::Sequence
  # Number of positions of +seq1+ whose element differs from the element at
  # the same index in +seq2+. Only the indexes of +seq1+ are visited.
  def self.snps_between(seq1, seq2)
    (0...seq1.size).count { |idx| seq1[idx] != seq2[idx] }
  end
end
136
+
137
class String
  # Counts how many characters of the string are not accepted by
  # Bio::NucleicAcid.is_unambiguous, i.e. how many ambiguity codes (or other
  # non-ACGT characters) a nucleic acid string contains.
  def count_ambiguities
    each_char.count { |base| !Bio::NucleicAcid.is_unambiguous(base) }
  end

  # Counts how many characters are uppercase ASCII letters (A-Z), wherever
  # they appear in the string.
  def upper_case_count
    count("A-Z")
  end
end
@@ -0,0 +1,63 @@
1
+ module Bio::PolyploidTools
2
# Registry of named functions that extract a chromosome (or chromosome arm)
# label from a contig/scaffold name. Every function takes the contig name and
# returns the label; none of them modifies the name it receives.
class ChromosomeArm

  @@arm_selection_functions = Hash.new

  #example format: chr2A
  # Two characters starting at index 3 of the name.
  @@arm_selection_functions[:nrgene] = lambda do |contig_name|
    contig_name[3, 2]
  end

  # First two characters once every "chr" occurrence is removed. A copy is
  # used so the caller's string (often a record identifier that is reused
  # later) is left untouched and frozen strings are accepted.
  @@arm_selection_functions[:first_two] = lambda do |contig_name|
    contig_name.gsub(/chr/, "")[0, 2]
  end

  #Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
  #Or the first two characters in the contig name, to deal with
  #pseudomolecules that start with headers like: "1A"
  #And with the cases when 3B is named with the prefix: v443
  # Returns "U" when none of those formats applies.
  @@arm_selection_functions[:embl] = lambda do |contig_name|
    arr = contig_name.split('_')
    ret = "U"
    ret = arr[2][0, 2] if arr.size >= 3
    ret = "3B" if arr.size == 2 && arr[0] == "v443"
    ret = arr[0][0, 2] if arr.size == 1
    ret
  end

  # Second underscore-separated field of the text before the first ":".
  @@arm_selection_functions[:morex] = lambda do |contig_name|
    contig_name.split(':')[0].split("_")[1]
  end

  # The contig name itself is the label.
  @@arm_selection_functions[:scaffold] = lambda do |contig_name|
    contig_name
  end

  # Looks up a selection function by name.
  #
  # name - String or Symbol. Either the name of a registered function, or a
  #        "<sep>,<index>" pair, in which case a function that splits the
  #        contig name by <sep> and returns field <index> is created and
  #        registered under that name.
  #
  # Returns the lambda, or nil when the name is unknown.
  def self.getArmSelection(name)
    key = name.to_s
    arr = key.split(",")
    if arr.size == 2
      separator = arr[0]
      field = arr[1].to_i
      @@arm_selection_functions[key.to_sym] = lambda do |contig_name|
        contig_name.split(separator)[field]
      end
    end
    @@arm_selection_functions[key.to_sym]
  end

  # Names of the registered functions as Strings, preceded by the
  # "<sep>,<index>" placeholder that describes the custom form.
  def self.getValidFunctions
    tmp = @@arm_selection_functions.keys.map { |e| e.to_s }
    tmp.unshift "<sep>,<index>"
    tmp
  end

end
63
+ end
@@ -0,0 +1,245 @@
1
+ #puts "Loading ExonCointainer..."
2
+ module Bio::PolyploidTools
3
# Holds the SNPs of a run grouped by gene, together with the FASTA/BAM
# resources needed to print their aligned sequences and Primer3 input.
class ExonContainer
  attr_reader :parental_1_sam, :parental_2_sam
  attr_reader :parental_1_name, :parental_2_name, :gene_models_db
  attr_reader :snp_map
  attr_reader :parents
  attr_accessor :flanking_size, :primer_3_min_seq_length, :max_hits

  BASES = [:A, :C, :G, :T]

  def initialize
    @parents = Hash.new
    @snp_map = Hash.new
    @primer_3_min_seq_length = 50
    @max_hits = 10
  end

  #Sets the reference file for the gene models
  def gene_models(path)
    @gene_models_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
    @gene_models_db.index
    @gene_models_path = path
  end

  #Returns the sequence for a region in the gene models (exon)
  # The region's end is clamped (in place) to the length of its entry.
  def gene_model_sequence(region)
    target_reg = @gene_models_db.index.region_for_entry(region.entry)
    region.end = target_reg.length if region.end > target_reg.length

    seq = @gene_models_db.fetch_sequence(region)
    #This is a patch that we need to fix in biosamtools:
    # anything from the first '>' on is dropped. Slicing by length keeps the
    # result empty when '>' is the very first character.
    index = seq.index('>')
    seq = seq.slice(0, index) if index
    seq
  end

  # With a +path+: sets the reference FASTA file used by chromosome_sequence.
  # Without arguments: returns the Hash filled by add_chromosome_arm (nil if
  # no arm has been added yet).
  def chromosomes(path = nil)
    return @chromosomes if path.nil?
    @chromosomes_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
    @chromosomes_path = path
  end

  #Returns the sequence for a region in the chromosomes reference.
  # When the region starts before position 1 the start is moved to 1 (in
  # place) and the result is left-padded with one "-" per missing position.
  def chromosome_sequence(region)
    left_pad = 0
    #TODO: Padd if it goes to the right
    if region.start < 1
      left_pad = region.start * -1
      left_pad += 1
      region.start = 1
    end
    ("-" * left_pad) << @chromosomes_db.fetch_sequence(region)
  end

  # Registers the FASTA file of a chromosome arm.
  #
  # opts - Hash with :name and the file in :alig_path; :reference_path is
  #        used when :alig_path is not given.
  def add_chromosome_arm(opts)
    @chromosomes ||= Hash.new
    name = opts[:name]
    path = opts[:alig_path] || opts[:reference_path]
    @chromosomes[name] = Bio::DB::Fasta::FastaFile.new(fasta: path)
  end

  # Adds a SNP under its gene, after giving it this container's max_hits.
  def add_snp(snp)
    snp.max_hits = self.max_hits
    @snp_map[snp.gene] = Array.new unless @snp_map[snp.gene]
    @snp_map[snp.gene] << snp
  end

  # Parses one SNP per line of +filename+ and stores those with a positive
  # position, tagged with the given chromosome, snp_in and original_name.
  def add_snp_file(filename, chromosome, snp_in, original_name)
    File.open(filename) do |f|
      f.each_line do |line|
        snp = SNP.parse(line)
        snp.flanking_size = flanking_size
        next unless snp.position > 0
        snp.container = self
        snp.chromosome = chromosome
        snp.snp_in = snp_in
        snp.original_name = original_name
        @snp_map[snp.gene] = Array.new unless @snp_map[snp.gene]
        @snp_map[snp.gene] << snp
      end
    end
  end

  # Builds a FASTA string with the consensus of every parent over the region
  # covered by the SNP, followed by the chromosome sequence of every exon in
  # the SNP's exon list. The base at the SNP's local position is upcased.
  def fasta_string_for_snp(snp)
    gene_region = snp.covered_region
    local_pos_in_gene = snp.local_position
    ret_str = ""
    @parents.each do |name, bam|
      ret_str << ">#{gene_region.id}_SNP-#{snp.position}_#{name} Overlapping_exons:#{gene_region.to_s} localSNPpo:#{local_pos_in_gene+1}\n"
      to_print = bam.consensus_with_ambiguities(region: gene_region).to_s
      to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
      ret_str << to_print << "\n"
    end

    snp.exon_list.each do |chromosome, exon|
      target_region = exon.target_region
      exon_start_offset = exon.query_region.start - gene_region.start
      chr_local_pos = local_pos_in_gene + target_region.start + 1
      ret_str << ">#{chromosome}_SNP-#{chr_local_pos} #{exon.to_s} #{target_region.orientation}\n"
      to_print = "-" * exon_start_offset
      to_print << chromosome_sequence(exon.target_region).to_s
      # NOTE(review): the upcased index does not include exon_start_offset
      # (the padding added above) and no newline follows the sequence; both
      # are kept as they were - confirm whether that is intended.
      to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
      ret_str << to_print
    end
    ret_str
  end

  # Writes the aligned sequences of every SNP to +file+. SNPs that fail are
  # recorded in @missing_exons and reported on stderr; processing continues.
  def print_fasta_snp_exones(file)
    @missing_exons ||= Set.new
    @snp_map.each do |_gene, snp_array|
      snp_array.each do |snp|
        begin
          file.puts snp.aligned_sequences_fasta
        rescue Exception => e
          # Exception is rescued on purpose (code in this project raises bare
          # Exception), but interrupts and exit requests must get through.
          raise if e.is_a?(SystemExit) || e.is_a?(SignalException)
          @missing_exons << snp.to_s
          $stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
          $stderr.puts "Local position: #{snp.local_position}"
          $stderr.puts "Local position: #{snp.parental_sequences.to_s}"
          $stderr.puts e.backtrace
        end
      end
    end
  end

  # Writes the Primer3 input of every SNP to +file+ and returns how many
  # non-empty records were written. SNPs that fail are recorded in
  # @missing_exons and reported on stderr; processing continues.
  #
  # target_chromosome is accepted for interface compatibility; each SNP's own
  # chromosome is the one passed to primer_3_string.
  def print_primer_3_exons(file, target_chromosome, parental, max_specific_primers: 20)
    @missing_exons ||= Set.new
    added = 0

    @snp_map.each do |_gene, snp_array|
      snp_array.each do |snp|
        begin
          string = snp.primer_3_string(snp.chromosome, parental, max_specific_primers: max_specific_primers)
          #TODO: add tan error to the SNP this snp has more than max_hits.
          #Or maybe inside the SNP file.
          if string.size > 0
            file.puts string
            added += 1
          end
        rescue Exception => e
          raise if e.is_a?(SystemExit) || e.is_a?(SignalException)
          @missing_exons << snp.to_s
          $stderr.puts "print_primer_3_exons: #{e.to_s} : #{snp.to_s}"
          $stderr.puts e.backtrace
        end
      end
    end
    added
  end

  # Reads exonerate alignments and attaches to every stored SNP the exon that
  # covers its position.
  #
  # opts - Hash:
  #        :exonerate_file - path of the alignment file (one record per line).
  #        :min_identity   - records below this identity are skipped (90).
  #        :arm_selection  - callable mapping a target id to a chromosome
  #                          label (default: first three characters).
  #        :filter_best    - forwarded to snp.add_exon (false).
  #
  # SNPs with more hits than max_hits are flagged afterwards.
  def add_alignments(opts = Hash.new)
    opts = { :min_identity => 90, filter_best: false }.merge(opts)
    exonerate_filename = opts[:exonerate_file]
    filter_best = opts[:filter_best]
    arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0, 3] }

    File.open(exonerate_filename) do |f|
      f.each_line do |line|
        record = Bio::DB::Exonerate::Alignment.parse_custom(line)
        next unless record && record.identity >= opts[:min_identity]
        snp_array = @snp_map[record.query_id]
        next if snp_array.nil?

        snp_array.each do |snp|
          next if snp.nil?
          next unless snp.position.between?(record.query_start + 1, record.query_end)
          begin
            exon = record.exon_on_gene_position(snp.position)
            snp.add_exon(exon, arm_selection.call(record.target_id), filter_best: filter_best)
          rescue Bio::DB::Exonerate::ExonerateException
            $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
          end
        end
      end
    end
    remove_alignments_over_max_hits
  end

  # Stores the hit count of every SNP and, for those above max_hits, clears
  # the exon list, marks them as repetitive and records an error message.
  def remove_alignments_over_max_hits
    @snp_map.each_pair do |_gene, snp_array|
      snp_array.each do |snp|
        total_hits = snp.exon_list.map { |e| e[1].size }.reduce(0, :+)
        snp.hit_count = total_hits
        next unless total_hits > max_hits
        snp.exon_list = {}
        snp.repetitive = true
        snp.errors << "The marker is in a repetitive region (#{total_hits} hits to reference)"
      end
    end
  end

  # Registers a parental line.
  #
  # opts - Hash with :name and/or :path (BAM file). Without :name the BAM
  #        base name (minus ".bam") is used, or "Unknown" when there is no
  #        path either. Without :path the parent is stored with a nil value.
  def add_parental(opts = Hash.new)
    sam = nil
    name = opts[:name] || "Unknown"
    if opts[:path]
      path = opts[:path]
      unless opts[:name]
        # Plain String paths have no #basename; Pathname-like objects do.
        name = path.respond_to?(:basename) ? path.basename(".bam") : File.basename(path, ".bam")
      end
      sam = Bio::DB::Sam.new({ :fasta => @gene_models_path, :bam => path })
    end
    @parents[name] = sam
  end
end
244
+
245
+ end
@@ -0,0 +1,175 @@
1
+ module Bio::PolyploidTools
2
# One marker of a genetic map, built from a comma-separated line with the
# columns:
# INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE[,CONTIG]
class Marker
  include Comparable
  attr_reader :template_sequence, :original, :snp
  attr_accessor :best_hit
  attr_accessor :index_90k
  attr_accessor :snp_id
  attr_accessor :snp_name
  attr_accessor :chr
  attr_accessor :coordinates_chr
  attr_accessor :map_order
  attr_accessor :chr_arm
  attr_accessor :distance_cm
  attr_accessor :sequence
  attr_writer :contig

  # FASTA record with the SNP name as header and the template sequence
  # (SNP replaced by its ambiguity code) as body.
  def to_fasta
    ">#{self.snp_name}\n#{self.template_sequence}"
  end

  # Contig of the marker: the target of the best hit when there is one,
  # otherwise the value read from the input line (or set by the writer).
  def contig
    @contig = best_hit.target_id.chomp if best_hit
    @contig
  end

  # The marker as a comma-separated line, in the input column order.
  def to_csv
    "#{index_90k},#{snp_id},#{snp_name},#{chr},#{coordinates_chr},#{map_order},#{chr_arm},#{distance_cm},#{sequence},#{contig}"
  end

  # Markers with the same name are equal; otherwise they are ordered by
  # chromosome arm, then coordinate, then name.
  # NOTE(review): the fields are compared as read from the line, so
  # coordinates are ordered as text ("10" sorts before "9") - confirm.
  def <=>(anOter)
    return 0 if anOter.snp_name == @snp_name
    return @chr_arm <=> anOter.chr_arm if anOter.chr_arm != @chr_arm
    return @snp_name <=> anOter.snp_name if anOter.coordinates_chr == @coordinates_chr
    @coordinates_chr <=> anOter.coordinates_chr
  end

  # Builds a marker from one line of the map file. The line itself is not
  # modified, so frozen strings are accepted. template_sequence stays nil
  # when the SEQUENCE column does not contain a "[X/Y]" SNP.
  def initialize(line)
    @template_sequence = nil
    #INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE
    @index_90k, @snp_id, @snp_name, @chr, @coordinates_chr, @map_order, @chr_arm, @distance_cm, @sequence, @contig = line.chomp.split(',')
    parse_sequence_snp
  end

  # Reads +filename+ line by line and yields a Marker for every line whose
  # sequence could be parsed. The file is closed when the iteration ends.
  def self.parse(filename)
    File.foreach(filename) do |line|
      m = new(line)
      yield m if m.template_sequence
    end
  end

  protected

  # Splits the sequence "<pre>[<org>/<snp>]<post>" into its parts and builds
  # the template sequence with the SNP replaced by its IUPAC ambiguity code.
  # Returns the template sequence, or nil when the sequence does not match.
  def parse_sequence_snp
    # Short or blank lines leave @chr unset; only normalize when present.
    @chr = @chr.upcase if @chr
    match_data = /(?<pre>\w*)\[(?<org>[ACGT])\/(?<snp>[ACGT])\](?<pos>\w*)/.match(sequence)
    return nil unless match_data

    @position = match_data[:pre].size + 1
    @original = match_data[:org]
    @snp = match_data[:snp]
    amb_base = Bio::NucleicAcid.to_IUAPC("#{@original}#{@snp}")
    @template_sequence = "#{match_data[:pre]}#{amb_base}#{match_data[:pos]}"
  end
end
75
+
76
+
77
+ #The map hast to come sorted.
78
+ class ArmMap
79
+ attr_reader :markers , :global_reference, :reference
80
+ attr_accessor :chromosome
81
+ def initialize
82
+ @markers = Hash.new
83
+ end
84
+
85
+ def align_markers(output)
86
+ Bio::Blat.align(@reference.fasta_path, @fasta_markers, output) do |hit|
87
+ marker = markers[hit.query_id]
88
+ best = marker.best_hit
89
+ unless marker.best_hit
90
+ markers[hit.query_id].best_hit = hit
91
+ else
92
+ marker.best_hit = hit if hit.score > marker.best_hit.score
93
+ end
94
+ end
95
+ end
96
+
97
+ def print_fasta_contigs_for_markers(contigs_file)
98
+
99
+ contigs = Set.new
100
+ markers.each do |k, marker|
101
+
102
+ if marker.best_hit
103
+ contigs << marker.best_hit.target_id
104
+ end
105
+ end
106
+
107
+ fasta=File.open(contigs_file, "w")
108
+ contigs.each do |contig_id|
109
+ reg = @reference.index.region_for_entry(contig_id)
110
+ fasta.puts ">#{contig_id}\n#{@reference.fetch_sequence(reg.get_full_region)}"
111
+ end
112
+ fasta.close
113
+ end
114
+
115
+
116
+
117
+
118
+ def print_fasta_markers(filename)
119
+ @fasta_markers = filename
120
+ fasta=File.open(filename, "w")
121
+
122
+ markers.each do |k, marker|
123
+ fasta.puts marker.to_fasta
124
+ end
125
+ fasta.close
126
+ end
127
+
128
+ def global_reference(reference)
129
+ @global_reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
130
+ @global_reference.load_fai_entries
131
+ end
132
+
133
+ def reference(reference)
134
+ @reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
135
+ @reference.load_fai_entries
136
+ end
137
+
138
+ def print_fasta_contigs_from_reference(filename)
139
+ if File.exist?(filename)
140
+ reference(filename)
141
+ return
142
+ end
143
+
144
+ #puts "loaded"
145
+
146
+ fasta=File.open(filename, "w")
147
+
148
+ Bio::FlatFile.auto( @global_reference.fasta_path) do |ff|
149
+ ff.each do |f|
150
+ chr_reg = arm_selection_embl(f.entry_id)
151
+ if chr_reg == chromosome
152
+ fasta.puts f.entry
153
+ end
154
+ end
155
+ end
156
+ fasta.close
157
+ reference(filename)
158
+ end
159
+
160
+
161
+ def print_map_with_contigs(filename)
162
+ file = File.open(filename, "w")
163
+ markers.values.sort { |x,y| x.map_order <=> y.map_order }.each do | marker |
164
+ file.puts marker.to_csv
165
+ end
166
+ file.close
167
+ end
168
+
169
+ protected
170
+ def arm_selection_embl(contig_name)
171
+ ret = contig_name.split('_')[2][0,2]
172
+ return ret
173
+ end
174
+ end
175
+ end