bio-polymarker 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,153 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
class Bio::Blat
  # Runs the external +blat+ program and iterates over the hits in its report.
  #
  # database:: path handed to blat as the database argument.
  # query::    path handed to blat as the query argument.
  # output::   path where blat writes its report; it is parsed afterwards.
  #
  # With a block, every hit is yielded and nil is returned. Without a block,
  # an Array with all the hits is returned.
  #
  # Raises Exception when blat finishes with a non-zero exit status.
  #
  # NOTE(review): the three arguments are interpolated verbatim into the
  # command line, so paths with spaces or shell metacharacters are not safe.
  def self.align(database , query , output)
    cmdline = "blat #{database} #{query} #{output}"
    # Log the command on stderr only; stdout is left untouched.
    $stderr.puts cmdline
    status, _stdout, stderr = systemu cmdline
    unless status.exitstatus == 0
      raise Exception, "Error running blat. Command line was '#{cmdline}'\nBlat STDERR was:\n#{stderr}"
    end

    alns = Array.new unless block_given?
    blat_aln = Bio::Blat::Report.new(Bio::FlatFile.open(output).to_io)
    blat_aln.each_hit() do |hit|
      if block_given?
        yield hit
      else
        alns << hit
      end
    end
    return alns unless block_given?
  end
end
|
25
|
+
|
26
|
+
class Bio::Blat::Report::Hit

  # Third '_'-separated field of the target id, e.g. "1AL" for
  # "IWGSC_CSS_1AL_scaff_110". The value is cached after the first call.
  def wheat_chr_arm
    @wheat_chr_arm ||= target_id.split('_')[2]
  end

  # First two characters of the arm field (e.g. "1A").
  def wheat_chr
    wheat_chr_arm[0,2]
  end

  # First character of the arm field (e.g. "1").
  # Raises Exception when #wheat_chr returns a falsy value.
  def wheat_chr_group
    raise Exception, "No wheat group for #{target_id} #{self.inspect}" unless wheat_chr
    wheat_chr_arm[0]
  end

  # Second character of the arm field (e.g. "A").
  def wheat_genome
    wheat_chr_arm[1]
  end

  # Third character of the arm field (e.g. "L").
  def wheat_arm
    wheat_chr_arm[2]
  end

  # Matches plus mismatches, as a percentage of the query length.
  def percentage_covered
    ( match + mismatch ) * 100.0 / query_len.to_f
  end

end
|
56
|
+
|
57
|
+
|
58
|
+
class Hash
  # Flattens the hash into a single string: each key/value pair is joined
  # with +keyvaldelim+ and the resulting entries are joined with +entrydelim+.
  # Both delimiters default to the global output field separator ($,).
  def join(keyvaldelim=$,, entrydelim=$,)
    entries = each_pair.map do |key, value|
      [key, value].join(keyvaldelim)
    end
    entries.join(entrydelim)
  end
end
|
63
|
+
|
64
|
+
|
65
|
+
class Bio::NucleicAcid

  # Two-way lookup table: ambiguity code => sorted set of bases it stands
  # for, and sorted set of bases => ambiguity code. All entries are lowercase.
  IUPAC_CODES ||= {

    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',

    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',

    'n' => 'acgt',

    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u',

    'ct' => 'y',
    'ag' => 'r',
    'at' => 'w',
    'cg' => 's',
    'gt' => 'k',
    'ac' => 'm',

    'cgt' => 'b',
    'agt' => 'd',
    'act' => 'h',
    'acg' => 'v',

    'acgt' => 'n'
  }

  # Returns a MatchData (truthy) when +base+ occurs literally in
  # "acgtACGT", nil otherwise.
  #
  # A String argument is escaped before matching: String#match would
  # otherwise compile it as a regular expression, so "." would count as
  # unambiguous and "*" or "[" would raise RegexpError.
  def self.is_unambiguous(base)
    pattern = base.is_a?(Regexp) ? base : Regexp.escape(base)
    "acgtACGT".match(pattern)
  end

  # Returns the uppercase IUPAC code for a set of bases (e.g. "AG" => "R").
  # The input is lowercased, sorted and de-duplicated before the lookup.
  # Unknown combinations are reported on stdout and mapped to "N".
  def self.to_IUAPC(bases)
    base = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
    if base == nil
      # Report the offending input (the lookup result is always nil here).
      p "Invalid base! #{bases}"
      base = 'n' #This is a patch... as one of the scripts failed here.
    end
    base.upcase
  end

  # True when +base+ is one of the bases listed for +code+ in IUPAC_CODES.
  # Comparison is case-insensitive. Raises NoMethodError for a +code+ that
  # is not in the table.
  def self.is_valid(code, base)
    IUPAC_CODES[code.downcase].chars.include? base.downcase
  end

end
|
123
|
+
|
124
|
+
# Extension of Bio::Sequence to count the differences between two sequences.
# The sequences are expected to be aligned already; no check is made that a
# base in the first sequence is valid in the second.
class Bio::Sequence
  # Number of positions, over the length of +seq1+, at which +seq1+ and
  # +seq2+ hold different elements.
  def self.snps_between(seq1, seq2)
    (0...seq1.size).count { |idx| seq1[idx] != seq2[idx] }
  end
end
|
136
|
+
|
137
|
+
class String
  # Number of characters in the string that
  # Bio::NucleicAcid.is_unambiguous does not accept.
  def count_ambiguities
    each_char.count { |chr| !Bio::NucleicAcid.is_unambiguous(chr) }
  end

  # Length of the leading run of characters that are not in A-Z.
  # NOTE(review): the previous comment said this counts the uppercase bases;
  # the expression actually measures the prefix before the first uppercase
  # letter. Confirm which one the callers expect.
  def upper_case_count
    self[/[^A-Z]*/].size
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Bio::PolyploidTools
  # Registry of lambdas that extract a chromosome (or arm) label from a
  # contig/scaffold name. Each lambda takes the contig name and returns the
  # extracted label.
  class ChromosomeArm

    @@arm_selection_functions = Hash.new

    # Characters 4 and 5 of the name. Example format: chr2A => "2A"
    @@arm_selection_functions[:nrgene] = lambda do | contig_name |
      contig_name[3,2]
    end

    # First two characters once every "chr" has been removed.
    # Works on a copy, so the caller's string is not modified and frozen
    # strings are accepted.
    @@arm_selection_functions[:first_two] = lambda do | contig_name |
      contig_name.gsub(/chr/,"")[0,2]
    end

    #Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
    #Or the first two characters in the contig name, to deal with
    #pseudomolecules that start with headers like: "1A"
    #And with the cases when 3B is named with the prefix: v443
    #Returns "U" when none of the cases applies.
    @@arm_selection_functions[:embl] = lambda do | contig_name|
      arr = contig_name.split('_')
      ret = "U"
      ret = arr[2][0,2] if arr.size >= 3
      ret = "3B" if arr.size == 2 and arr[0] == "v443"
      ret = arr[0][0,2] if arr.size == 1
      ret
    end

    # Second '_'-separated field of whatever precedes the first ':'.
    @@arm_selection_functions[:morex] = lambda do | contig_name |
      contig_name.split(':')[0].split("_")[1]
    end

    # The contig name itself.
    @@arm_selection_functions[:scaffold] = lambda do | contig_name |
      contig_name
    end

    # Returns the selection lambda registered under +name+, or nil when
    # there is none.
    #
    # A name of the form "<sep>,<index>" builds a lambda that splits the
    # contig name on <sep> and returns field <index> (0-based); that lambda
    # is registered under the given name and returned.
    def self.getArmSelection(name)
      arr = name.to_s.split(",")
      if arr.size == 2
        separator, field = arr
        field = field.to_i
        @@arm_selection_functions[name.to_sym] = lambda do |contig_name|
          contig_name.split(separator)[field]
        end
      end
      @@arm_selection_functions[name.to_sym]
    end

    # Names accepted by getArmSelection: the "<sep>,<index>" placeholder
    # followed by every registered key, as strings.
    def self.getValidFunctions
      tmp = @@arm_selection_functions.keys.map { |e| e.to_s }
      tmp.unshift "<sep>,<index>"
      tmp
    end

  end
end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
#puts "Loading ExonCointainer..."
|
2
|
+
module Bio::PolyploidTools
  # Groups SNPs by gene (snp_map), keeps the FASTA references and parental
  # alignments they are resolved against, and writes the FASTA / primer3
  # text produced for each SNP.
  class ExonContainer
    attr_reader :parental_1_sam, :parental_2_sam
    attr_reader :parental_1_name, :parental_2_name, :gene_models_db
    # No reader is generated for :chromosomes: #chromosomes(path) below is
    # defined under the same name and would replace it anyway.
    attr_reader :snp_map
    attr_reader :parents
    attr_accessor :flanking_size , :primer_3_min_seq_length, :max_hits

    BASES = [:A, :C, :G, :T]

    def initialize
      @parents=Hash.new
      @snp_map = Hash.new
      @primer_3_min_seq_length = 50
      @max_hits = 10
    end

    #Sets the reference file for the gene models
    def gene_models(path)
      @gene_models_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
      @gene_models_db.index
      @gene_models_path = path
    end

    #Returns the sequence for a region in the gene models (exon).
    #region.end is clamped (in place) to the length of the entry.
    def gene_model_sequence(region)
      target_reg = @gene_models_db.index.region_for_entry(region.entry)
      region.end = target_reg.length if region.end > target_reg.length

      seq=@gene_models_db.fetch_sequence(region)
      #This is a patch that we need to fix in biosamtools:
      #anything from the first '>' onwards is dropped from the result.
      index = seq.index('>')
      seq = seq.slice(0..(index - 1)) if index
      seq
    end

    #Sets the reference file for the chromosomes
    def chromosomes(path)
      @chromosomes_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
      @chromosomes_path = path
    end

    #Returns the sequence for a region in the chromosomes reference.
    #When the region starts before position 1, region.start is moved to 1
    #(in place) and the result is left-padded with '-' for the missing bases.
    def chromosome_sequence(region)
      left_pad = 0
      #TODO: Padd if it goes to the right
      if region.start < 1
        left_pad = region.start * -1
        left_pad += 1
        region.start = 1
      end
      str = "-" * left_pad << @chromosomes_db.fetch_sequence(region)
      #str << "n" * (region.size - str.size + 1) if region.size > str.size
      str
    end

    # Registers a FASTA file under opts[:name].
    # The file is opts[:alig_path]; opts[:reference_path] is used only when
    # :alig_path is missing.
    # NOTE(review): originally :reference_path was read and then
    # unconditionally overwritten by :alig_path; confirm the intended
    # precedence.
    def add_chromosome_arm(opts)
      @chromosomes = Hash.new unless @chromosomes
      name = opts[:name]
      path = opts[:alig_path] || opts[:reference_path]
      # Use the ivar directly: the bare name resolves to #chromosomes(path),
      # which requires an argument.
      @chromosomes[name] = Bio::DB::Fasta::FastaFile.new(fasta: path)
    end

    # Adds a SNP under its gene, after copying this container's max_hits.
    def add_snp(snp)
      snp.max_hits = self.max_hits
      @snp_map[snp.gene] = Array.new unless @snp_map[snp.gene]
      @snp_map[snp.gene] << snp
    end

    # Parses one SNP per line of +filename+ and adds those with a position
    # greater than 0, tagged with the given chromosome, snp_in and
    # original_name.
    def add_snp_file(filename, chromosome, snp_in, original_name)
      File.open(filename) do | f |
        f.each_line do | line |
          snp = SNP.parse(line)
          snp.flanking_size = flanking_size
          if snp.position > 0
            snp.container = self
            snp.chromosome = chromosome
            snp.snp_in = snp_in
            snp.original_name = original_name
            @snp_map[snp.gene] = Array.new unless @snp_map[snp.gene]
            @snp_map[snp.gene] << snp
          end
        end
      end
    end

    # Builds a FASTA-formatted string for +snp+: one record per parent
    # (consensus with ambiguities over the covered region) followed by one
    # record per entry of snp.exon_list. The base at the SNP's local
    # position is upcased in every record.
    def fasta_string_for_snp(snp)
      gene_region = snp.covered_region
      local_pos_in_gene = snp.local_position
      ret_str = ""
      @parents.each do |name, bam|
        ret_str << ">#{gene_region.id}_SNP-#{snp.position}_#{name} Overlapping_exons:#{gene_region.to_s} localSNPpo:#{local_pos_in_gene+1}\n"
        to_print = bam.consensus_with_ambiguities(region: gene_region).to_s
        to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
        ret_str << to_print << "\n"
      end

      snp.exon_list.each do | chromosome, exon |
        target_region = exon.target_region
        exon_start_offset = exon.query_region.start - gene_region.start
        chr_local_pos=local_pos_in_gene + target_region.start + 1
        ret_str << ">#{chromosome}_SNP-#{chr_local_pos} #{exon.to_s} #{target_region.orientation}\n"
        to_print = "-" * exon_start_offset
        chr_seq = chromosome_sequence(exon.target_region).to_s
        to_print << chr_seq
        # NOTE(review): the original also computed
        # exon_start_offset + local_pos_in_gene but never used it; the
        # upcased index below does not include the exon offset. Confirm
        # whether that is intended.
        to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
        ret_str << to_print
      end
      ret_str
    end

    # Writes the aligned sequences of every SNP to +file+. SNPs that raise
    # are recorded in @missing_exons and reported on stderr; processing
    # continues with the next SNP.
    def print_fasta_snp_exones (file)
      @missing_exons ||= Set.new
      @snp_map.each do | gene, snp_array|
        snp_array.each do |snp|
          begin
            file.puts snp.aligned_sequences_fasta
          rescue Exception=>e
            @missing_exons << snp.to_s
            $stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
            $stderr.puts "Local position: #{snp.local_position}"
            $stderr.puts "Parental sequences: #{snp.parental_sequences.to_s}"
            $stderr.puts e.backtrace
          end
        end
      end
    end

    # Writes the primer3 input of every SNP to +file+ and returns how many
    # non-empty entries were written. SNPs that raise are recorded in
    # @missing_exons and reported on stderr.
    #
    # NOTE(review): target_chromosome is not used; each SNP's own
    # chromosome is passed to primer_3_string.
    def print_primer_3_exons (file, target_chromosome , parental, max_specific_primers: 20 )
      added = 0
      # The rescue below appends to this set, so it must exist even when
      # print_fasta_snp_exones has not been called first.
      @missing_exons ||= Set.new

      @snp_map.each do | gene, snp_array|
        snp_array.each do |snp|
          begin
            string = snp.primer_3_string( snp.chromosome, parental, max_specific_primers: max_specific_primers )
            #TODO: add an error to the SNP when it has more than max_hits.
            #Or maybe inside the SNP file.
            if string.size > 0
              file.puts string
              added += 1
            end
          rescue Exception=>e
            @missing_exons << snp.to_s
            $stderr.puts "print_primer_3_exons: #{e.to_s} : #{snp.to_s}"
            $stderr.puts e.backtrace
          end
        end
      end
      return added
    end

    # Reads exonerate alignments (custom format, one per line) from
    # opts[:exonerate_file] and attaches the matching exon to every SNP
    # whose position falls inside an alignment with identity of at least
    # opts[:min_identity] (default 90).
    #
    # opts[:arm_selection]:: callable mapping a target id to a chromosome
    #                        label; defaults to the first three characters.
    # opts[:filter_best]::   forwarded to SNP#add_exon (default false).
    #
    # Finishes by calling remove_alignments_over_max_hits.
    def add_alignments(opts=Hash.new)
      opts = { :min_identity=>90, filter_best:false }.merge!(opts)
      exonerate_filename = opts[:exonerate_file]
      arm_selection = opts[:arm_selection]
      filter_best = opts[:filter_best]

      unless arm_selection
        arm_selection = lambda do | contig_name |
          contig_name[0,3]
        end
      end

      File.open(exonerate_filename) do |f|
        f.each_line do | line |
          record = Bio::DB::Exonerate::Alignment.parse_custom(line)
          next unless record && record.identity >= opts[:min_identity]
          snp_array = @snp_map[record.query_id]
          next if snp_array.nil?
          snp_array.each do |snp|
            next if snp.nil?
            next unless snp.position.between?( (record.query_start + 1) , record.query_end)
            begin
              exon = record.exon_on_gene_position(snp.position)
              snp.add_exon(exon, arm_selection.call(record.target_id), filter_best:filter_best)
            rescue Bio::DB::Exonerate::ExonerateException
              $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
            end
          end
        end
      end
      remove_alignments_over_max_hits
    end

    # Stores the total hit count on every SNP. SNPs with more than max_hits
    # hits get their exon list emptied, are flagged as repetitive and
    # receive an error message.
    def remove_alignments_over_max_hits
      @snp_map.each_pair do | gene, snp_array|
        snp_array.each do |snp|
          total_hits = snp.exon_list.map {|e| e[1].size}.reduce(0,:+)
          snp.hit_count = total_hits
          if total_hits > max_hits
            snp.exon_list = {}
            snp.repetitive = true
            snp.errors << "The marker is in a repetitive region (#{total_hits} hits to reference)"
          end
        end
      end
    end

    # Registers a parental line in @parents.
    #
    # opts[:name]:: key to register under. Defaults to the basename of
    #               opts[:path] without ".bam", or "Unknown" without a path.
    # opts[:path]:: BAM file; when given, a Bio::DB::Sam is opened against
    #               the gene models FASTA and stored. Otherwise nil is stored.
    def add_parental(opts=Hash.new)
      sam = nil
      name = opts[:name] ? opts[:name] : "Unknown"
      if opts[:path]
        path = opts[:path]
        unless opts[:name]
          # Plain String paths have no #basename; Pathname-like objects keep
          # using their own.
          name = path.respond_to?(:basename) ? path.basename(".bam") : File.basename(path, ".bam")
        end
        sam = Bio::DB::Sam.new({:fasta=>@gene_models_path, :bam=>opts[:path]})
      end
      @parents[name] = sam
    end
  end

end
|
@@ -0,0 +1,175 @@
|
|
1
|
+
module Bio::PolyploidTools
  # One marker parsed from a comma-separated line with the columns
  # INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE
  # and an optional tenth column with the contig.
  class Marker
    include Comparable
    #include Virgola
    attr_reader :template_sequence, :original, :snp
    attr_accessor :best_hit
    attr_accessor :index_90k
    attr_accessor :snp_id
    attr_accessor :snp_name
    attr_accessor :chr
    attr_accessor :coordinates_chr
    attr_accessor :map_order
    attr_accessor :chr_arm
    attr_accessor :distance_cm
    attr_accessor :sequence
    attr_writer :contig

    #after_map :parse_sequence_snp

    # FASTA record: the SNP name as header and the template sequence.
    def to_fasta
      ">#{self.snp_name}\n#{self.template_sequence}"
    end

    # Target id of the best hit when there is one (it replaces any stored
    # contig); otherwise the contig parsed from the line or assigned.
    def contig
      @contig = best_hit.target_id.chomp if best_hit
      @contig
    end

    # The ten columns, in input order, joined by commas.
    def to_csv
      "#{index_90k},#{snp_id},#{snp_name},#{chr},#{coordinates_chr},#{map_order},#{chr_arm},#{distance_cm},#{sequence},#{contig}"
    end

    # Markers with the same name are equal. Otherwise they are ordered by
    # chromosome arm, then by coordinates, with the name as tie-breaker.
    # NOTE(review): fields come from String#split, so unless a caller
    # converts them the coordinates are compared as strings.
    def <=>(anOter)
      return 0 if anOter.snp_name == @snp_name
      return @chr_arm <=> anOter.chr_arm if anOter.chr_arm != @chr_arm
      return @snp_name <=> anOter.snp_name if anOter.coordinates_chr == @coordinates_chr
      return @coordinates_chr <=> anOter.coordinates_chr
    end

    # Parses one CSV line. The line is not modified.
    def initialize(line)
      @template_sequence = nil
      #INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE
      @index_90k, @snp_id, @snp_name, @chr, @coordinates_chr, @map_order, @chr_arm, @distance_cm, @sequence, @contig = line.chomp.split(',')
      parse_sequence_snp
    end

    # Yields a Marker for every line of +filename+ whose sequence contains
    # a SNP in the form [A/G]. Returns the file content.
    def self.parse(filename)
      # File.read closes the handle once the content is loaded.
      content = File.read(filename)
      content.each_line do |line|
        m = Marker.new(line)
        yield m if m.template_sequence
      end
    end

    protected
    # Looks for "<pre>[X/Y]<post>" in the sequence. On a match it stores
    # the 1-based SNP position, both alleles and the template sequence
    # (the SNP replaced by its IUPAC ambiguity code), and returns the
    # template; otherwise it returns nil.
    def parse_sequence_snp
      # Short or blank lines leave @chr nil; they simply do not match below.
      @chr = @chr.upcase if @chr
      match_data = /(?<pre>\w*)\[(?<org>[ACGT])\/(?<snp>[ACGT])\](?<pos>\w*)/.match(sequence)
      return nil unless match_data

      @position = match_data[:pre].size + 1
      @original = match_data[:org]
      @snp = match_data[:snp]
      amb_base = Bio::NucleicAcid.to_IUAPC("#{@original}#{@snp}")
      @template_sequence = "#{match_data[:pre]}#{amb_base}#{match_data[:pos]}"
      @template_sequence
    end
  end


  #The map has to come sorted.
  class ArmMap
    # No readers are generated for :global_reference and :reference: the
    # methods of the same name below (which take a path) would replace them.
    attr_reader :markers
    attr_accessor :chromosome

    def initialize
      @markers = Hash.new
    end

    # Aligns the markers FASTA (written by print_fasta_markers) against the
    # reference with blat, writing the report to +output+, and keeps the
    # highest-scoring hit of each marker in Marker#best_hit.
    def align_markers(output)
      Bio::Blat.align(@reference.fasta_path, @fasta_markers, output) do |hit|
        marker = markers[hit.query_id]
        if !marker.best_hit || hit.score > marker.best_hit.score
          marker.best_hit = hit
        end
      end
    end

    # Writes to +contigs_file+ the full sequence of every contig that is
    # the best hit of at least one marker.
    def print_fasta_contigs_for_markers(contigs_file)
      contigs = Set.new
      markers.each do |k, marker|
        contigs << marker.best_hit.target_id if marker.best_hit
      end

      File.open(contigs_file, "w") do |fasta|
        contigs.each do |contig_id|
          reg = @reference.index.region_for_entry(contig_id)
          fasta.puts ">#{contig_id}\n#{@reference.fetch_sequence(reg.get_full_region)}"
        end
      end
    end

    # Writes every marker as FASTA to +filename+ and remembers the path for
    # align_markers.
    def print_fasta_markers(filename)
      @fasta_markers = filename
      File.open(filename, "w") do |fasta|
        markers.each do |k, marker|
          fasta.puts marker.to_fasta
        end
      end
    end

    # Opens +reference+ as the global FASTA reference and loads its index.
    def global_reference(reference)
      @global_reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
      @global_reference.load_fai_entries
    end

    # Opens +reference+ as the FASTA reference and loads its index.
    def reference(reference)
      @reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
      @reference.load_fai_entries
    end

    # Makes +filename+ the reference. When the file does not exist yet, it
    # is first written with the entries of the global reference whose name
    # maps (see arm_selection_embl) to this map's chromosome.
    def print_fasta_contigs_from_reference(filename)
      if File.exist?(filename)
        reference(filename)
        return
      end

      File.open(filename, "w") do |fasta|
        Bio::FlatFile.auto( @global_reference.fasta_path) do |ff|
          ff.each do |f|
            chr_reg = arm_selection_embl(f.entry_id)
            fasta.puts f.entry if chr_reg == chromosome
          end
        end
      end
      reference(filename)
    end

    # Writes the markers as CSV to +filename+, sorted by map_order.
    # NOTE(review): map_order is compared with <=> as stored; values parsed
    # from a line are strings.
    def print_map_with_contigs(filename)
      File.open(filename, "w") do |file|
        markers.values.sort { |x,y| x.map_order <=> y.map_order }.each do | marker |
          file.puts marker.to_csv
        end
      end
    end

    protected
    # First two characters of the third '_'-separated field of the name.
    def arm_selection_embl(contig_name)
      contig_name.split('_')[2][0,2]
    end
  end
end
|