bio-polymarker 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,153 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
class Bio::Blat
  # Runs the external `blat` program and parses the report it writes.
  #
  # The command line is echoed to STDERR before it is executed. When a block
  # is given, every hit of the report is yielded to it and +nil+ is returned;
  # without a block the hits are collected and returned as an Array.
  #
  # NOTE(review): the three arguments are interpolated unescaped into a shell
  # command line. Only pass trusted paths until this is changed to an
  # argument-vector call.
  # NOTE(review): the IO obtained from Bio::FlatFile.open is never closed
  # here - confirm whether Bio::Blat::Report needs it after construction.
  #
  # @param database [String] path handed to blat as the database
  # @param query [String] path handed to blat as the query
  # @param output [String] path blat writes its report to
  # @return [Array, nil] the hits, or nil when a block was given
  # @raise [Exception] when blat exits with a non-zero status
  def self.align(database, query, output)
    cmdline = "blat #{database} #{query} #{output}"
    $stderr.puts cmdline
    status, _stdout, stderr = systemu cmdline
    unless status.exitstatus == 0
      raise Exception.new(), "Error running blat. Command line was '#{cmdline}'\nBlat STDERR was:\n#{stderr}"
    end

    blat_aln = Bio::Blat::Report.new(Bio::FlatFile.open(output).to_io)
    if block_given?
      blat_aln.each_hit { |hit| yield hit }
      return nil
    end

    alns = []
    blat_aln.each_hit { |hit| alns << hit }
    alns
  end
end
|
25
|
+
|
26
|
+
# Helpers that decode wheat chromosome information from the target id of a
# BLAT hit. The target id is split on "_" and the third field is used, so an
# id such as "IWGSC_CSS_1AL_scaff_110" gives the arm string "1AL".
class Bio::Blat::Report::Hit

  # Third "_"-separated field of the target id (e.g. "1AL"), or nil when the
  # id has fewer than three fields. The value is cached once it is non-nil.
  def wheat_chr_arm
    @wheat_chr_arm ||= target_id.split('_')[2]
  end

  # First two characters of the arm string (e.g. "1A"), or nil when the
  # target id has no arm field.
  def wheat_chr
    arm = wheat_chr_arm
    arm && arm[0, 2]
  end

  # First character of the arm string (e.g. "1").
  #
  # @raise [Exception] when no chromosome can be read from the target id
  def wheat_chr_group
    raise Exception.new(), "No wheat group for #{target_id} #{self.inspect}" unless wheat_chr
    wheat_chr_arm[0]
  end

  # Second character of the arm string (e.g. "A").
  def wheat_genome
    wheat_chr_arm[1]
  end

  # Third character of the arm string (e.g. "L").
  def wheat_arm
    wheat_chr_arm[2]
  end

  # Matches plus mismatches, as a percentage of the query length.
  def percentage_covered
    (match + mismatch) * 100.0 / query_len.to_f
  end

end
|
56
|
+
|
57
|
+
|
58
|
+
class Hash
  # Flattens the hash into a single string: each key/value pair is joined
  # with +keyvaldelim+ and the resulting entries are joined with
  # +entrydelim+. Both delimiters default to the global output field
  # separator ($,).
  #
  #   {a: 1, b: 2}.join("=", "&")  #=> "a=1&b=2"
  def join(keyvaldelim=$,, entrydelim=$,)
    map { |pair| pair.join(keyvaldelim) }.join(entrydelim)
  end
end
|
63
|
+
|
64
|
+
|
65
|
+
class Bio::NucleicAcid

  # Two-way lookup table: an ambiguity code maps to the sorted bases it
  # stands for, and a sorted base string maps back to its ambiguity code.
  # All keys and values are lowercase.
  IUPAC_CODES ||= {

    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',

    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',

    'n' => 'acgt',

    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u',

    'ct' => 'y',
    'ag' => 'r',
    'at' => 'w',
    'cg' => 's',
    'gt' => 'k',
    'ac' => 'm',

    'cgt' => 'b',
    'agt' => 'd',
    'act' => 'h',
    'acg' => 'v',

    'acgt' => 'n'
  }

  # True when +base+ is exactly one of a, c, g or t (either case).
  # The base is compared literally, so characters such as "." or "[" are
  # reported as ambiguous instead of being interpreted as a pattern.
  def self.is_unambiguous(base)
    base = base.to_s
    base.size == 1 && 'acgtACGT'.include?(base)
  end

  # Returns the uppercase IUPAC code covering all the bases in +bases+
  # (case and order are ignored, duplicates are collapsed). Unknown
  # combinations are reported on STDERR and mapped to "N".
  def self.to_IUAPC(bases)
    code = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
    unless code
      $stderr.puts "Invalid base! #{bases}"
      code = 'n' #This is a patch... as one of the scripts failed here.
    end
    code.upcase
  end

  # True when +base+ is one of the characters that +code+ maps to in
  # IUPAC_CODES. Codes missing from the table are never valid.
  def self.is_valid(code, base)
    expansion = IUPAC_CODES[code.to_s.downcase]
    return false unless expansion
    expansion.chars.include?(base.to_s.downcase)
  end

end
|
123
|
+
|
124
|
+
# Monkey patch on Bio::Sequence to count SNPs between two sequences. It
# assumes the sequences are already aligned and does not check whether a base
# in the first sequence is valid in the second.
class Bio::Sequence
  # Counts the positions, over the length of +seq1+, at which the two
  # sequences differ. Positions past the end of +seq2+ count as differences.
  def self.snps_between(seq1, seq2)
    (0...seq1.size).count { |i| seq1[i] != seq2[i] }
  end
end
|
136
|
+
|
137
|
+
class String
  # Monkey patch for nucleic acid strings: number of characters for which
  # Bio::NucleicAcid.is_unambiguous is false.
  def count_ambiguities
    each_char.count { |base| !Bio::NucleicAcid.is_unambiguous(base) }
  end

  # Counts how many characters are uppercase (A-Z), wherever they occur in
  # the string.
  def upper_case_count
    count('A-Z')
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Bio::PolyploidTools
|
2
|
+
# Registry of lambdas that derive a chromosome (arm) label from a contig
# name. Each lambda takes the contig name and returns the label.
class ChromosomeArm

  @@arm_selection_functions = Hash.new

  #example format: chr2A
  # Takes the two characters that follow the first three.
  @@arm_selection_functions[:nrgene] = lambda do |contig_name|
    contig_name[3, 2]
  end

  # Drops every "chr" from the name and takes the first two characters that
  # remain. Works on a copy, so the caller's string is left untouched.
  @@arm_selection_functions[:first_two] = lambda do |contig_name|
    contig_name.gsub(/chr/, "")[0, 2]
  end

  #Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
  #Or the first two characters in the contig name, to deal with
  #pseudomolecules that start with headers like: "1A"
  #And with the cases when 3B is named with the prefix: v443
  # Any other shape yields "U".
  @@arm_selection_functions[:embl] = lambda do |contig_name|
    fields = contig_name.split('_')
    label = "U"
    label = fields[2][0, 2] if fields.size >= 3
    label = "3B" if fields.size == 2 && fields[0] == "v443"
    label = fields[0][0, 2] if fields.size == 1
    label
  end

  # Takes the part before the first ":" and returns its second
  # "_"-separated field.
  @@arm_selection_functions[:morex] = lambda do |contig_name|
    contig_name.split(':')[0].split("_")[1]
  end

  # Uses the contig name itself as the label.
  @@arm_selection_functions[:scaffold] = lambda do |contig_name|
    contig_name
  end

  # Looks up a selection function by name (String or Symbol).
  #
  # A name of the form "<sep>,<index>" registers, under that name, a
  # function that splits the contig name on <sep> and returns the field at
  # <index>, and then returns it.
  #
  # @return [Proc, nil] nil when the name is not registered
  def self.getArmSelection(name)
    key = name.to_s
    parts = key.split(",")
    if parts.size == 2
      separator = parts[0]
      field = parts[1].to_i
      @@arm_selection_functions[key.to_sym] = lambda do |contig_name|
        contig_name.split(separator)[field]
      end
    end
    @@arm_selection_functions[key.to_sym]
  end

  # Names of all registered functions as Strings, preceded by the
  # "<sep>,<index>" placeholder describing the custom form.
  def self.getValidFunctions
    names = @@arm_selection_functions.keys.map { |key| key.to_s }
    names.unshift "<sep>,<index>"
    names
  end

end
|
63
|
+
end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
#puts "Loading ExonCointainer..."
|
2
|
+
module Bio::PolyploidTools
|
3
|
+
# Holds the SNPs to design primers for (grouped by gene in +snp_map+), the
# FASTA references they are looked up in and the parental alignments, and
# writes the FASTA / Primer3 inputs derived from them.
class ExonContainer
  attr_reader :parental_1_sam, :parental_2_sam
  attr_reader :parental_1_name, :parental_2_name, :gene_models_db
  attr_reader :snp_map
  attr_reader :parents
  attr_accessor :flanking_size, :primer_3_min_seq_length, :max_hits

  BASES = [:A, :C, :G, :T]

  def initialize
    @parents = Hash.new
    @snp_map = Hash.new
    @primer_3_min_seq_length = 50
    @max_hits = 10
  end

  #Sets the reference file for the gene models
  def gene_models(path)
    @gene_models_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
    @gene_models_db.index
    @gene_models_path = path
  end

  #Returns the sequence for a region in the gene models (exon)
  # The region's end is clamped (in place) to the length of its entry.
  def gene_model_sequence(region)
    target_reg = @gene_models_db.index.region_for_entry(region.entry)
    region.end = target_reg.length if region.end > target_reg.length

    seq = @gene_models_db.fetch_sequence(region)
    #This is a patch that we need to fix in biosamtools:
    # anything from the first '>' onwards is cut off the fetched text.
    header_at = seq.index('>')
    seq = seq[0...header_at] if header_at
    seq
  end

  # With a path: sets the reference file for the chromosomes.
  # Without arguments: returns the Hash filled by #add_chromosome_arm
  # (nil until an arm has been added).
  def chromosomes(path = nil)
    return @chromosomes if path.nil?
    @chromosomes_db = Bio::DB::Fasta::FastaFile.new(fasta: path)
    @chromosomes_path = path
  end

  #Returns the sequence for a region in the chromosomes reference.
  # A region starting before position 1 is moved (in place) to start at 1
  # and the missing bases are left-padded with "-".
  def chromosome_sequence(region)
    left_pad = 0
    #TODO: Padd if it goes to the right
    if region.start < 1
      left_pad = 1 - region.start
      region.start = 1
    end
    "-" * left_pad << @chromosomes_db.fetch_sequence(region)
  end

  # Registers the FASTA file of a chromosome arm under opts[:name].
  # The file is opts[:alig_path]; opts[:reference_path] is used only when
  # no :alig_path is given.
  def add_chromosome_arm(opts)
    @chromosomes ||= Hash.new
    path = opts[:alig_path] || opts[:reference_path]
    @chromosomes[opts[:name]] = Bio::DB::Fasta::FastaFile.new(fasta: path)
  end

  # Adds one SNP under its gene, giving it this container's max_hits.
  def add_snp(snp)
    snp.max_hits = max_hits
    (@snp_map[snp.gene] ||= Array.new) << snp
  end

  # Parses one SNP per line of +filename+ and stores those whose position
  # is greater than zero, tagged with the given chromosome, snp_in and
  # original_name.
  def add_snp_file(filename, chromosome, snp_in, original_name)
    File.open(filename) do |f|
      f.each_line do |line|
        snp = SNP.parse(line)
        snp.flanking_size = flanking_size
        next unless snp.position > 0
        snp.container = self
        snp.chromosome = chromosome
        snp.snp_in = snp_in
        snp.original_name = original_name
        (@snp_map[snp.gene] ||= Array.new) << snp
      end
    end
  end

  # Builds a FASTA string with one record per parent (its consensus over
  # the region covered by the SNP) followed by one record per entry of
  # snp.exon_list, left-padded with "-" by the exon's offset in the gene
  # region. In every sequence the base at the SNP's local position is
  # upcased. Every record is terminated by a newline.
  def fasta_string_for_snp(snp)
    gene_region = snp.covered_region
    local_pos_in_gene = snp.local_position
    ret_str = ""
    @parents.each do |name, bam|
      ret_str << ">#{gene_region.id}_SNP-#{snp.position}_#{name} Overlapping_exons:#{gene_region.to_s} localSNPpo:#{local_pos_in_gene+1}\n"
      to_print = bam.consensus_with_ambiguities(region: gene_region).to_s
      to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
      ret_str << to_print << "\n"
    end

    snp.exon_list.each do |chromosome, exon|
      target_region = exon.target_region
      exon_start_offset = exon.query_region.start - gene_region.start
      chr_local_pos = local_pos_in_gene + target_region.start + 1
      ret_str << ">#{chromosome}_SNP-#{chr_local_pos} #{exon.to_s} #{target_region.orientation}\n"
      # The "-" padding aligns the sequence to the start of the gene region,
      # so the SNP sits at the same local index as in the parental records.
      to_print = "-" * exon_start_offset
      to_print << chromosome_sequence(exon.target_region).to_s
      to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
      ret_str << to_print << "\n"
    end
    ret_str
  end

  # Writes the aligned sequences of every SNP to +file+. A SNP that raises
  # is recorded as missing and reported on STDERR; the remaining SNPs are
  # still written.
  def print_fasta_snp_exones (file)
    @snp_map.each do |gene, snp_array|
      snp_array.each do |snp|
        begin
          file.puts snp.aligned_sequences_fasta
        # NOTE(review): Exception is rescued on purpose for now - code in this
        # file raises bare Exception - but this also swallows Interrupt.
        rescue Exception => e
          missing_exons << snp.to_s
          $stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
          $stderr.puts "Local position: #{snp.local_position}"
          $stderr.puts "Parental sequences: #{snp.parental_sequences.to_s}"
          $stderr.puts e.backtrace
        end
      end
    end
  end

  # Writes the Primer3 input of every SNP to +file+ and returns how many
  # non-empty inputs were written. A SNP that raises is recorded as missing
  # and reported on STDERR. +target_chromosome+ is accepted for
  # compatibility; each SNP's own chromosome is what is passed on.
  def print_primer_3_exons (file, target_chromosome , parental, max_specific_primers: 20 )
    added = 0

    @snp_map.each do |gene, snp_array|
      snp_array.each do |snp|
        begin
          string = snp.primer_3_string( snp.chromosome, parental, max_specific_primers: max_specific_primers )
          #TODO: add tan error to the SNP this snp has more than max_hits.
          #Or maybe inside the SNP file.
          if string.size > 0
            file.puts string
            added += 1
          end
        # NOTE(review): see print_fasta_snp_exones about rescuing Exception.
        rescue Exception => e
          missing_exons << snp.to_s
          $stderr.puts "print_primer_3_exons: #{e.to_s} : #{snp.to_s}"
          $stderr.puts e.backtrace
        end
      end
    end
    added
  end

  # Reads exonerate alignments from opts[:exonerate_file] and attaches, to
  # every stored SNP whose position falls inside an alignment of its gene,
  # the exon at that position. Alignments below opts[:min_identity]
  # (default 90) are ignored. opts[:arm_selection] maps a target id to a
  # chromosome label (default: its first three characters);
  # opts[:filter_best] (default false) is passed on to the SNP.
  # SNPs with more hits than max_hits are then flagged as repetitive.
  def add_alignments(opts=Hash.new)
    opts = { :min_identity=>90, filter_best:false }.merge!(opts)
    exonerate_filename = opts[:exonerate_file]
    filter_best = opts[:filter_best]
    arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0,3] }

    File.open(exonerate_filename) do |f|
      f.each_line do |line|
        record = Bio::DB::Exonerate::Alignment.parse_custom(line)
        next unless record && record.identity >= opts[:min_identity]
        snp_array = @snp_map[record.query_id]
        next unless snp_array

        snp_array.each do |snp|
          next unless snp && snp.position.between?(record.query_start + 1, record.query_end)
          begin
            exon = record.exon_on_gene_position(snp.position)
            snp.add_exon(exon, arm_selection.call(record.target_id), filter_best:filter_best)
          rescue Bio::DB::Exonerate::ExonerateException
            $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
          end
        end
      end
    end
    remove_alignments_over_max_hits
  end

  # Stores the total number of hits on every SNP; a SNP with more than
  # max_hits hits gets its exon list emptied, is flagged repetitive and
  # receives an error message.
  def remove_alignments_over_max_hits
    @snp_map.each_pair do |gene, snp_array|
      snp_array.each do |snp|
        total_hits = snp.exon_list.map { |entry| entry[1].size }.reduce(0, :+)
        snp.hit_count = total_hits
        next unless total_hits > max_hits
        snp.exon_list = {}
        snp.repetitive = true
        snp.errors << "The marker is in a repetitive region (#{total_hits} hits to reference)"
      end
    end
  end

  # Registers a parent. With opts[:path] a Bio::DB::Sam is opened on that
  # BAM file against the gene models FASTA; without it the parent is stored
  # with a nil alignment. The name is opts[:name], else the BAM file name
  # without ".bam", else "Unknown".
  def add_parental(opts=Hash.new)
    sam = nil
    name = opts[:name] || "Unknown"
    if opts[:path]
      path = opts[:path]
      name = opts[:name] || File.basename(path.to_s, ".bam")
      sam = Bio::DB::Sam.new({:fasta=>@gene_models_path, :bam=>path})
    end
    @parents[name] = sam
  end

  private

  # Set of the SNPs (as strings) that could not be printed; created on
  # first use so every print method can record failures.
  def missing_exons
    @missing_exons ||= Set.new
  end
end
|
244
|
+
|
245
|
+
end
|
@@ -0,0 +1,175 @@
|
|
1
|
+
module Bio::PolyploidTools
|
2
|
+
# One marker read from a comma-separated line with the columns
# INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE
# and an optional tenth column holding the contig.
# The SEQUENCE column is expected to contain the SNP as "[A/G]"; when it
# does, +template_sequence+ holds the sequence with the SNP replaced by its
# IUPAC ambiguity code, otherwise it is nil.
class Marker
  include Comparable
  #include Virgola
  attr_reader :template_sequence, :original, :snp
  attr_accessor :best_hit
  attr_accessor :index_90k
  attr_accessor :snp_id
  attr_accessor :snp_name
  attr_accessor :chr
  attr_accessor :coordinates_chr
  attr_accessor :map_order
  attr_accessor :chr_arm
  attr_accessor :distance_cm
  attr_accessor :sequence
  attr_writer :contig

  #after_map :parse_sequence_snp

  # FASTA record with the SNP name as header and the template sequence as
  # body (no trailing newline).
  def to_fasta
    ">#{snp_name}\n#{template_sequence}"
  end

  # The target id of the best hit when there is one (it then replaces any
  # stored contig), otherwise the stored contig.
  def contig
    @contig = best_hit.target_id.chomp if best_hit
    @contig
  end

  # The nine input columns plus the contig, comma separated.
  def to_csv
    "#{index_90k},#{snp_id},#{snp_name},#{chr},#{coordinates_chr},#{map_order},#{chr_arm},#{distance_cm},#{sequence},#{contig}"
  end

  # Markers with the same SNP name are equal. Otherwise they are ordered by
  # chromosome arm, then by chromosome coordinates, with the SNP name as
  # tie-breaker.
  def <=>(anOter)
    return 0 if anOter.snp_name == @snp_name
    return @chr_arm <=> anOter.chr_arm if anOter.chr_arm != @chr_arm
    return @snp_name <=> anOter.snp_name if anOter.coordinates_chr == @coordinates_chr
    @coordinates_chr <=> anOter.coordinates_chr
  end

  # @param line [String] one comma-separated record; it is not modified
  def initialize(line)
    fields = line.chomp.split(',')
    @template_sequence = nil
    #INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE
    @index_90k, @snp_id, @snp_name, @chr, @coordinates_chr, @map_order, @chr_arm, @distance_cm, @sequence, @contig = fields
    parse_sequence_snp
  end

  # Yields a Marker for every line of +filename+ whose sequence contains a
  # parsable SNP; other lines (blank ones included) are skipped.
  def self.parse(filename)
    File.foreach(filename) do |line|
      m = Marker.new(line)
      yield m if m.template_sequence
    end
  end

  protected

  # Upcases the chromosome and, when the sequence has the form
  # "<pre>[X/Y]<post>", stores the SNP position, both alleles and the
  # template sequence. Returns the template sequence, or nil when the
  # sequence does not match.
  def parse_sequence_snp
    @chr = @chr.upcase if @chr
    match_data = /(?<pre>\w*)\[(?<org>[ACGT])\/(?<snp>[ACGT])\](?<pos>\w*)/.match(sequence)
    return nil unless match_data

    @position = match_data[:pre].size + 1
    @original = match_data[:org]
    @snp = match_data[:snp]
    amb_base = Bio::NucleicAcid.to_IUAPC("#{@original}#{@snp}")
    @template_sequence = "#{match_data[:pre]}#{amb_base}#{match_data[:pos]}"
  end
end
|
75
|
+
|
76
|
+
|
77
|
+
#The map has to come sorted.
# Markers of one chromosome, keyed by the id used as BLAT query id, together
# with the FASTA references they are aligned against.
class ArmMap
  attr_reader :markers
  attr_accessor :chromosome

  def initialize
    @markers = Hash.new
  end

  # Aligns the markers FASTA written by #print_fasta_markers against the
  # reference with BLAT and keeps, per marker, the hit with the highest
  # score. Hits whose query id is not a known marker are ignored.
  def align_markers(output)
    Bio::Blat.align(@reference.fasta_path, @fasta_markers, output) do |hit|
      marker = markers[hit.query_id]
      next unless marker
      marker.best_hit = hit if marker.best_hit.nil? || hit.score > marker.best_hit.score
    end
  end

  # Writes to +contigs_file+, in FASTA format, the full sequence of every
  # contig that is the best hit of at least one marker (each contig once).
  def print_fasta_contigs_for_markers(contigs_file)
    contigs = Set.new
    markers.each_value do |marker|
      contigs << marker.best_hit.target_id if marker.best_hit
    end

    File.open(contigs_file, "w") do |fasta|
      contigs.each do |contig_id|
        reg = @reference.index.region_for_entry(contig_id)
        fasta.puts ">#{contig_id}\n#{@reference.fetch_sequence(reg.get_full_region)}"
      end
    end
  end

  # Writes every marker as a FASTA record to +filename+ and remembers the
  # file name as the query file for #align_markers.
  def print_fasta_markers(filename)
    @fasta_markers = filename
    File.open(filename, "w") do |fasta|
      markers.each_value { |marker| fasta.puts marker.to_fasta }
    end
  end

  # With a path: opens it as the global reference and loads its fai
  # entries. Without arguments: returns the current global reference.
  def global_reference(reference = nil)
    return @global_reference unless reference
    @global_reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
    @global_reference.load_fai_entries
  end

  # With a path: opens it as the reference and loads its fai entries.
  # Without arguments: returns the current reference.
  def reference(reference = nil)
    return @reference unless reference
    @reference = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
    @reference.load_fai_entries
  end

  # Makes +filename+ the reference. When the file already exists it is used
  # as is; otherwise it is first written with the entries of the global
  # reference whose label (see #arm_selection_embl) equals +chromosome+.
  def print_fasta_contigs_from_reference(filename)
    if File.exist?(filename)
      reference(filename)
      return
    end

    File.open(filename, "w") do |fasta|
      Bio::FlatFile.auto(@global_reference.fasta_path) do |ff|
        ff.each do |f|
          fasta.puts f.entry if arm_selection_embl(f.entry_id) == chromosome
        end
      end
    end
    reference(filename)
  end

  # Writes the markers as CSV lines to +filename+, sorted by map_order.
  # NOTE(review): map_order is compared as stored; values parsed from a CSV
  # line are Strings, so "10" sorts before "9" - confirm this is intended.
  def print_map_with_contigs(filename)
    File.open(filename, "w") do |file|
      markers.values.sort { |x, y| x.map_order <=> y.map_order }.each do |marker|
        file.puts marker.to_csv
      end
    end
  end

  protected

  # First two characters of the third "_"-separated field of the contig
  # name ("IWGSC_CSS_1AL_scaff_110" gives "1A"); nil when the name has
  # fewer than three fields.
  def arm_selection_embl(contig_name)
    arm = contig_name.split('_')[2]
    arm && arm[0, 2]
  end
end
|
175
|
+
end
|