bio-polyploid-tools 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +67 -0
- data/README +21 -0
- data/Rakefile +61 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +133 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +15 -0
- data/bin/find_best_blat_hit.rb +32 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +155 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/polymarker.rb +219 -0
- data/bin/snps_between_bams.rb +106 -0
- data/bio-polyploid-tools.gemspec +139 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +698 -0
- data/lib/bio/BIOExtensions.rb +186 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +52 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +194 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +22 -0
- data/lib/bio/PolyploidTools/SNP.rb +681 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +56 -0
- data/lib/bio/SAMToolsExtensions.rb +284 -0
- data/lib/bio/db/exonerate.rb +272 -0
- data/lib/bio/db/fastadb.rb +164 -0
- data/lib/bio/db/primer3.rb +673 -0
- data/lib/bioruby-polyploid-tools.rb +25 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/Test3Aspecific.csv +1 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +51 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +53 -0
- data/test/test_snp_parsing.rb +40 -0
- metadata +201 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
|
|
2
|
+
module Bio::NucleicAcid::Data
|
|
3
|
+
# Two-way IUPAC nucleotide lookup table; every key and value is lower case.
# Forward entries map a one-letter code to the bases it stands for, reverse
# entries map an alphabetically sorted base set back to its one-letter code.
IUPAC_CODES = {
  # two-base ambiguity codes
  'y' => 'ct', 'r' => 'ag', 'w' => 'at',
  's' => 'cg', 'k' => 'gt', 'm' => 'ac',
  # three-base ambiguity codes
  'b' => 'cgt', 'd' => 'agt', 'h' => 'act', 'v' => 'acg',
  # any base
  'n' => 'acgt',
  # plain bases stand for themselves
  'a' => 'a', 't' => 't', 'g' => 'g', 'c' => 'c', 'u' => 'u',
  # reverse lookup: sorted base set => code
  'ct' => 'y', 'ag' => 'r', 'at' => 'w',
  'cg' => 's', 'gt' => 'k', 'ac' => 'm',
  'cgt' => 'b', 'agt' => 'd', 'act' => 'h', 'acg' => 'v',
  'acgt' => 'n'
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
class Bio::NucleicAcid
|
|
44
|
+
|
|
45
|
+
# Two-way IUPAC nucleotide lookup table used by the class methods of
# Bio::NucleicAcid defined in this file; keys and values are lower case.
# Forward entries expand a one-letter code into its bases, reverse entries
# map an alphabetically sorted base set back to the one-letter code.
IUPAC_CODES = {
  # two-base ambiguity codes
  'y' => 'ct', 'r' => 'ag', 'w' => 'at',
  's' => 'cg', 'k' => 'gt', 'm' => 'ac',
  # three-base ambiguity codes
  'b' => 'cgt', 'd' => 'agt', 'h' => 'act', 'v' => 'acg',
  # any base
  'n' => 'acgt',
  # plain bases stand for themselves
  'a' => 'a', 't' => 't', 'g' => 'g', 'c' => 'c', 'u' => 'u',
  # reverse lookup: sorted base set => code
  'ct' => 'y', 'ag' => 'r', 'at' => 'w',
  'cg' => 's', 'gt' => 'k', 'ac' => 'm',
  'cgt' => 'b', 'agt' => 'd', 'act' => 'h', 'acg' => 'v',
  'acgt' => 'n'
}
|
|
81
|
+
|
|
82
|
+
# Tells whether +base+ is one of the four unambiguous DNA bases.
#
# The check is a literal membership test. The earlier
# <tt>"acgtACGT".match(base)</tt> compiled +base+ into a regular expression,
# so "." and "" matched, substrings such as "cg" were accepted and
# characters like "[" or "?" raised RegexpError.
#
# @param base [#to_s] candidate base, normally a one-character String
# @return [Boolean] true only for a single a, c, g or t (either case)
def self.is_unambiguous(base)
  symbol = base.to_s
  symbol.size == 1 && "acgtACGT".include?(symbol)
end
|
|
85
|
+
|
|
86
|
+
# Converts a set of bases into the single IUPAC code that represents it.
#
# The input is lower-cased, sorted and de-duplicated before the lookup in
# IUPAC_CODES, so "TC", "ct" and "CTC" all give "Y".
#
# Combinations missing from the table fall back to "N" (the patch from the
# original code is kept). The warning now reports the offending input; it
# used to interpolate the nil lookup result. It is written to $stderr so it
# does not end up in data printed on stdout.
#
# @param bases [#to_s] the bases to encode
# @return [String] upper-case IUPAC code
def self.to_IUAPC(bases)
  key = bases.to_s.downcase.chars.sort.uniq.join
  code = IUPAC_CODES[key]
  unless code
    $stderr.puts "Invalid base! #{bases}"
    code = 'n' # This is a patch... as one of the scripts failed here.
  end
  code.upcase
end
|
|
94
|
+
|
|
95
|
+
# Tells whether +base+ is one of the characters listed for +code+ in
# IUPAC_CODES (both arguments are compared in lower case).
#
# A code that is not a key of the table gives false; it used to fail with
# NoMethodError on nil.
#
# @param code [#to_s] key of IUPAC_CODES, e.g. "Y"
# @param base [#to_s] single base to look for
# @return [Boolean]
def self.is_valid(code, base)
  expansion = IUPAC_CODES[code.to_s.downcase]
  return false unless expansion

  expansion.chars.include?(base.to_s.downcase)
end
|
|
98
|
+
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
class Bio::Sequence
|
|
102
|
+
# Counts the positions at which two sequences differ.
#
# The comparison walks the indices of +seq1+ only; an index that +seq2+
# does not have counts as a difference, extra characters in +seq2+ are
# ignored.
#
# @param seq1 [String] sequence whose length drives the comparison
# @param seq2 [String] sequence compared against +seq1+
# @return [Integer] number of differing positions
def self.snps_between(seq1, seq2)
  (0...seq1.size).count { |pos| seq1[pos] != seq2[pos] }
end
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
class String
  # Number of characters for which Bio::NucleicAcid.is_unambiguous returns
  # a falsy value.
  #
  # @return [Integer]
  def count_ambiguities
    each_char.count { |symbol| !Bio::NucleicAcid.is_unambiguous(symbol) }
  end

  # Length of the leading run of characters that are NOT in A-Z.
  #
  # NOTE(review): despite the name this does not count upper-case letters;
  # it measures how many characters precede the first one. Confirm with the
  # callers that this is the intended meaning.
  #
  # @return [Integer]
  def upper_case_count
    self[/[^A-Z]*/].to_s.size
  end
end
|
|
126
|
+
|
|
127
|
+
class Bio::Blat
|
|
128
|
+
# Runs the external +blat+ program through systemu and reads the hits from
# the file it wrote.
#
# With a block, every hit is yielded and nil is returned; without a block
# the hits are collected and returned as an Array.
#
# Two defects of the earlier version are fixed: the command line was logged
# with <tt>puts $stderr.puts cmdline</tt>, which also printed an empty line
# on stdout, and the failure message named exonerate instead of blat.
#
# NOTE(review): the command is built by plain interpolation, so paths with
# spaces or shell metacharacters are not escaped.
#
# @param database [String] path of the BLAT database / reference
# @param query [String] path of the query FASTA
# @param output [String] path where blat writes its report
# @yield [hit] each hit of the parsed Bio::Blat::Report
# @return [Array, nil] collected hits, or nil when a block is given
# @raise [Exception] when blat exits with a non-zero status
def self.align(database, query, output)
  cmdline = "blat #{database} #{query} #{output}"
  $stderr.puts cmdline
  status, _stdout, stderr = systemu cmdline
  unless status.exitstatus == 0
    raise Exception, "Error running blat. Command line was '#{cmdline}'\nBlat STDERR was:\n#{stderr}"
  end

  blat_aln = Bio::Blat::Report.new(Bio::FlatFile.open(output).to_io)
  if block_given?
    blat_aln.each_hit { |hit| yield hit }
    nil
  else
    alns = []
    blat_aln.each_hit { |hit| alns << hit }
    alns
  end
end
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
class Bio::Blat::Report::Hit
|
|
151
|
+
|
|
152
|
+
# Extracts the chromosome-arm token from a target id such as
# IWGSC_CSS_1AL_scaff_110, i.e. the third underscore-separated field ("1AL").
#
# The result is memoised in @wheat_chr_arm. The earlier guard line
# (`@wheat_chr_arm if @wheat_chr_arm`) had no effect, so the id was split
# again on every call.
#
# @return [String, nil] the arm token, or nil when the id has fewer than
#   three fields
def wheat_chr_arm
  @wheat_chr_arm ||= target_id.split('_')[2]
end
|
|
157
|
+
|
|
158
|
+
# First two characters of the arm token returned by wheat_chr_arm
# (e.g. "1A" for "1AL").
#
# @return [String]
def wheat_chr
  wheat_chr_arm.slice(0, 2)
end
|
|
161
|
+
|
|
162
|
+
# First character of the arm token returned by wheat_chr_arm
# (e.g. "1" for "1AL").
#
# The guard now tests the arm token itself. It used to test wheat_chr,
# which fails with NoMethodError on a nil token before the descriptive
# error could be raised.
#
# @return [String]
# @raise [Exception] when no arm token can be extracted from target_id
def wheat_chr_group
  arm = wheat_chr_arm
  raise Exception, "No wheat group for #{target_id} #{inspect}" unless arm

  arm[0]
end
|
|
166
|
+
|
|
167
|
+
# Second character of the arm token returned by wheat_chr_arm
# (e.g. "A" for "1AL").
#
# @return [String]
def wheat_genome
  arm_token = wheat_chr_arm
  arm_token[1]
end
|
|
170
|
+
|
|
171
|
+
# Third character of the arm token returned by wheat_chr_arm
# (e.g. "L" for "1AL").
#
# @return [String]
def wheat_arm
  arm_token = wheat_chr_arm
  arm_token[2]
end
|
|
174
|
+
|
|
175
|
+
# Matches plus mismatches of the hit, as a percentage of the query length.
#
# @return [Float]
def percentage_covered
  aligned_bases = match + mismatch
  aligned_bases * 100.0 / query_len.to_f
end
|
|
178
|
+
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class Hash
  # Renders the hash as one string: each key/value pair is joined with
  # +keyvaldelim+ and the resulting pieces are joined with +entrydelim+.
  # Both delimiters default to the global output field separator ($,).
  #
  # @param keyvaldelim [String, nil] separator between a key and its value
  # @param entrydelim [String, nil] separator between entries
  # @return [String]
  def join(keyvaldelim=$,, entrydelim=$,)
    rendered_pairs = map { |pair| pair.join(keyvaldelim) }
    rendered_pairs.join(entrydelim)
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module Bio::PolyploidTools
|
|
2
|
+
|
|
3
|
+
class ChromosomeArm
|
|
4
|
+
attr_accessor :name
|
|
5
|
+
attr_reader :genes
|
|
6
|
+
attr_reader :loaded_entries
|
|
7
|
+
attr_reader :fasta_db
|
|
8
|
+
|
|
9
|
+
# Creates a chromosome arm backed by a FASTA file.
#
# The FASTA wrapper is created here; fetch_contig loads its index entries
# on first use.
#
# @param name [String] name of the arm
# @param path_to_fasta [String] path handed to Bio::DB::Fasta::FastaFile
def initialize(name, path_to_fasta)
  @genes = {}
  @name = name
  @fasta_db = Bio::DB::Fasta::FastaFile.new(path_to_fasta)
end
|
|
16
|
+
|
|
17
|
+
# Returns the full sequence of one contig of this arm.
#
# The FASTA index entries are loaded on the first call only; that is
# recorded in @loaded_entries.
#
# @param contig_id [String] entry name looked up in the FASTA index
# @return the value of fetch_sequence for the full region of the entry
def fetch_contig(contig_id)
  unless @loaded_entries
    @fasta_db.load_fai_entries
  end
  @loaded_entries = true

  full_region = fasta_db.index.region_for_entry(contig_id).get_full_region
  @fasta_db.fetch_sequence(full_region)
end
|
|
25
|
+
# Loads all the chromosome arms in a folder.
#
# Only files matching *.fa are considered. The arm name is the leading run
# of a digit followed by word characters in the file name (regex
# /^(?<arm>\d\w+)/). Files whose name does not start that way are skipped;
# they used to crash the loader with NoMethodError on nil.
#
# NOTE(review): \w also matches "_", so "1AL_contigs.fa" yields the name
# "1AL_contigs" and not "1AL" — confirm the expected file naming.
#
# @param path_to_contigs [String] folder containing the FASTA files
# @return [Hash{String => ChromosomeArm}] arms keyed by their name
def self.load_from_folder(path_to_contigs)
  chromosome_arms = {}

  Dir.foreach(path_to_contigs) do |filename|
    next unless File.fnmatch("*.fa", filename)

    parsed = /^(?<arm>\d\w+)/.match(filename)
    next unless parsed

    arm = ChromosomeArm.new(parsed[:arm], File.join(path_to_contigs, filename))
    chromosome_arms[arm.name] = arm
  end

  chromosome_arms
end
|
|
49
|
+
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
end
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
#puts "Loading ExonCointainer..."
|
|
2
|
+
module Bio::PolyploidTools
|
|
3
|
+
class ExonContainer
|
|
4
|
+
attr_reader :parental_1_sam, :parental_2_sam
|
|
5
|
+
attr_reader :parental_1_name, :parental_2_name, :gene_models_db
|
|
6
|
+
attr_reader :chromosomes, :snp_map
|
|
7
|
+
attr_reader :parents
|
|
8
|
+
attr_accessor :flanking_size
|
|
9
|
+
|
|
10
|
+
BASES = [:A, :C, :G, :T]
|
|
11
|
+
#Sets the reference file for the gene models
|
|
12
|
+
|
|
13
|
+
# Creates an empty container: no parents, no SNPs.
#
# @snp_contigs is now set to nil explicitly; the earlier bare `@snp_contigs`
# expression evaluated the variable and discarded the result.
def initialize
  @parents = {}
  @snp_map = {}
  @snp_contigs = nil
end
|
|
18
|
+
|
|
19
|
+
# Sets the reference FASTA file for the gene models and remembers its path.
#
# @param path [String] path of the gene-model FASTA
# @return [String] the path that was stored
def gene_models(path)
  database = Bio::DB::Fasta::FastaFile.new(path)
  @gene_models_db = database
  @gene_models_path = path
end
|
|
23
|
+
|
|
24
|
+
# Returns the sequence for a region in the gene models (exon).
#
# @param region region object understood by the gene-model FASTA database
# @return whatever fetch_sequence returns for that region
def gene_model_sequence(region)
  @gene_models_db.fetch_sequence(region)
end
|
|
30
|
+
|
|
31
|
+
# Sets the reference FASTA file for the chromosomes and remembers its path.
#
# NOTE(review): this one-argument method replaces the reader generated by
# `attr_reader :chromosomes` earlier in the class, so `chromosomes` can no
# longer be called without an argument.
#
# @param path [String] path of the chromosome FASTA
# @return [String] the path that was stored
def chromosomes(path)
  database = Bio::DB::Fasta::FastaFile.new(path)
  @chromosomes_db = database
  @chromosomes_path = path
end
|
|
36
|
+
|
|
37
|
+
# Returns the sequence for a region of the chromosome reference.
#
# When the region starts before position 0 the region is clamped
# (region.start is set to 0, mutating the argument) and the result is
# left-padded with (1 - start) "-" characters.
# TODO: pad when the region runs past the right end as well.
#
# @param region region object with a writable +start+
# @return [String] padding followed by the fetched sequence
def chromosome_sequence(region)
  pad_length = region.start < 0 ? 1 - region.start : 0
  region.start = 0 if pad_length > 0

  ("-" * pad_length) << @chromosomes_db.fetch_sequence(region)
end
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Registers the FASTA file of one chromosome arm under opts[:name].
#
# The entry is stored through @chromosomes directly. The earlier
# `chromosomes[name] = ...` called the one-argument `chromosomes` method
# without arguments and raised ArgumentError.
#
# opts[:alig_path] wins when both paths are given, as before;
# opts[:reference_path] was previously read and then overwritten, and now
# serves as the fallback.
#
# @param opts [Hash] :name, :alig_path and/or :reference_path
# @return the Bio::DB::Fasta::FastaFile stored for the arm
def add_chromosome_arm(opts)
  @chromosomes ||= {}
  name = opts[:name]
  path = opts[:alig_path] || opts[:reference_path]
  @chromosomes[name] = Bio::DB::Fasta::FastaFile.new(path)
end
|
|
59
|
+
|
|
60
|
+
# Files a SNP under its gene in @snp_map, creating the list on first use.
#
# @param snp object responding to +gene+
# @return [Array] the list of SNPs now stored for that gene
def add_snp(snp)
  (@snp_map[snp.gene] ||= []) << snp
end
|
|
64
|
+
|
|
65
|
+
# Reads one SNP per line from +filename+ (parsed with SNP.parse) and files
# every SNP whose position is greater than zero under its gene in @snp_map.
#
# Each kept SNP is linked to this container and tagged with the given
# chromosome, snp_in and original_name. flanking_size is assigned to every
# parsed SNP, kept or not.
#
# @param filename [String] file with one SNP description per line
# @param chromosome value stored in snp.chromosome
# @param snp_in value stored in snp.snp_in
# @param original_name value stored in snp.original_name
def add_snp_file(filename, chromosome, snp_in, original_name)
  File.open(filename) do |input|
    input.each_line do |row|
      snp = SNP.parse(row)
      snp.flanking_size = flanking_size
      next unless snp.position > 0

      snp.container = self
      snp.chromosome = chromosome
      snp.snp_in = snp_in
      snp.original_name = original_name
      (@snp_map[snp.gene] ||= []) << snp
    end
  end
end
|
|
83
|
+
|
|
84
|
+
# Placeholder: reads the covered region and the local position of the SNP,
# uses neither, and prints an empty line.
#
# @param snp object responding to +covered_region+ and +local_position+
# @return [nil]
def primer_3_input_for_snp(snp)
  _covered = snp.covered_region
  _offset = snp.local_position
  puts ""
end
|
|
89
|
+
|
|
90
|
+
# Builds a FASTA-formatted string around one SNP: one record per parent
# (consensus of the covered gene region) followed by one record per exon in
# snp.exon_list (chromosome sequence, left-padded with "-" so that it lines
# up with the gene region). The base at the SNP position is upper-cased in
# every record. The string is also printed on stdout before it is returned.
#
# Exon records now end with a newline like the parental ones; without it
# the header of the next record was glued to the end of the sequence. The
# unused local `l_pos` was removed.
#
# @param snp object responding to +covered_region+, +local_position+,
#   +position+ and +exon_list+
# @return [String] the FASTA text
def fasta_string_for_snp(snp)
  gene_region = snp.covered_region
  local_pos_in_gene = snp.local_position
  ret_str = ""

  @parents.each do |name, bam|
    ret_str << ">#{gene_region.id}_SNP-#{snp.position}_#{name} Overlapping_exons:#{gene_region.to_s} localSNPpo:#{local_pos_in_gene+1}\n"
    to_print = bam.consensus_with_ambiguities({:region=>gene_region}).to_s
    to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
    ret_str << to_print << "\n"
  end

  snp.exon_list.each do |chromosome, exon|
    target_region = exon.target_region
    # Offset of the exon inside the covered gene region; used as padding so
    # that local_pos_in_gene indexes the same column in every record.
    exon_start_offset = exon.query_region.start - gene_region.start
    chr_local_pos = local_pos_in_gene + target_region.start + 1
    ret_str << ">#{chromosome}_SNP-#{chr_local_pos} #{exon.to_s} #{target_region.orientation}\n"
    to_print = "-" * exon_start_offset
    to_print << chromosome_sequence(exon.target_region).to_s
    to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
    ret_str << to_print << "\n"
  end

  puts ret_str
  ret_str
end
|
|
116
|
+
|
|
117
|
+
# Writes the aligned-sequences FASTA of every SNP in @snp_map to +file+.
#
# A SNP whose FASTA cannot be produced is skipped and its string form is
# recorded in @missing_exons. Only StandardError is treated that way now;
# the earlier `rescue Exception` also swallowed Interrupt and SystemExit,
# so the loop could not be stopped from the keyboard.
#
# @param file [IO] destination, anything responding to +puts+
# @return [Hash] @snp_map
def print_fasta_snp_exones(file)
  @missing_exons ||= Set.new
  @snp_map.each_value do |snp_array|
    snp_array.each do |snp|
      begin
        file.puts snp.aligned_sequences_fasta
      rescue StandardError
        @missing_exons << snp.to_s
      end
    end
  end
end
|
|
131
|
+
|
|
132
|
+
# Writes the Primer3 input of every SNP in @snp_map to +file+; empty
# strings are not written.
#
# A SNP whose input cannot be produced is skipped and its string form is
# recorded in @missing_exons. The set is created here when needed; before,
# it only existed after print_fasta_snp_exones had run, so the rescue
# branch itself could fail on nil. Only StandardError is rescued now, so
# Interrupt and SystemExit are no longer swallowed.
#
# @param file [IO] destination, anything responding to +puts+
# @param target_chromosome accepted for compatibility; the body uses
#   snp.chromosome instead
# @param parental value forwarded to snp.primer_3_string
# @return [Hash] @snp_map
def print_primer_3_exons(file, target_chromosome, parental)
  @missing_exons ||= Set.new
  @snp_map.each_value do |snp_array|
    snp_array.each do |snp|
      begin
        string = snp.primer_3_string(snp.chromosome, parental)
        file.puts string if string.size > 0
      rescue StandardError
        @missing_exons << snp.to_s
      end
    end
  end
end
|
|
145
|
+
|
|
146
|
+
# Reads an exonerate output file and attaches, to every stored SNP, the
# exon of each alignment that covers the SNP position.
#
# Lines that do not parse, alignments below opts[:min_identity] (default
# 90) and alignments whose query has no SNPs in @snp_map are ignored. The
# chromosome arm passed to snp.add_exon is computed from the target id by
# opts[:arm_selection]; by default it is the first three characters.
# A Bio::DB::Exonerate::ExonerateException raised while locating the exon
# is reported on $stderr and the SNP is skipped for that alignment.
#
# @param opts [Hash] :exonerate_file (path), :min_identity (number),
#   :arm_selection (callable taking the target id)
def add_alignments(opts=Hash.new)
  opts = { :min_identity => 90 }.merge(opts)
  exonerate_filename = opts[:exonerate_file]
  arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0, 3] }

  File.open(exonerate_filename) do |f|
    f.each_line do |line|
      record = Bio::DB::Exonerate::Alignment.parse_custom(line)
      next unless record && record.identity >= opts[:min_identity]

      snps_for_gene = @snp_map[record.query_id]
      next if snps_for_gene.nil?

      snps_for_gene.each do |snp|
        next if snp.nil?
        next unless snp.position.between?(record.query_start + 1, record.query_end)

        begin
          exon = record.exon_on_gene_position(snp.position)
          snp.add_exon(exon, arm_selection.call(record.target_id))
        rescue Bio::DB::Exonerate::ExonerateException
          $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
        end
      end
    end
  end
end
|
|
180
|
+
|
|
181
|
+
# Registers a parental line in @parents.
#
# With opts[:path] a Bio::DB::Sam is opened on that BAM file, using the
# gene-model FASTA as reference. Without a path the parent is registered
# with a nil alignment. The name is opts[:name] when given, otherwise the
# BAM file name without ".bam", otherwise "Unknown".
#
# The default name is derived with File.basename. The earlier
# `path.basename(".bam")` only works on Pathname objects and failed with
# NoMethodError on a String path.
#
# @param opts [Hash] :name (String), :path (String or Pathname to a BAM)
# @return the Bio::DB::Sam stored for the parent, or nil
def add_parental(opts=Hash.new)
  sam = nil
  name = opts[:name] || "Unknown"
  path = opts[:path]
  if path
    name = opts[:name] || File.basename(path.to_s, ".bam")
    sam = Bio::DB::Sam.new({:fasta=>@gene_models_path, :bam=>path})
  end
  @parents[name] = sam
end
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
end
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
module Bio::PolyploidTools
|
|
2
|
+
class Marker
|
|
3
|
+
include Comparable
|
|
4
|
+
#include Virgola
|
|
5
|
+
attr_reader :template_sequence, :original, :snp
|
|
6
|
+
attr_accessor :best_hit
|
|
7
|
+
attr_accessor :index_90k
|
|
8
|
+
attr_accessor :snp_id
|
|
9
|
+
attr_accessor :snp_name
|
|
10
|
+
attr_accessor :chr
|
|
11
|
+
attr_accessor :coordinates_chr
|
|
12
|
+
attr_accessor :map_order
|
|
13
|
+
attr_accessor :chr_arm
|
|
14
|
+
attr_accessor :distance_cm
|
|
15
|
+
attr_accessor :sequence
|
|
16
|
+
attr_writer :contig
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
#after_map :parse_sequence_snp
|
|
21
|
+
|
|
22
|
+
# FASTA record of the marker: the SNP name as header and the template
# sequence as body (no trailing newline).
#
# @return [String]
def to_fasta
  format(">%s\n%s", snp_name, template_sequence)
end
|
|
25
|
+
|
|
26
|
+
# Contig of the marker. When a best hit is set, its target id (chomped)
# replaces any stored contig; otherwise the stored value is returned.
#
# @return [String, nil]
def contig
  hit = best_hit
  return @contig unless hit

  @contig = hit.target_id.chomp
end
|
|
30
|
+
|
|
31
|
+
# Comma-separated row of the marker, in the column order of the input file
# followed by the contig:
# INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE,contig
# Values are not quoted or escaped.
#
# @return [String]
def to_csv
  columns = [
    index_90k, snp_id, snp_name, chr, coordinates_chr,
    map_order, chr_arm, distance_cm, sequence, contig
  ]
  columns.map(&:to_s).join(",")
end
|
|
34
|
+
|
|
35
|
+
# Ordering used by Comparable.
#
# Markers with the same SNP name are equal. Otherwise they are ordered by
# chromosome arm, then by chromosome coordinate, and markers sharing a
# coordinate are ordered by SNP name.
#
# NOTE(review): the fields come from a CSV split, so the coordinates are
# presumably Strings and compare lexicographically ("10" < "9") — confirm.
#
# @param anOter [Marker] the marker to compare with
# @return [Integer, nil]
def <=>(anOter)
  if anOter.snp_name == @snp_name
    0
  elsif anOter.chr_arm != @chr_arm
    @chr_arm <=> anOter.chr_arm
  elsif anOter.coordinates_chr == @coordinates_chr
    @snp_name <=> anOter.snp_name
  else
    @coordinates_chr <=> anOter.coordinates_chr
  end
end
|
|
41
|
+
|
|
42
|
+
# Builds a marker from one CSV line with the columns
# INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE
# and an optional tenth column holding the contig, then derives the
# template sequence from SEQUENCE via parse_sequence_snp.
#
# The line is chomped on a copy; the earlier `line.chomp!` modified the
# caller's string and raised on frozen input.
#
# @param line [String] one row of the marker file
def initialize(line)
  @template_sequence = nil
  @index_90k, @snp_id, @snp_name, @chr, @coordinates_chr, @map_order,
    @chr_arm, @distance_cm, @sequence, @contig = line.chomp.split(',')
  parse_sequence_snp
end
|
|
49
|
+
|
|
50
|
+
# Reads a marker CSV file and yields a Marker for every line whose sequence
# could be turned into a template sequence; other lines are skipped.
#
# The file is read with File.read, which closes it. The earlier
# `File.open(filename, "r").read` never closed the handle.
#
# @param filename [String] path of the marker file
# @yield [marker] each successfully parsed Marker
# @return [String] the file content
def self.parse(filename)
  File.read(filename).each_line do |line|
    marker = Marker.new(line)
    yield marker if marker.template_sequence
  end
end
|
|
58
|
+
|
|
59
|
+
protected
|
|
60
|
+
# Parses a sequence of the form PREFIX[X/Y]SUFFIX, where X and Y are one
# of A, C, G, T.
#
# On a match it records the 1-based SNP position, the original and the
# alternative base, and stores the template sequence: the same sequence
# with the bracketed part replaced by the IUPAC code of both bases.
# @chr is upper-cased in place in every case.
#
# @return [String, nil] the template sequence, or nil when the sequence
#   does not contain a SNP in that notation
def parse_sequence_snp
  @chr.upcase!
  match_data = /(?<pre>\w*)\[(?<org>[ACGT])\/(?<snp>[ACGT])\](?<pos>\w*)/.match(sequence)
  return nil unless match_data

  prefix = match_data[:pre]
  suffix = match_data[:pos]
  @position = prefix.size + 1
  @original = match_data[:org]
  @snp = match_data[:snp]
  amb_base = Bio::NucleicAcid.to_IUAPC("#{@original}#{@snp}")
  @template_sequence = "#{prefix}#{amb_base}#{suffix}"
end
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
#The map has to come sorted.
|
|
78
|
+
class ArmMap
|
|
79
|
+
attr_reader :markers , :global_reference, :reference
|
|
80
|
+
attr_accessor :chromosome
|
|
81
|
+
# Creates an arm map with an empty marker table.
def initialize
  @markers = {}
end
|
|
84
|
+
|
|
85
|
+
# Aligns the marker FASTA (written by print_fasta_markers) against the
# reference with Bio::Blat.align and keeps, for every marker, the hit with
# the highest score. Among hits with equal score the first one seen wins.
#
# @param output [String] path where the BLAT report is written
# @return whatever Bio::Blat.align returns
def align_markers(output)
  Bio::Blat.align(@reference.fasta_path, @fasta_markers, output) do |hit|
    marker = markers[hit.query_id]
    current_best = marker.best_hit
    marker.best_hit = hit if !current_best || hit.score > current_best.score
  end
end
|
|
96
|
+
|
|
97
|
+
# Writes a FASTA file with the full sequence of every contig that is the
# best hit of at least one marker; each contig is written once.
#
# The output file is opened in block form so the handle is closed even when
# fetching a sequence fails; it used to stay open in that case.
#
# @param contigs_file [String] path of the FASTA file to write
# @return [nil]
def print_fasta_contigs_for_markers(contigs_file)
  contigs = Set.new
  markers.each_value do |marker|
    contigs << marker.best_hit.target_id if marker.best_hit
  end

  File.open(contigs_file, "w") do |fasta|
    contigs.each do |contig_id|
      reg = @reference.index.region_for_entry(contig_id)
      fasta.puts ">#{contig_id}\n#{@reference.fetch_sequence(reg.get_full_region)}"
    end
  end
  nil
end
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# Writes all markers as FASTA to +filename+ and remembers the path in
# @fasta_markers, where align_markers picks it up.
#
# The file is opened in block form so the handle is closed even when a
# marker fails to render; it used to stay open in that case.
#
# @param filename [String] path of the FASTA file to write
# @return [nil]
def print_fasta_markers(filename)
  @fasta_markers = filename
  File.open(filename, "w") do |fasta|
    markers.each_value { |marker| fasta.puts marker.to_fasta }
  end
  nil
end
|
|
127
|
+
|
|
128
|
+
# Opens the global reference FASTA, stores it in @global_reference and
# loads its index entries.
#
# NOTE(review): this one-argument method replaces the reader generated by
# `attr_reader :global_reference` earlier in the class.
#
# @param reference [String] path of the FASTA file
# @return whatever load_fai_entries returns
def global_reference(reference)
  fasta = Bio::DB::Fasta::FastaFile.new(reference)
  @global_reference = fasta
  fasta.load_fai_entries
end
|
|
132
|
+
|
|
133
|
+
# Opens the arm-specific reference FASTA, stores it in @reference and loads
# its index entries.
#
# NOTE(review): this one-argument method replaces the reader generated by
# `attr_reader :reference` earlier in the class.
#
# @param reference [String] path of the FASTA file
# @return whatever load_fai_entries returns
def reference(reference)
  fasta = Bio::DB::Fasta::FastaFile.new(reference)
  @reference = fasta
  fasta.load_fai_entries
end
|
|
137
|
+
|
|
138
|
+
# Makes +filename+ the reference of this arm map.
#
# When the file already exists it is loaded as it is and nil is returned.
# Otherwise it is created first: every entry of the global reference whose
# id maps (through arm_selection_embl) to this map's chromosome is copied
# into it, and the new file is then loaded.
#
# The output file is opened in block form so the handle is closed even when
# reading the global reference fails.
# NOTE(review): a file left incomplete by such a failure still exists, so
# the next call will load it as if it were complete.
#
# @param filename [String] path of the arm-specific FASTA file
# @return [nil] when the file existed; otherwise the result of +reference+
def print_fasta_contigs_from_reference(filename)
  if File.exist?(filename)
    reference(filename)
    return
  end

  File.open(filename, "w") do |fasta|
    Bio::FlatFile.auto(@global_reference.fasta_path) do |ff|
      ff.each do |f|
        fasta.puts f.entry if arm_selection_embl(f.entry_id) == chromosome
      end
    end
  end
  reference(filename)
end
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# Writes the CSV row of every marker to +filename+, ordered by map_order.
#
# The markers are sorted before the file is opened, and the file is opened
# in block form: a failing comparison no longer leaves a truncated file
# behind, and the handle is closed when a marker fails to render.
#
# NOTE(review): map_order comes from a CSV split, so it is presumably a
# String and sorts lexicographically ("10" before "9") — confirm whether a
# numeric order is expected.
#
# @param filename [String] path of the CSV file to write
# @return [nil]
def print_map_with_contigs(filename)
  ordered = markers.values.sort { |x, y| x.map_order <=> y.map_order }
  File.open(filename, "w") do |file|
    ordered.each { |marker| file.puts marker.to_csv }
  end
  nil
end
|
|
168
|
+
|
|
169
|
+
protected
|
|
170
|
+
# Chromosome of a contig name: the first two characters of its third
# underscore-separated field ("1A" for "IWGSC_CSS_1AL_scaff_110").
#
# @param contig_name [String] contig / entry id
# @return [String]
def arm_selection_embl(contig_name)
  fields = contig_name.split('_')
  fields[2][0, 2]
end
|
|
174
|
+
end
|
|
175
|
+
end
|