pets 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +68 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +102 -150
- data/bin/get_gen_features.rb +146 -0
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +8 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +86 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +16 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +172 -424
- data/lib/pets/cohort.rb +309 -0
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +8 -0
- data/lib/pets/generalMethods.rb +29 -319
- data/lib/pets/genomic_features.rb +240 -0
- data/lib/pets/io.rb +481 -0
- data/lib/pets/parsers/cohort_parser.rb +111 -0
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +9 -0
- data/pets.gemspec +7 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/cohort_report.erb +5 -7
- data/templates/evidence_profile.erb +20 -4
- data/templates/patient_report.erb +1 -1
- metadata +96 -5
data/lib/pets/generalMethods.rb
CHANGED
@@ -13,80 +13,15 @@ def system_call(code_folder, script, args_string)
|
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
16
|
-
def
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
#Common methods for predictors
|
25
|
-
#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
|
26
|
-
#1. Indexing by chr (region)
|
27
|
-
def coor_overlap?(ref_start, ref_stop, start, stop)
|
28
|
-
overlap = false
|
29
|
-
if (stop > ref_start && stop <= ref_stop) ||
|
30
|
-
(start >= ref_start && start < ref_stop) ||
|
31
|
-
(start <= ref_start && stop >= ref_stop) ||
|
32
|
-
(start > ref_start && stop < ref_stop)
|
33
|
-
overlap = true
|
34
|
-
end
|
35
|
-
return overlap
|
36
|
-
end
|
37
|
-
|
38
|
-
def load_training_file4regions(training_file)
|
39
|
-
training_set = {}
|
40
|
-
posInfo = loadFile(training_file)
|
41
|
-
posInfo.each do |info|
|
42
|
-
chr = info.shift
|
43
|
-
query = training_set[chr]
|
44
|
-
if query.nil?
|
45
|
-
training_set[chr] = [info]
|
46
|
-
else
|
47
|
-
query << info
|
48
|
-
end
|
49
|
-
end
|
50
|
-
return training_set
|
51
|
-
end
|
52
|
-
|
53
|
-
#2. Indexing by hpo (code)
|
54
|
-
#prepare training file for analysis using phenotype2region prediction
|
55
|
-
def load_training_file4HPO(training_file, thresold=0)
|
56
|
-
training_set = {}
|
57
|
-
information = loadFile(training_file, thresold)
|
58
|
-
information.each do |info|
|
59
|
-
hpoCode = info.delete_at(4)
|
60
|
-
query = training_set[hpoCode]
|
61
|
-
if query.nil?
|
62
|
-
training_set[hpoCode] = [info]
|
63
|
-
else
|
64
|
-
query << info
|
65
|
-
end
|
66
|
-
end
|
67
|
-
# STDERR.puts training_set.keys.inspect
|
68
|
-
return training_set
|
69
|
-
end
|
70
|
-
|
71
|
-
|
72
|
-
#3. Load training info file:
|
73
|
-
#Chr;Start;Stop;HPO;Association;node
|
74
|
-
def loadFile(file, thresold=0)
|
75
|
-
information = []
|
76
|
-
File.open(file).each do |line|
|
77
|
-
line.chomp!
|
78
|
-
allInfo = line.split("\t")
|
79
|
-
associationValue = allInfo[4].to_f
|
80
|
-
if associationValue >= thresold
|
81
|
-
chr = allInfo[0]
|
82
|
-
startPos = allInfo[1].to_i
|
83
|
-
stopPos = allInfo[2].to_i
|
84
|
-
hpoCode = allInfo[3]
|
85
|
-
nodeID = allInfo[5]
|
86
|
-
information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
|
87
|
-
end
|
16
|
+
def add_record(hash, key, record, uniq=false)
|
17
|
+
query = hash[key]
|
18
|
+
if query.nil?
|
19
|
+
hash[key] = [record]
|
20
|
+
elsif !uniq # We not take care by repeated entries
|
21
|
+
query << record
|
22
|
+
elsif !query.include?(record) # We want uniq entries
|
23
|
+
query << record
|
88
24
|
end
|
89
|
-
return information
|
90
25
|
end
|
91
26
|
|
92
27
|
|
@@ -111,105 +46,6 @@ def compute_IC_values(patient_data, total_patients)
|
|
111
46
|
return patients_per_hpo
|
112
47
|
end
|
113
48
|
|
114
|
-
def load_hpo_ci_values(information_coefficient_file)
|
115
|
-
hpos_ci_values = {}
|
116
|
-
File.open(information_coefficient_file).each do |line|
|
117
|
-
line.chomp!
|
118
|
-
hpo_code, ci = line.split("\t")
|
119
|
-
hpos_ci_values[hpo_code.to_sym] = ci.to_f
|
120
|
-
end
|
121
|
-
return hpos_ci_values
|
122
|
-
end
|
123
|
-
|
124
|
-
def load_clustered_patients(file)
|
125
|
-
clusters = {}
|
126
|
-
File.open(file).each do |line|
|
127
|
-
line.chomp!
|
128
|
-
pat_id, cluster_id = line.split("\t")
|
129
|
-
query = clusters[cluster_id]
|
130
|
-
if query.nil?
|
131
|
-
clusters[cluster_id] = [pat_id]
|
132
|
-
else
|
133
|
-
query << pat_id
|
134
|
-
end
|
135
|
-
end
|
136
|
-
return clusters
|
137
|
-
end
|
138
|
-
|
139
|
-
def load_gene_data(gene_data_path)
|
140
|
-
gene_list = {} #geneID => attr
|
141
|
-
gene_location = {} # chr => gene
|
142
|
-
infile = open(gene_data_path)
|
143
|
-
gz = Zlib::GzipReader.new(infile)
|
144
|
-
current_chr = nil
|
145
|
-
genes = []
|
146
|
-
gz.each_line do |line|
|
147
|
-
line.chomp!
|
148
|
-
next if line =~ /^#/
|
149
|
-
fields = line.split("\t")
|
150
|
-
if fields[8].include?('genome=chromosome')
|
151
|
-
chr = fields[8].split(';')[1].split('=').last
|
152
|
-
gene_location[current_chr] = genes
|
153
|
-
genes = []
|
154
|
-
current_chr = chr
|
155
|
-
elsif fields[2] == 'gene'
|
156
|
-
attributes = {}
|
157
|
-
fields[8].split(';').each do |pair|
|
158
|
-
key, value = pair.split('=')
|
159
|
-
attributes[key] = value
|
160
|
-
end
|
161
|
-
geneName = nil
|
162
|
-
geneName = attributes['gene'] if !attributes['gene'].nil?
|
163
|
-
geneSyns = []
|
164
|
-
geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
|
165
|
-
description = attributes['description']
|
166
|
-
description = URI.unescape(description) if !description.nil?
|
167
|
-
attributes['Dbxref'] =~ /GeneID:(\d+)/
|
168
|
-
gene_list[$1] = [geneName, geneSyns, description]
|
169
|
-
genes << [$1, fields[3].to_i, fields[4].to_i]
|
170
|
-
end
|
171
|
-
end
|
172
|
-
gene_location[current_chr] = genes
|
173
|
-
return gene_list, gene_location
|
174
|
-
end
|
175
|
-
|
176
|
-
def parse_kegg_data(query_genes)
|
177
|
-
kegg_data = {} #gene => attb
|
178
|
-
while !query_genes.empty?
|
179
|
-
gene_set = query_genes.shift(10)
|
180
|
-
url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
|
181
|
-
uri = URI(url)
|
182
|
-
response = Net::HTTP.get(uri)
|
183
|
-
geneID = nil
|
184
|
-
gene_names = []
|
185
|
-
definition = nil
|
186
|
-
pathways = []
|
187
|
-
parsing_pathway_field = false
|
188
|
-
response.squeeze(' ').each_line do |line|
|
189
|
-
line.chomp!
|
190
|
-
if line =~ /^ENTRY/
|
191
|
-
geneID = line.split(' ')[1]
|
192
|
-
elsif line =~ /^NAME/
|
193
|
-
gene_names = line.split(' ', 2).last.split(', ')
|
194
|
-
elsif line =~ /^DEFINITION/
|
195
|
-
definition = line.split(' ', 2)[1]
|
196
|
-
elsif line =~ /^PATHWAY/
|
197
|
-
pathways << line.split(' ', 3)[1..2]
|
198
|
-
parsing_pathway_field = true
|
199
|
-
elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
|
200
|
-
parsing_pathway_field = false
|
201
|
-
elsif parsing_pathway_field
|
202
|
-
pathways << line.strip.split(' ', 2)
|
203
|
-
elsif line == '///'
|
204
|
-
parsing_pathway_field = false
|
205
|
-
kegg_data[geneID] = [gene_names, definition, pathways]
|
206
|
-
pathways = []
|
207
|
-
gene_names = []
|
208
|
-
end
|
209
|
-
end
|
210
|
-
end
|
211
|
-
return kegg_data
|
212
|
-
end
|
213
49
|
|
214
50
|
def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
|
215
51
|
kegg_data = {}
|
@@ -270,21 +106,6 @@ def merge_genes_with_kegg_data(gene_list, kegg_data)
|
|
270
106
|
return merged_data
|
271
107
|
end
|
272
108
|
|
273
|
-
def write_compressed_plain_file(data, path)
|
274
|
-
File.open(path, 'w') do |f|
|
275
|
-
gz = Zlib::GzipWriter.new(f)
|
276
|
-
gz.write data.to_json
|
277
|
-
gz.close
|
278
|
-
end
|
279
|
-
end
|
280
|
-
|
281
|
-
def read_compressed_json(path)
|
282
|
-
infile = open(path)
|
283
|
-
gz = Zlib::GzipReader.new(infile)
|
284
|
-
object = JSON.parse(gz.read)
|
285
|
-
return object
|
286
|
-
end
|
287
|
-
|
288
109
|
def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
|
289
110
|
pathways_genes_in_predictions = {}
|
290
111
|
genes_in_predictions = []
|
@@ -358,138 +179,8 @@ def binom(n,k)
|
|
358
179
|
end
|
359
180
|
end
|
360
181
|
|
361
|
-
def get_reference(genomic_ranges)
|
362
|
-
#genomic_ranges = [patientID, mut_start, mut_stop]
|
363
|
-
reference = []
|
364
|
-
reference.concat(genomic_ranges.map{|gr| gr[1]})# get start
|
365
|
-
reference.concat(genomic_ranges.map{|gr| gr[2]})# get stop
|
366
|
-
reference.uniq!
|
367
|
-
reference.sort!
|
368
|
-
#Define overlap range
|
369
|
-
final_reference = []
|
370
|
-
reference.each_with_index do |coord,i|
|
371
|
-
next_coord = reference[i + 1]
|
372
|
-
final_reference << [coord, next_coord] if !next_coord.nil?
|
373
|
-
end
|
374
|
-
return final_reference
|
375
|
-
end
|
376
182
|
|
377
|
-
def overlap_patients(genomic_ranges, reference)
|
378
|
-
overlaps = []
|
379
|
-
reference.each do |start, stop|
|
380
|
-
patients = []
|
381
|
-
genomic_ranges.each do |pt_id, pt_start, pt_stop|
|
382
|
-
if (start <= pt_start && stop >= pt_stop) ||
|
383
|
-
(start > pt_start && stop < pt_stop) ||
|
384
|
-
(stop > pt_start && stop <= pt_stop) ||
|
385
|
-
(start >= pt_start && start < pt_stop)
|
386
|
-
patients << pt_id
|
387
|
-
end
|
388
|
-
end
|
389
|
-
overlaps << patients.uniq
|
390
|
-
end
|
391
|
-
return overlaps
|
392
|
-
end
|
393
183
|
|
394
|
-
def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
|
395
|
-
patients_out_of_cluster = 0
|
396
|
-
patients_by_cluster = {}
|
397
|
-
sors = []
|
398
|
-
patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
|
399
|
-
reference = get_reference(genomic_ranges) # Get putative overlap regions
|
400
|
-
overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region
|
401
|
-
clust_number = 1
|
402
|
-
reference.each_with_index do |ref, i|
|
403
|
-
current_patients = overlapping_patients[i]
|
404
|
-
if current_patients.length > pat_per_reg
|
405
|
-
ref << chrm
|
406
|
-
node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
|
407
|
-
ref << node_identifier
|
408
|
-
save_sor(current_patients, node_identifier, patients_by_cluster)
|
409
|
-
sors << ref
|
410
|
-
clust_number += 1
|
411
|
-
end
|
412
|
-
end
|
413
|
-
end
|
414
|
-
return patients_by_cluster, sors
|
415
|
-
end
|
416
|
-
|
417
|
-
def save_sor(current_patients, node_identifier, patients_by_cluster)
|
418
|
-
current_patients.each do |patient|
|
419
|
-
add_record(patients_by_cluster, patient, node_identifier)
|
420
|
-
end
|
421
|
-
end
|
422
|
-
|
423
|
-
def add_record(hash, key, record)
|
424
|
-
query = hash[key]
|
425
|
-
if query.nil?
|
426
|
-
hash[key] = [record]
|
427
|
-
elsif !query.include?(record)
|
428
|
-
query << record
|
429
|
-
end
|
430
|
-
end
|
431
|
-
|
432
|
-
def load_patient_cohort(options)
|
433
|
-
patient_data = {}
|
434
|
-
count = 0
|
435
|
-
fields2extract = get_fields2extract(options)
|
436
|
-
field_numbers = fields2extract.values
|
437
|
-
File.open(options[:input_file]).each do |line|
|
438
|
-
line.chomp!
|
439
|
-
if options[:header] && count == 0
|
440
|
-
line.gsub!(/#\s*/,'') # correct comment like headers
|
441
|
-
field_names = line.split("\t")
|
442
|
-
get_field_numbers2extract(field_names, fields2extract)
|
443
|
-
field_numbers = fields2extract.values
|
444
|
-
else
|
445
|
-
fields = line.split("\t")
|
446
|
-
pat_record = field_numbers.map{|n| fields[n]}
|
447
|
-
if fields2extract[:pat_id_col].nil?
|
448
|
-
pat_id = "pat_#{count}" #generate ids
|
449
|
-
else
|
450
|
-
original_id = pat_record.shift
|
451
|
-
pat_id = original_id + "_i#{count}" # make sure that ids are uniq
|
452
|
-
end
|
453
|
-
if !pat_record[0].nil?
|
454
|
-
pat_record[0] = pat_record[0].split(options[:hpo_separator])
|
455
|
-
else
|
456
|
-
pat_record[0] = []
|
457
|
-
end
|
458
|
-
pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
|
459
|
-
pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
|
460
|
-
patient_data[pat_id] = pat_record
|
461
|
-
end
|
462
|
-
count +=1
|
463
|
-
end
|
464
|
-
options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
|
465
|
-
return patient_data
|
466
|
-
end
|
467
|
-
|
468
|
-
def get_fields2extract(options)
|
469
|
-
fields2extract = {}
|
470
|
-
[:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
|
471
|
-
col = options[field]
|
472
|
-
if !col.nil?
|
473
|
-
col = col.to_i if !options[:header]
|
474
|
-
fields2extract[field] = col
|
475
|
-
end
|
476
|
-
end
|
477
|
-
return fields2extract
|
478
|
-
end
|
479
|
-
|
480
|
-
def get_field_numbers2extract(field_names, fields2extract)
|
481
|
-
fields2extract.each do |field, name|
|
482
|
-
fields2extract[field] = field_names.index(name)
|
483
|
-
end
|
484
|
-
end
|
485
|
-
|
486
|
-
def download(ftp_server, path, name)
|
487
|
-
ftp = Net::FTP.new()
|
488
|
-
ftp.connect(ftp_server)
|
489
|
-
ftp.login
|
490
|
-
ftp.getbinaryfile(path, name)
|
491
|
-
ftp.close
|
492
|
-
end
|
493
184
|
|
494
185
|
def get_and_parse_external_data(all_paths)
|
495
186
|
sources = [
|
@@ -552,8 +243,27 @@ def get_detailed_similarity(profile, candidates, evidences, hpo)
|
|
552
243
|
return matrix
|
553
244
|
end
|
554
245
|
|
555
|
-
def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit)
|
556
|
-
candidates = similarities.to_a
|
246
|
+
def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {})
|
247
|
+
candidates = similarities.to_a
|
248
|
+
if other_scores.empty?
|
249
|
+
candidates.sort!{|s1, s2| s2.last <=> s1.last}
|
250
|
+
candidates = candidates.first(candidate_limit)
|
251
|
+
else # Prioritize first by the external list of scores, select the candidates and then rioritize by similarities
|
252
|
+
selected_candidates = []
|
253
|
+
candidates.each do |cand|
|
254
|
+
cand_id = cand[0]
|
255
|
+
cand_lab = id2label[cand_id.to_s]
|
256
|
+
next if cand_lab.nil?
|
257
|
+
other_score = other_scores[cand_lab]
|
258
|
+
next if other_score.nil?
|
259
|
+
cand << other_score
|
260
|
+
selected_candidates << cand
|
261
|
+
end
|
262
|
+
selected_candidates.sort!{|e1, e2| e2[2] <=> e1[2]}
|
263
|
+
candidates = selected_candidates.first(candidate_limit)
|
264
|
+
candidates.sort!{|e1, e2| e2[1] <=> e1[1]}
|
265
|
+
candidates.each{|c| c.pop}
|
266
|
+
end
|
557
267
|
candidates_ids = candidates.map{|c| c.first}
|
558
268
|
candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
|
559
269
|
candidate_similarity_matrix.each_with_index do |row, i|
|
@@ -0,0 +1,240 @@
|
|
1
|
+
class Genomic_Feature
|
2
|
+
@@ref = nil
|
3
|
+
|
4
|
+
def self.array2genomic_feature(arr)
|
5
|
+
new(arr.map{|r| yield(r)})
|
6
|
+
end
|
7
|
+
|
8
|
+
def self.hash2genomic_feature(h)
|
9
|
+
vars = []
|
10
|
+
h.each do |h, v|
|
11
|
+
vars << yield(h, v)
|
12
|
+
end
|
13
|
+
new(vars)
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.add_reference(genomic_regions)
|
17
|
+
@@ref = genomic_regions
|
18
|
+
end
|
19
|
+
|
20
|
+
#If any method use gen_fet as name is a Genomic_Feature object
|
21
|
+
def initialize(feat_array, annotations: nil) # [[chr1, start1, stop1],[chr1, start1, stop1]]
|
22
|
+
@regions = {}
|
23
|
+
@reg_by_to = {}
|
24
|
+
@reg_id = -1
|
25
|
+
load_features(feat_array)
|
26
|
+
load_annotations(annotations) if !annotations.nil?
|
27
|
+
end
|
28
|
+
|
29
|
+
def load_features(feat_array)
|
30
|
+
feat_array.each do |chr, start, stop, to|
|
31
|
+
chr = chr.to_sym
|
32
|
+
@reg_id +=1
|
33
|
+
id = to.nil? ? @reg_id : to
|
34
|
+
region = {chr: chr, start: start, stop: stop, to: id }
|
35
|
+
@reg_by_to[id] = region
|
36
|
+
add_record(@regions, chr, region)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def load_annotations(annotations)
|
41
|
+
each do |chr, reg|
|
42
|
+
annot = annotations[reg[:to]]
|
43
|
+
reg[:attrs] = annot if !annot.nil?
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def length
|
48
|
+
return @regions.length
|
49
|
+
end
|
50
|
+
|
51
|
+
def each_chr()
|
52
|
+
@regions.each do |chr, regs|
|
53
|
+
yield(chr, regs)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def each()
|
58
|
+
@regions.each do |chr, regs|
|
59
|
+
regs.each do |region|
|
60
|
+
yield(chr, region)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def get_chr
|
66
|
+
return @regions.keys
|
67
|
+
end
|
68
|
+
|
69
|
+
def get_chr_regs(chr)
|
70
|
+
return @regions[chr]
|
71
|
+
end
|
72
|
+
|
73
|
+
def region_by_to(to)
|
74
|
+
return @reg_by_to[to]
|
75
|
+
end
|
76
|
+
|
77
|
+
def get_sizes
|
78
|
+
sizes = []
|
79
|
+
each do |chr, region|
|
80
|
+
size = region[:stop] - region[:start] + 1
|
81
|
+
sizes << size
|
82
|
+
end
|
83
|
+
return sizes
|
84
|
+
end
|
85
|
+
|
86
|
+
def get_features(attr_type: nil)
|
87
|
+
features = match(@@ref)
|
88
|
+
if !attr_type.nil?
|
89
|
+
features.each do |reg_id, feat_ids|
|
90
|
+
new_feat_ids = feat_ids.map{|fi| @@ref.region_by_to(fi).dig(:attrs, attr_type)}
|
91
|
+
features[reg_id] = new_feat_ids.compact.uniq
|
92
|
+
end
|
93
|
+
end
|
94
|
+
return features
|
95
|
+
end
|
96
|
+
|
97
|
+
def match(other_gen_feat)
|
98
|
+
all_matches = {}
|
99
|
+
each_chr do |chr, regs|
|
100
|
+
other_regs = other_gen_feat.get_chr_regs(chr)
|
101
|
+
next if other_regs.nil?
|
102
|
+
regs.each do |reg|
|
103
|
+
local_matches = []
|
104
|
+
start = reg[:start]
|
105
|
+
stop = reg[:stop]
|
106
|
+
other_regs.each do |other_reg|
|
107
|
+
local_matches << other_reg[:to] if coor_overlap?(start, stop, other_reg)
|
108
|
+
end
|
109
|
+
all_matches[reg[:to]] = local_matches if !local_matches.empty?
|
110
|
+
end
|
111
|
+
end
|
112
|
+
return all_matches
|
113
|
+
end
|
114
|
+
|
115
|
+
def get_summary_sizes
|
116
|
+
sizes = Hash.new(0)
|
117
|
+
each do |chr, region|
|
118
|
+
size = region[:stop] - region[:start] + 1
|
119
|
+
sizes[size] += 1
|
120
|
+
end
|
121
|
+
return sizes.to_a.sort!{|s| s[1] <=> s[1] }
|
122
|
+
end
|
123
|
+
|
124
|
+
def merge(gen_fet, to = nil) # 'to' the regions must be connected "to" given id
|
125
|
+
gen_fet.each do |chr, region|
|
126
|
+
to.nil? ? region[:to] = @reg_id +=1 : region[:to] = to # handle id custom or default
|
127
|
+
add_record(@regions, chr, region)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
def get_reference_overlaps(genomic_ranges, reference)
|
132
|
+
overlaps = []
|
133
|
+
reference.each do |start, stop|
|
134
|
+
reg_ids = []
|
135
|
+
genomic_ranges.each do |reg|
|
136
|
+
overlap = coor_overlap?(start, stop, reg)
|
137
|
+
reg_ids << reg[:to] if overlap
|
138
|
+
end
|
139
|
+
overlaps << reg_ids.uniq
|
140
|
+
end
|
141
|
+
return overlaps
|
142
|
+
end
|
143
|
+
|
144
|
+
def generate_cluster_regions(meth, tag, ids_per_reg = 1, obj = false)
|
145
|
+
compute_windows(meth) # Get putative genome windows
|
146
|
+
ids_by_cluster = {}
|
147
|
+
annotated_full_ref = [] # All reference windows wit uniq id and chr tagged
|
148
|
+
@regions.each do |chr, regs|
|
149
|
+
reference = @windows[chr]
|
150
|
+
overlaps = get_reference_overlaps(regs, reference)
|
151
|
+
clust_numb = 0
|
152
|
+
reference.each_with_index do |ref, i|
|
153
|
+
current_ids = overlaps[i]
|
154
|
+
if current_ids.length > ids_per_reg
|
155
|
+
clust_id = "#{chr}.#{clust_numb +=1}.#{tag}.#{current_ids.length}"
|
156
|
+
current_ids.each do |curr_id|
|
157
|
+
add_record(ids_by_cluster, curr_id, clust_id, true)
|
158
|
+
end
|
159
|
+
annotated_full_ref << ref.dup.concat([chr, clust_id])
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
163
|
+
annotated_full_ref = Genomic_Feature.array2genomic_feature(annotated_full_ref){|r| [r[2], r[0], r[1], r[3]]} if obj
|
164
|
+
return ids_by_cluster, annotated_full_ref
|
165
|
+
end
|
166
|
+
|
167
|
+
def compute_windows(meth)
|
168
|
+
@windows = {}
|
169
|
+
@regions.each do |chr, regs|
|
170
|
+
chr_windows = nil
|
171
|
+
if meth == :reg_overlap
|
172
|
+
chr_windows = compute_region_overlap_windows(regs)
|
173
|
+
end
|
174
|
+
@windows[chr] = chr_windows
|
175
|
+
end
|
176
|
+
end
|
177
|
+
|
178
|
+
private
|
179
|
+
|
180
|
+
def add_record(hash, key, record, uniq=false)
|
181
|
+
query = hash[key]
|
182
|
+
if query.nil?
|
183
|
+
hash[key] = [record]
|
184
|
+
elsif !uniq # We not take care by repeated entries
|
185
|
+
query << record
|
186
|
+
elsif !query.include?(record) # We want uniq entries
|
187
|
+
query << record
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
def compute_region_overlap_windows(genomic_ranges)
|
192
|
+
reference = []
|
193
|
+
single_nt = []
|
194
|
+
genomic_ranges.each do |gr|
|
195
|
+
start = gr[:start]
|
196
|
+
stop = gr[:stop]
|
197
|
+
if stop - start > 0
|
198
|
+
reference << start # get start
|
199
|
+
reference << stop # get stop
|
200
|
+
else # Build a window of at least one nt for snv
|
201
|
+
single_nt << start
|
202
|
+
end
|
203
|
+
end
|
204
|
+
reference.uniq!
|
205
|
+
single_nt.each do |snt| # add start stop for snv
|
206
|
+
reference << snt
|
207
|
+
reference << snt
|
208
|
+
end
|
209
|
+
reference.sort!
|
210
|
+
#Define overlap ranges
|
211
|
+
final_reference = []
|
212
|
+
last_len = 1
|
213
|
+
reference.each_with_index do |coord,i|
|
214
|
+
next_coord = reference[i + 1]
|
215
|
+
if !next_coord.nil?
|
216
|
+
current_len = next_coord - coord
|
217
|
+
coord = coord + 1 if last_len == 0 # Separate SNV window from others
|
218
|
+
if current_len == 0 && last_len > 0 && !final_reference.empty?
|
219
|
+
final_reference.last[1] -= 1 # Separate SNV window from others
|
220
|
+
end
|
221
|
+
final_reference << [coord, next_coord]
|
222
|
+
last_len = current_len
|
223
|
+
end
|
224
|
+
end
|
225
|
+
return final_reference
|
226
|
+
end
|
227
|
+
|
228
|
+
def coor_overlap?(start, stop, reg)
|
229
|
+
overlap = false
|
230
|
+
reg_start = reg[:start]
|
231
|
+
reg_stop = reg[:stop]
|
232
|
+
if (start <= reg_start && stop >= reg_stop) ||
|
233
|
+
(start > reg_start && stop < reg_stop) ||
|
234
|
+
(stop > reg_start && stop <= reg_stop) ||
|
235
|
+
(start >= reg_start && start < reg_stop)
|
236
|
+
overlap = true
|
237
|
+
end
|
238
|
+
return overlap
|
239
|
+
end
|
240
|
+
end
|