pets 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
data/lib/pets.rb ADDED
@@ -0,0 +1,6 @@
+ require "pets/version"
+
+ module Pets
+   class Error < StandardError; end
+   # Your code goes here...
+ end
data/lib/pets/coPatReporterMethods.rb ADDED
@@ -0,0 +1,77 @@
+ def process_patient_data(patient_data)
+   parsed_patient_data = {}
+   patient_data.each do |patientID, metadata|
+     phenotypes, chr, start, stop = metadata
+     info = [patientID, start.to_i, stop.to_i]
+     query = parsed_patient_data[chr]
+     if query.nil?
+       parsed_patient_data[chr] = [info]
+     else
+       query << info
+     end
+   end
+   return parsed_patient_data
+ end
+
+ def get_final_coverage(raw_coverage, bin_size)
+   coverage_to_plot = []
+   raw_coverage.each do |chr, coverages|
+     coverages.each do |start, stop, coverage|
+       bin_start = start - start % bin_size
+       bin_stop = stop - stop % bin_size
+       while bin_start < bin_stop
+         coverage_to_plot << [chr, bin_start, coverage]
+         bin_start += bin_size
+       end
+     end
+   end
+   return coverage_to_plot
+ end
+
+ def get_sor_length_distribution(raw_coverage)
+   all_cnvs_length = []
+   cnvs_count = []
+   raw_coverage.each do |chr, coords_info|
+     coords_info.each do |start, stop, pat_records|
+       region_length = stop - start + 1
+       all_cnvs_length << [region_length, pat_records]
+     end
+   end
+   all_cnvs_length.sort!{|c1, c2| c1[1] <=> c2[1]}
+   return all_cnvs_length
+ end
+
+ def get_cnvs_length(patient_data)
+   length_stats = Hash.new(0)
+   patient_data.each do |pat_id, patient_record|
+     string_hpos, chr, start, stop = patient_record
+     length_stats[stop - start] += 1
+   end
+   return length_stats.to_a.sort{|s1, s2| s1[1] <=> s2[1]}
+ end
+
+
+ def calculate_coverage(regions_data, delete_thresold = 0)
+   raw_coverage = {}
+   n_regions = 0
+   patients = 0
+   nt = 0
+   regions_data.each do |start, stop, chr, node|
+     number_of_patients = node.split('.').last.to_i
+     if number_of_patients <= delete_thresold
+       number_of_patients = 0
+     else
+       n_regions += 1
+       patients += number_of_patients
+       nt += stop - start
+     end
+     coords = [start, stop, number_of_patients]
+     query = raw_coverage[chr]
+     if query.nil?
+       raw_coverage[chr] = [coords]
+     else
+       query << coords
+     end
+   end
+   return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
+ end
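
The helpers above are presumably consumed by bin/coPatReporter.rb. As a rough editorial sketch (not part of the gem), they chain together like this; the sample patients and the 50 kb bin size are invented, and the region rows reuse the [start, stop, chr, "chr.cluster.type.patient_count"] shape produced by generate_cluster_regions in generalMethods.rb:

    patient_data = {
      'pat1_i0' => [['HP:0001250'], '9', '131350000', '131375954'],
      'pat2_i1' => [['HP:0010974'], '9', '131372000', '131460000']
    }
    by_chr = process_patient_data(patient_data)
    # => { '9' => [['pat1_i0', 131350000, 131375954], ['pat2_i1', 131372000, 131460000]] }
    regions = [[131350000, 131460000, '9', '9.1.A.2']]   # hypothetical shared region covering 2 patients
    raw_coverage, n_regions, nt, mean_patients = calculate_coverage(regions)
    get_final_coverage(raw_coverage, 50_000)
    # => [['9', 131350000, 2], ['9', 131400000, 2]]      # per-bin patient counts
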
data/lib/pets/generalMethods.rb ADDED
@@ -0,0 +1,556 @@
+ require 'uri'
+ #Common methods for predictors
+ #Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
+ #1. Indexing by chr (region)
+
+ def load_training_file4regions(training_file)
+   training_set = {}
+   posInfo = loadFile(training_file)
+   posInfo.each do |info|
+     chr = info.shift
+     query = training_set[chr]
+     if query.nil?
+       training_set[chr] = [info]
+     else
+       query << info
+     end
+   end
+   return training_set
+ end
+
+ #2. Indexing by hpo (code)
+ #prepare training file for analysis using phenotype2region prediction
+ def load_training_file4HPO(training_file, thresold=0)
+   training_set = {}
+   information = loadFile(training_file, thresold)
+   information.each do |info|
+     hpoCode = info.delete_at(4)
+     query = training_set[hpoCode]
+     if query.nil?
+       training_set[hpoCode] = [info]
+     else
+       query << info
+     end
+   end
+   # STDERR.puts training_set.keys.inspect
+   return training_set
+ end
+
+
+ #3. Load training info file:
+ #Chr;Start;Stop;HPO;Association;node
+ def loadFile(file, thresold=0)
+   information = []
+   File.open(file).each do |line|
+     line.chomp!
+     allInfo = line.split("\t")
+     associationValue = allInfo[4].to_f
+     if associationValue >= thresold
+       chr = allInfo[0]
+       startPos = allInfo[1].to_i
+       stopPos = allInfo[2].to_i
+       hpoCode = allInfo[3]
+       nodeID = allInfo[5]
+       information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
+     end
+   end
+   return information
+ end
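
For reference, loadFile expects the tab-separated column order documented above (Chr, Start, Stop, HPO, Association, node). A hypothetical round trip (editorial sketch, not part of the gem):

    require 'tempfile'
    row = ['9', '131371492', '131375954', 'HP:0010974', '2.41161970596', '9.3.A.5']
    tmp = Tempfile.new('training'); tmp.puts(row.join("\t")); tmp.close
    load_training_file4regions(tmp.path)
    # => { '9' => [[131371492, 131375954, '9.3.A.5', 'HP:0010974', 2.41161970596]] }
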
+
+ def add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
+   if !hpo_black_list.include?(id)
+     attributes = [id, name, is_a - hpo_black_list, syn]
+     hpo_storage[id] = attributes
+     alt_ids.each do |altid|
+       hpo_storage[altid] = attributes
+     end
+   end
+ end
+
+ def load_hpo_file(hpo_file, hpo_black_list=[])
+   hpo_storage = {}
+   id = nil
+   name = nil
+   alt_ids = []
+   syn = []
+   is_a = []
+   File.open(hpo_file).each do |line|
+     line.chomp!
+     tag, info = line.split(': ')
+     if tag == 'id' || tag == 'name' || tag == 'is_a' || tag == 'synonym' || tag == 'alt_id'
+       if tag == 'id'
+         add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list) if !name.nil?
+         id = info
+         name = nil
+         alt_ids = []
+         syn = []
+         is_a = []
+       end
+       if tag == 'alt_id'
+         alt_ids << info
+       elsif tag == 'is_a'
+         is_a << info.split(' ! ')[0]
+       elsif tag == 'synonym'
+         syn << info.split('"')[1] #to keep only the name of the synonym
+       else
+         name = info
+       end
+     end
+   end
+   add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
+   # STDERR.puts hpo_storage.inspect
+   # Process.exit
+   return hpo_storage
+ end
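
To illustrate the parser above (editorial sketch; the stanza is an abridged, hypothetical hp.obo excerpt), a term and each of its alt_ids end up sharing one attribute array in hpo_storage:

    obo_stanza = <<~OBO
      [Term]
      id: HP:0001250
      name: Seizure
      alt_id: HP:0002279
      synonym: "Seizures" EXACT []
      is_a: HP:0012638 ! Abnormal nervous system physiology
    OBO
    File.write('mini_hp.obo', obo_stanza)
    load_hpo_file('mini_hp.obo')
    # => { 'HP:0001250' => ['HP:0001250', 'Seizure', ['HP:0012638'], ['Seizures']],
    #      'HP:0002279' => ['HP:0001250', 'Seizure', ['HP:0012638'], ['Seizures']] }
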
+
+ def load_hpo_black_list(excluded_hpo_file)
+   excluded_hpos = []
+   File.open(excluded_hpo_file).each do |line|
+     line.chomp!
+     excluded_hpos << line
+   end
+   return excluded_hpos
+ end
+
+ def create_hpo_dictionary(hpo_storage)
+   hpo_dictionary = {}
+   hpo_storage.each do |hpo, metadata|
+     hpo_code, hpo_name, hpo_parents, hpo_synonyms = metadata
+     hpo_dictionary[hpo_name] = hpo_code
+     hpo_synonyms.each do |syn|
+       hpo_dictionary[syn] = hpo_code
+     end
+   end
+   return hpo_dictionary
+ end
+
+ def get_child_parent_relations(hpo_storage)
+   # for getting hpo childs
+   storage_child = {}
+   hpo_storage.each do |hpo_code, hpo_data|
+     id, name, is_a, syn = hpo_data
+     hpo_child = [id, name]
+     is_a.each do |par_hpo_code|
+       query = storage_child[par_hpo_code]
+       if query.nil?
+         storage_child[par_hpo_code] = [hpo_child]
+       else
+         query << hpo_child
+       end
+     end
+   end
+   return storage_child
+ end
+
+ def compute_IC_values(patient_data, total_patients)
+   patients_per_hpo = Hash.new(0)
+   last_patient_ID = ''
+   patient_data.each do |patient_ID, metadata|
+     patient, count = patient_ID.split('_i')
+     if patient != last_patient_ID
+       hpos, chr, start, stop = metadata
+       hpos.each do |h|
+         patients_per_hpo[h] += 1
+       end
+     end
+     last_patient_ID = patient
+   end
+   # STDERR.puts patients_per_hpo.inspect
+   # Process.exit
+   patients_per_hpo.each do |hpo, patient_number|
+     patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
+   end
+   return patients_per_hpo
+ end
+
+ # def get_child_parent_relations(hpo_storage)
+ # # for getting hpo childs
+ # storage_child = {}
+ # hpo_storage.each do |hpo_code, hpo_data|
+ # STDERR.puts hpo_data[3].inspect
+ # Process.exit
+ # main_code, hpo_name, synonyms, parents = hpo_data
+ # parents.each do |par_hpo_code, par_hpo_name|
+ # query = storage_child[par_hpo_code]
+ # hpo_child = [main_code, hpo_name]
+ # if query.nil?
+ # storage_child[par_hpo_code] = [par_hpo_name, [hpo_child]]
+ # else
+ # query.last << hpo_child
+ # end
+ # end
+ # end
+
+ # return storage_child
+ # end
+
+
+ def load_hpo_ci_values(information_coefficient_file)
+   hpos_ci_values = {}
+   File.open(information_coefficient_file).each do |line|
+     line.chomp!
+     hpo_code, ci = line.split("\t")
+     hpos_ci_values[hpo_code] = ci.to_f
+   end
+   return hpos_ci_values
+ end
+
+ def load_clustered_patients(file)
+   clusters = {}
+   File.open(file).each do |line|
+     line.chomp!
+     pat_id, cluster_id = line.split("\t")
+     query = clusters[cluster_id]
+     if query.nil?
+       clusters[cluster_id] = [pat_id]
+     else
+       query << pat_id
+     end
+   end
+   return clusters
+ end
+
+ def load_gene_data(gene_data_path)
+   gene_list = {} #geneID => attr
+   gene_location = {} # chr => gene
+   infile = open(gene_data_path)
+   gz = Zlib::GzipReader.new(infile)
+   current_chr = nil
+   genes = []
+   gz.each_line do |line|
+     line.chomp!
+     next if line =~ /^#/
+     fields = line.split("\t")
+     if fields[8].include?('genome=chromosome')
+       chr = fields[8].split(';')[1].split('=').last
+       gene_location[current_chr] = genes
+       genes = []
+       current_chr = chr
+     elsif fields[2] == 'gene'
+       attributes = {}
+       fields[8].split(';').each do |pair|
+         key, value = pair.split('=')
+         attributes[key] = value
+       end
+       geneNames = []
+       geneNames << attributes['gene'] if !attributes['gene'].nil?
+       geneNames.concat(attributes['gene_synonym'].split(',')) if !attributes['gene_synonym'].nil?
+       description = attributes['description']
+       description = URI.unescape(description) if !description.nil?
+       attributes['Dbxref'] =~ /GeneID:(\d+)/
+       gene_list[$1] = [geneNames, description]
+       genes << [$1, fields[3].to_i, fields[4].to_i]
+     end
+   end
+   gene_location[current_chr] = genes
+   return gene_list, gene_location
+ end
+
+ def parse_kegg_data(query_genes)
+   kegg_data = {} #gene => attb
+   while !query_genes.empty?
+     gene_set = query_genes.shift(10)
+     url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
+     uri = URI(url)
+     response = Net::HTTP.get(uri)
+     geneID = nil
+     gene_names = []
+     definition = nil
+     pathways = []
+     parsing_pathway_field = false
+     response.squeeze(' ').each_line do |line|
+       line.chomp!
+       if line =~ /^ENTRY/
+         geneID = line.split(' ')[1]
+       elsif line =~ /^NAME/
+         gene_names = line.split(' ', 2).last.split(', ')
+       elsif line =~ /^DEFINITION/
+         definition = line.split(' ', 2)[1]
+       elsif line =~ /^PATHWAY/
+         pathways << line.split(' ', 3)[1..2]
+         parsing_pathway_field = true
+       elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
+         parsing_pathway_field = false
+       elsif parsing_pathway_field
+         pathways << line.strip.split(' ', 2)
+       elsif line == '///'
+         parsing_pathway_field = false
+         kegg_data[geneID] = [gene_names, definition, pathways]
+         pathways = []
+         gene_names = []
+       end
+     end
+   end
+   return kegg_data
+ end
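
parse_kegg_data queries the KEGG REST API in batches of ten genes (so it needs network access, and Net::HTTP must be available to the caller) and folds each flat-text entry into geneID => [names, definition, pathways]. A rough editorial sketch of the mapping, using an abridged, hypothetical entry:

    # ENTRY       7157    CDS    T01001
    # NAME        TP53, BCC7, LFS1
    # DEFINITION  (RefSeq) tumor protein p53
    # PATHWAY     hsa04110  Cell cycle
    #             hsa04115  p53 signaling pathway
    # ///
    #
    # parse_kegg_data(['7157']) would then return something like:
    # { '7157' => [['TP53', 'BCC7', 'LFS1'],
    #              '(RefSeq) tumor protein p53',
    #              [['hsa04110', 'Cell cycle'], ['hsa04115', 'p53 signaling pathway']]] }
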
+
+ def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
+   kegg_data = {}
+   gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
+   keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG')
+   keggAttributes.select!{|kegg_id, data| data.first =~ /^hsa/}
+
+   gene2biosystems.each do |geneID, pathways|
+     kegg_pathways = []
+     pathways.each do |biosystem|
+       kAttrib = keggAttributes[biosystem]
+       kegg_pathways << kAttrib if !kAttrib.nil?
+     end
+     kegg_data[geneID] = kegg_pathways
+   end
+   return kegg_data
+ end
+
+ def loadBiosistemsInfo(biosystems_info_path, filterDB)
+   bsid2attributes = {}
+   infile = open(biosystems_info_path)
+   gz = Zlib::GzipReader.new(infile)
+   gz.each_line do |line|
+     line.chomp!
+     #STDERR.puts line.inspect
+     fields = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').split("\t")
+     bsid = fields.shift
+     bsid2attributes[bsid] = [fields[1], fields[2]] if filterDB == fields[0]
+   end
+   return bsid2attributes
+ end
+
+ def load_biosystem2gene_dictionary(biosystems_gene_path)
+   gene2kegg = {}
+   infile = open(biosystems_gene_path)
+   gz = Zlib::GzipReader.new(infile)
+   gz.each_line do |line|
+     line.chomp!
+     biosystem, gene, score = line.split("\t")
+     query = gene2kegg[gene]
+     if query.nil?
+       gene2kegg[gene] = [biosystem]
+     else
+       query << biosystem
+     end
+   end
+   return gene2kegg
+ end
+
+ def merge_genes_with_kegg_data(gene_list, kegg_data)
+   merged_data = {}
+   gene_list.each do |geneID, attributes|
+     query = kegg_data[geneID]
+     if query.nil?
+       attributes << []
+     else
+       attributes << query
+     end
+     merged_data[geneID] = attributes
+   end
+   return merged_data
+ end
+
+ def write_compressed_plain_file(data, path)
+   File.open(path, 'w') do |f|
+     gz = Zlib::GzipWriter.new(f)
+     gz.write data.to_json
+     gz.close
+   end
+ end
+
+ def read_compressed_json(path)
+   infile = open(path)
+   gz = Zlib::GzipReader.new(infile)
+   object = JSON.parse(gz.read)
+   return object
+ end
+
+ def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
+   pathways_genes_in_predictions = {}
+   genes_in_predictions = []
+   genes_clusters.each do |cluster|
+     cluster.each do |geneID, data|
+       geneNames, description, pathways = data
+       pathways.each do |pathway|
+         query = pathways_genes_in_predictions[pathway]
+         if query.nil?
+           pathways_genes_in_predictions[pathway] = [geneID]
+         else
+           query << geneID if !query.include?(geneID)
+         end
+       end
+       genes_in_predictions << geneID if !genes_in_predictions.include?(geneID)
+     end
+   end
+   genes_out_of_predictions = genes_with_kegg.keys - genes_in_predictions
+   gene_number = genes_with_kegg.length
+   stats = []
+   pathways_genes_in_predictions.each do |pathway, pathway_predicted_genes|
+     pathway_id, pathway_name = pathway
+     no_pathway_predicted_genes = genes_in_predictions - pathway_predicted_genes
+     pathway_no_predicted_genes_count = 0
+     no_pathway_no_predicted_genes_count = 0
+     genes_out_of_predictions.each do |geneID|
+       query = genes_with_kegg[geneID]
+       if query[2].map{|pathway_info| pathway_info.first}.include?(pathway_id)
+         pathway_no_predicted_genes_count += 1
+       else
+         no_pathway_no_predicted_genes_count += 1
+       end
+     end
+     #Fisher => http://www.biostathandbook.com/fishers.html
+     no_pathway_predicted_genes_count = no_pathway_predicted_genes.length
+     pathway_predicted_genes_count = pathway_predicted_genes.length
+     accumulated_prob = 0
+     pathway_no_predicted_genes_count.times do |n|
+       no_pathway_predicted_genes_count_shifted = no_pathway_predicted_genes_count - n
+       pathway_predicted_genes_count_shifted = pathway_predicted_genes_count - n
+       if no_pathway_predicted_genes_count_shifted >= 0 && pathway_predicted_genes_count_shifted >= 0
+         accumulated_prob += compute_hyper_prob(
+           n,
+           no_pathway_predicted_genes_count_shifted,
+           pathway_predicted_genes_count_shifted,
+           no_pathway_no_predicted_genes_count + n,
+           gene_number
+         )
+       else
+         break
+       end
+     end
+     contigency = [pathway_no_predicted_genes_count, no_pathway_predicted_genes_count, pathway_predicted_genes_count, no_pathway_no_predicted_genes_count]
+     stats << [pathway, pathway_predicted_genes, contigency, accumulated_prob]
+   end
+   return stats
+ end
+
+ def compute_hyper_prob(a, b, c, d, n)
+   binomA = binom(a + b, a)
+   binomC = binom(c + d, c)
+   divisor = binom(n, a + c)
+   return (binomA * binomC).fdiv(divisor)
+ end
+
+ def binom(n,k)
+   if k > 0 && k < n
+     res = (1+n-k..n).inject(:*)/(1..k).inject(:*)
+   else
+     res = 1
+   end
+ end
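
compute_hyper_prob returns the hypergeometric probability of a single 2x2 contingency table [[a, b], [c, d]] with n = a + b + c + d, i.e. binom(a+b, a) * binom(c+d, c) / binom(n, a+c); compute_pathway_enrichment sums these probabilities over a series of shifted tables, following the Fisher's exact test reference cited in the comment above. A quick editorial check with made-up counts:

    compute_hyper_prob(1, 3, 2, 4, 10)
    # = binom(4, 1) * binom(6, 2) / binom(10, 3)
    # = (4 * 15) / 120.0
    # => 0.5
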
+
+ def get_reference(genomic_ranges)
+   #genomic_ranges = [patientID, mut_start, mut_stop]
+   reference = []
+   reference.concat(genomic_ranges.map{|gr| gr[1]})# get start
+   reference.concat(genomic_ranges.map{|gr| gr[2]})# get stop
+   reference.uniq!
+   reference.sort!
+   #Define overlap range
+   final_reference = []
+   reference.each_with_index do |coord,i|
+     next_coord = reference[i + 1]
+     final_reference << [coord, next_coord] if !next_coord.nil?
+   end
+   return final_reference
+ end
+
+ def overlap_patients(genomic_ranges, reference)
+   overlaps = []
+   reference.each do |start, stop|
+     patients = []
+     genomic_ranges.each do |pt_id, pt_start, pt_stop|
+       if (start <= pt_start && stop >= pt_stop) ||
+          (start > pt_start && stop < pt_stop) ||
+          (stop > pt_start && stop <= pt_stop) ||
+          (start >= pt_start && start < pt_stop)
+         patients << pt_id
+       end
+     end
+     overlaps << patients.uniq
+   end
+   return overlaps
+ end
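
get_reference cuts every patient start/stop on a chromosome into consecutive candidate windows, and overlap_patients lists which patients touch each window; generate_cluster_regions (next) keeps the windows shared by more than pat_per_reg patients. A tiny editorial example with made-up coordinates:

    ranges = [['patA', 10, 30], ['patB', 20, 40]]
    reference = get_reference(ranges)      # => [[10, 20], [20, 30], [30, 40]]
    overlap_patients(ranges, reference)    # => [['patA'], ['patA', 'patB'], ['patB']]
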
+
+ def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
+   patients_out_of_cluster = 0
+   patients_by_cluster = {}
+   sors = []
+   patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
+     reference = get_reference(genomic_ranges) # Get putative overlap regions
+     overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region
+     clust_number = 1
+     reference.each_with_index do |ref, i|
+       current_patients = overlapping_patients[i]
+       if current_patients.length > pat_per_reg
+         ref << chrm
+         node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
+         ref << node_identifier
+         save_sor(current_patients, node_identifier, patients_by_cluster)
+         sors << ref
+         clust_number += 1
+       end
+     end
+   end
+   return patients_by_cluster, sors
+ end
+
+ def save_sor(current_patients, node_identifier, patients_by_cluster)
+   current_patients.each do |patient|
+     add_record(patients_by_cluster, patient, node_identifier)
+   end
+ end
+
+ def add_record(hash, key, record)
+   query = hash[key]
+   if query.nil?
+     hash[key] = [record]
+   elsif !query.include?(record)
+     query << record
+   end
+ end
+
+ def load_patient_cohort(options)
+   patient_data = {}
+   count = 0
+   fields2extract = get_fields2extract(options)
+   field_numbers = fields2extract.values
+   original_ids = []
+   File.open(options[:input_file]).each do |line|
+     line.chomp!
+     if options[:header] && count == 0
+       line.gsub!(/#\s*/,'') # correct comment like headers
+       field_names = line.split("\t")
+       get_field_numbers2extract(field_names, fields2extract)
+       field_numbers = fields2extract.values
+     else
+       fields = line.split("\t")
+       pat_record = field_numbers.map{|n| fields[n]}
+       if fields2extract[:pat_id_col].nil?
+         pat_id = "pat_#{count}" #generate ids
+       else
+         original_id = pat_record.shift
+         original_ids << original_id
+         pat_id = original_id + "_i#{count}" # make sure that ids are uniq
+       end
+       patient_data[pat_id] = pat_record
+     end
+     count += 1
+   end
+   patient_number = fields2extract[:pat_id_col].nil? ? count : original_ids.uniq.length
+   options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
+   return patient_data, patient_number
+ end
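
load_patient_cohort is driven by an options hash: :input_file points to a tab-separated cohort table, and the *_col entries are column names when :header is true or 0-based column indices otherwise (see get_fields2extract below). A hypothetical invocation (editorial sketch; the file name and column names are invented):

    options = {
      input_file: 'cohort.tsv', header: true,
      pat_id_col: 'patient', hpo_col: 'phenotypes',
      chromosome_col: 'chr', start_col: 'start', end_col: 'stop'
    }
    patient_data, patient_number = load_patient_cohort(options)
    # keys look like "<original id>_i<line number>", so repeated patient ids stay unique
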
+
+ def get_fields2extract(options)
+   fields2extract = {}
+   [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
+     col = options[field]
+     if !col.nil?
+       col = col.to_i if !options[:header]
+       fields2extract[field] = col
+     end
+   end
+   return fields2extract
+ end
+
+ def get_field_numbers2extract(field_names, fields2extract)
+   fields2extract.each do |field, name|
+     fields2extract[field] = field_names.index(name)
+   end
+ end