pets 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49):
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
File without changes
data/lib/pets.rb ADDED
@@ -0,0 +1,6 @@
1
+ require "pets/version"
2
+
3
# Top-level namespace for the pets gem.
module Pets
  # Raised for gem-specific failures; rescue this to catch any Pets error.
  class Error < StandardError
  end
end
@@ -0,0 +1,77 @@
1
# Groups patient mutation coordinates by chromosome.
# patient_data: patientID => [phenotypes, chr, start, stop]
# Returns chr => [[patientID, start, stop], ...] with coordinates coerced to Integer.
def process_patient_data(patient_data)
  parsed_patient_data = {}
  patient_data.each do |patientID, metadata|
    _phenotypes, chr, start, stop = metadata
    record = [patientID, start.to_i, stop.to_i]
    (parsed_patient_data[chr] ||= []) << record
  end
  return parsed_patient_data
end
15
+
16
# Bins coverage tracks for plotting: each (start, stop, coverage) region is
# snapped down to bin_size boundaries and emitted once per overlapped bin.
# Returns an array of [chr, bin_start, coverage] triplets.
def get_final_coverage(raw_coverage, bin_size)
  coverage_to_plot = []
  raw_coverage.each do |chr, coverages|
    coverages.each do |start, stop, coverage|
      first_bin = start - start % bin_size
      last_bin = stop - stop % bin_size
      # emit one point per bin in [first_bin, last_bin)
      first_bin.step(last_bin - 1, bin_size) do |bin|
        coverage_to_plot << [chr, bin, coverage]
      end
    end
  end
  return coverage_to_plot
end
30
+
31
# Computes the length of every SOR region and pairs it with its patient count.
# raw_coverage: chr => [[start, stop, pat_records], ...]
# Returns [[region_length, pat_records], ...] sorted by patient count (ascending).
# Cleanup vs. original: drops the unused `cnvs_count` local and uses flat_map.
def get_sor_length_distribution(raw_coverage)
  all_cnvs_length = raw_coverage.flat_map do |_chr, coords_info|
    coords_info.map { |start, stop, pat_records| [stop - start + 1, pat_records] }
  end
  all_cnvs_length.sort! { |c1, c2| c1[1] <=> c2[1] }
  return all_cnvs_length
end
43
+
44
# Tallies CNV lengths across the cohort.
# patient_data: pat_id => [hpo_string, chr, start, stop]
# Returns [[length, count], ...] sorted by count (ascending).
# Fix: the original sort block took a single argument and compared
# stat[1] with itself, so the sort was a no-op; sort_by the tally instead.
def get_cnvs_length(patient_data)
  length_stats = Hash.new(0)
  patient_data.each do |_pat_id, patient_record|
    _string_hpos, _chr, start, stop = patient_record
    length_stats[stop - start] += 1
  end
  return length_stats.to_a.sort_by { |_length, count| count }
end
52
+
53
+
54
# Aggregates SOR regions into per-chromosome coverage tracks.
# regions_data: [[start, stop, chr, node], ...] where the node id ends in the
# patient count (e.g. "9.3.A.5" => 5 patients).
# Regions with <= delete_thresold patients are kept but zeroed out.
# Returns [raw_coverage (chr => [[start, stop, n_patients]]), n_regions,
#          total_nt, mean_patients_per_region].
def calculate_coverage(regions_data, delete_thresold = 0)
  raw_coverage = {}
  n_regions = 0
  patients = 0
  nt = 0
  regions_data.each do |start, stop, chr, node|
    number_of_patients = node.split('.').last.to_i
    if number_of_patients <= delete_thresold
      # below threshold: keep the region but report zero coverage
      number_of_patients = 0
    else
      n_regions += 1
      patients += number_of_patients
      nt += stop - start
    end
    (raw_coverage[chr] ||= []) << [start, stop, number_of_patients]
  end
  return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
end
@@ -0,0 +1,556 @@
1
+ require 'uri'
2
+ #Common methods for predictors
3
+ #Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
4
+ #1. Indexing by chr (region)
5
+
6
# Loads a training file and indexes its records by chromosome.
# Returns chr => [[start, stop, node, hpo, association], ...].
def load_training_file4regions(training_file)
  training_set = {}
  loadFile(training_file).each do |info|
    chr = info.shift
    (training_set[chr] ||= []) << info
  end
  return training_set
end
20
+
21
+ #2. Indexing by hpo (code)
22
+ #prepare training file for analysis using phenotype2region prediction
23
# Loads a training file and indexes its records by HPO code, keeping only
# records whose association value reaches thresold.
# Returns hpoCode => [[chr, start, stop, node, association], ...].
def load_training_file4HPO(training_file, thresold=0)
  training_set = {}
  loadFile(training_file, thresold).each do |info|
    hpoCode = info.delete_at(4)
    (training_set[hpoCode] ||= []) << info
  end
  return training_set
end
38
+
39
+
40
+ #3. Load training info file:
41
+ #Chr;Start;Stop;HPO;Association;node
42
# Parses a training-info TSV (Chr\tStart\tStop\tHPO\tAssociation\tNode) and
# keeps rows whose association value is >= thresold.
# Returns [[chr, start, stop, nodeID, hpoCode, associationValue], ...].
# Fix: File.foreach closes the handle deterministically — the original
# File.open(file).each leaked the descriptor until GC.
def loadFile(file, thresold=0)
  information = []
  File.foreach(file) do |line|
    allInfo = line.chomp.split("\t")
    associationValue = allInfo[4].to_f
    next if associationValue < thresold
    chr, startPos, stopPos, hpoCode, _assoc, nodeID = allInfo
    information << [chr, startPos.to_i, stopPos.to_i, nodeID, hpoCode, associationValue]
  end
  return information
end
59
+
60
# Registers an HPO term (and its alternate ids) in hpo_storage unless the id
# is black-listed. Black-listed parents are removed from is_a. All alternate
# ids share the same attributes array as the main id.
def add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
  return if hpo_black_list.include?(id)
  attributes = [id, name, is_a - hpo_black_list, syn]
  hpo_storage[id] = attributes
  alt_ids.each { |altid| hpo_storage[altid] = attributes }
end
69
+
70
# Parses an HPO .obo file into hpo_storage: id => [id, name, parent_ids, synonyms]
# (alternate ids map to the same attributes array; black-listed terms skipped).
# Fixes vs. original:
# * on each new term the alternate-id accumulator was reset into a typo'd
#   variable (`alt_id = []` instead of `alt_ids = []`), so alt ids leaked
#   across consecutive terms;
# * the trailing `else` also ran for 'id' lines, clobbering `name` with the
#   raw id; the branch is now explicit on tag == 'name';
# * File.foreach closes the handle deterministically (File.open(...).each leaked it).
def load_hpo_file(hpo_file, hpo_black_list=[])
  hpo_storage = {}
  id = nil
  name = nil
  alt_ids = []
  syn = []
  is_a = []
  File.foreach(hpo_file) do |line|
    line.chomp!
    tag, info = line.split(': ')
    if tag == 'id' || tag == 'name' || tag == 'is_a' || tag == 'synonym' || tag == 'alt_id'
      if tag == 'id'
        # flush the previous term before starting a new one
        add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list) if !name.nil?
        id = info
        name = nil
        alt_ids = []
        syn = []
        is_a = []
      end
      if tag == 'alt_id'
        alt_ids << info
      elsif tag == 'is_a'
        is_a << info.split(' ! ')[0]
      elsif tag == 'synonym'
        syn << info.split('"')[1] # keep only the name of the synonym
      elsif tag == 'name'
        name = info
      end
    end
  end
  # flush the last term of the file
  add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
  return hpo_storage
end
105
+
106
# Reads a file with one excluded HPO code per line and returns them as an array.
# Fix: File.foreach closes the handle deterministically (the original
# File.open(...).each leaked the descriptor).
def load_hpo_black_list(excluded_hpo_file)
  excluded_hpos = []
  File.foreach(excluded_hpo_file) do |line|
    excluded_hpos << line.chomp
  end
  return excluded_hpos
end
114
+
115
# Builds a name (and synonym) => HPO code lookup from hpo_storage
# (hpo => [code, name, parents, synonyms]).
def create_hpo_dictionary(hpo_storage)
  hpo_dictionary = {}
  hpo_storage.each_value do |hpo_code, hpo_name, _hpo_parents, hpo_synonyms|
    hpo_dictionary[hpo_name] = hpo_code
    hpo_synonyms.each { |syn| hpo_dictionary[syn] = hpo_code }
  end
  return hpo_dictionary
end
126
+
127
# Inverts the parent links in hpo_storage to build a child index:
# parent_code => [[child_code, child_name], ...].
def get_child_parent_relations(hpo_storage)
  storage_child = {}
  hpo_storage.each do |_hpo_code, hpo_data|
    id, name, is_a, _syn = hpo_data
    child_record = [id, name]
    is_a.each do |parent_code|
      (storage_child[parent_code] ||= []) << child_record
    end
  end
  return storage_child
end
144
+
145
# Computes an information-content-like score per HPO term:
# -log10(fraction of patients annotated with the term).
# Patient keys look like "<patient>_i<count>"; consecutive records sharing the
# same base patient id are counted only once.
# NOTE(review): only *adjacent* duplicates are collapsed — assumes patient_data
# is grouped by patient; confirm with the loader.
def compute_IC_values(patient_data, total_patients)
  patients_per_hpo = Hash.new(0)
  last_patient_ID = ''
  patient_data.each do |patient_ID, metadata|
    patient, _count = patient_ID.split('_i')
    if patient != last_patient_ID
      hpos, _chr, _start, _stop = metadata
      hpos.each { |h| patients_per_hpo[h] += 1 }
    end
    last_patient_ID = patient
  end
  patients_per_hpo.transform_values! do |patient_number|
    -Math.log10(patient_number.fdiv(total_patients))
  end
  return patients_per_hpo
end
165
+
166
+ # def get_child_parent_relations(hpo_storage)
167
+ # # for getting hpo childs
168
+ # storage_child = {}
169
+ # hpo_storage.each do |hpo_code, hpo_data|
170
+ # STDERR.puts hpo_data[3].inspect
171
+ # Process.exit
172
+ # main_code, hpo_name, synonyms, parents = hpo_data
173
+ # parents.each do |par_hpo_code, par_hpo_name|
174
+ # query = storage_child[par_hpo_code]
175
+ # hpo_child = [main_code, hpo_name]
176
+ # if query.nil?
177
+ # storage_child[par_hpo_code] = [par_hpo_name, [hpo_child]]
178
+ # else
179
+ # query.last << hpo_child
180
+ # end
181
+ # end
182
+ # end
183
+
184
+ # return storage_child
185
+ # end
186
+
187
+
188
# Loads per-HPO information-coefficient values from a two-column TSV
# (hpo_code\tci). Returns hpo_code => Float.
# Fix: File.foreach avoids leaking the descriptor (original used File.open(...).each).
def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.foreach(information_coefficient_file) do |line|
    hpo_code, ci = line.chomp.split("\t")
    hpos_ci_values[hpo_code] = ci.to_f
  end
  return hpos_ci_values
end
197
+
198
# Loads patient-to-cluster assignments from a two-column TSV (pat_id\tcluster_id).
# Returns cluster_id => [pat_id, ...].
# Fix: File.foreach avoids leaking the descriptor (original used File.open(...).each).
def load_clustered_patients(file)
  clusters = {}
  File.foreach(file) do |line|
    pat_id, cluster_id = line.chomp.split("\t")
    (clusters[cluster_id] ||= []) << pat_id
  end
  return clusters
end
212
+
213
# Parses a gzipped NCBI GFF gene annotation file.
# Returns:
#   gene_list:     GeneID => [[gene name + synonyms], description]
#   gene_location: chromosome => [[GeneID, start, stop], ...]
# Fixes vs. original:
# * URI.unescape was deprecated and removed in Ruby 3.0;
#   URI::DEFAULT_PARSER.unescape is the drop-in replacement (same %XX decoding);
# * Zlib::GzipReader.open closes the reader and file deterministically
#   (original leaked both handles).
def load_gene_data(gene_data_path)
  gene_list = {} # geneID => attributes
  gene_location = {} # chr => genes
  current_chr = nil
  genes = []
  Zlib::GzipReader.open(gene_data_path) do |gz|
    gz.each_line do |line|
      line.chomp!
      next if line =~ /^#/
      fields = line.split("\t")
      if fields[8].include?('genome=chromosome')
        # region header: flush genes collected for the previous chromosome
        chr = fields[8].split(';')[1].split('=').last
        gene_location[current_chr] = genes
        genes = []
        current_chr = chr
      elsif fields[2] == 'gene'
        attributes = {}
        fields[8].split(';').each do |pair|
          key, value = pair.split('=')
          attributes[key] = value
        end
        geneNames = []
        geneNames << attributes['gene'] if !attributes['gene'].nil?
        geneNames.concat(attributes['gene_synonym'].split(',')) if !attributes['gene_synonym'].nil?
        description = attributes['description']
        description = URI::DEFAULT_PARSER.unescape(description) if !description.nil?
        attributes['Dbxref'] =~ /GeneID:(\d+)/
        gene_list[$1] = [geneNames, description]
        genes << [$1, fields[3].to_i, fields[4].to_i]
      end
    end
  end
  # flush genes of the last chromosome
  gene_location[current_chr] = genes
  return gene_list, gene_location
end
248
+
249
# Fetches gene annotations from the KEGG REST API in batches of 10 and parses
# the flat-file responses into kegg_data: geneID => [gene_names, definition,
# pathways].
# NOTE: consumes query_genes destructively (shift). Performs network I/O;
# assumes Net::HTTP is required elsewhere in the application.
def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10)
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    # squeeze(' ') collapses the column padding so split(' ', n) behaves predictably
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        # first pathway line carries the section tag; take [id, name]
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        # any other section header terminates the multi-line PATHWAY block
        parsing_pathway_field = false
      elsif parsing_pathway_field
        # continuation lines of the PATHWAY section: "id name"
        pathways << line.strip.split(' ', 2)
      elsif line == '///'
        # record terminator: store the accumulated entry and reset accumulators
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end
286
+
287
# Builds geneID => KEGG pathway attributes from NCBI BioSystems dumps instead
# of the live KEGG API. Only human KEGG entries (accessions starting 'hsa')
# are retained.
def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
  gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
  keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG')
  keggAttributes.select!{|kegg_id, data| data.first =~ /^hsa/}
  kegg_data = {}
  gene2biosystems.each do |geneID, pathways|
    # keep only the biosystems that resolved to a (human) KEGG pathway
    kegg_data[geneID] = pathways.map { |biosystem| keggAttributes[biosystem] }.compact
  end
  return kegg_data
end
303
+
304
# Reads the gzipped bsid2info dump and keeps entries of one source database.
# Each line: bsid\tsourceDB\taccession\tname... ; invalid bytes are stripped by
# re-encoding to UTF-8 with replacement.
# Returns bsid => [fields[1], fields[2]] for rows where fields[0] == filterDB.
# Fix: Zlib::GzipReader.open closes reader and file deterministically
# (the original leaked both handles).
def loadBiosistemsInfo(biosystems_info_path, filterDB)
  bsid2attributes = {}
  Zlib::GzipReader.open(biosystems_info_path) do |gz|
    gz.each_line do |line|
      line.chomp!
      fields = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').split("\t")
      bsid = fields.shift
      bsid2attributes[bsid] = [fields[1], fields[2]] if filterDB == fields[0]
    end
  end
  return bsid2attributes
end
317
+
318
# Reads the gzipped biosystems-to-gene dump (biosystem\tgene\tscore) and
# inverts it: gene => [biosystem, ...].
# Fix: Zlib::GzipReader.open closes reader and file deterministically
# (the original leaked both handles).
def load_biosystem2gene_dictionary(biosystems_gene_path)
  gene2kegg = {}
  Zlib::GzipReader.open(biosystems_gene_path) do |gz|
    gz.each_line do |line|
      biosystem, gene, _score = line.chomp.split("\t")
      (gene2kegg[gene] ||= []) << biosystem
    end
  end
  return gene2kegg
end
334
+
335
# Appends each gene's KEGG pathway data (or [] when absent) to its attribute
# array. NOTE: mutates the attribute arrays held in gene_list.
def merge_genes_with_kegg_data(gene_list, kegg_data)
  merged_data = {}
  gene_list.each do |geneID, attributes|
    pathways = kegg_data[geneID]
    attributes << (pathways.nil? ? [] : pathways)
    merged_data[geneID] = attributes
  end
  return merged_data
end
348
+
349
# Serialises data as JSON and writes it gzip-compressed to path.
def write_compressed_plain_file(data, path)
  Zlib::GzipWriter.open(path) do |gz|
    gz.write data.to_json
  end
end
356
+
357
# Reads a gzip-compressed JSON file and returns the parsed object.
# Fix: Zlib::GzipReader.open closes reader and file deterministically
# (the original leaked both handles).
def read_compressed_json(path)
  object = Zlib::GzipReader.open(path) { |gz| JSON.parse(gz.read) }
  return object
end
363
+
364
# Fisher-exact-style enrichment of KEGG pathways among clustered candidate genes.
# genes_clusters: arrays of [geneID, [geneNames, description, pathways]] pairs.
# genes_with_kegg: geneID => [geneNames, description, pathways] (the gene universe).
# Returns [[pathway, predicted_genes_in_pathway, contingency_counts,
#           accumulated_prob], ...] — one row per pathway seen in the clusters.
def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
  pathways_genes_in_predictions = {}
  genes_in_predictions = []
  genes_clusters.each do |cluster|
    cluster.each do |geneID, data|
      geneNames, description, pathways = data
      pathways.each do |pathway|
        # collect the unique predicted genes annotated with each pathway
        query = pathways_genes_in_predictions[pathway]
        if query.nil?
          pathways_genes_in_predictions[pathway] = [geneID]
        else
          query << geneID if !query.include?(geneID)
        end
      end
      genes_in_predictions << geneID if !genes_in_predictions.include?(geneID)
    end
  end
  genes_out_of_predictions = genes_with_kegg.keys - genes_in_predictions
  gene_number = genes_with_kegg.length
  stats = []
  pathways_genes_in_predictions.each do |pathway, pathway_predicted_genes|
    pathway_id, pathway_name = pathway
    no_pathway_predicted_genes = genes_in_predictions - pathway_predicted_genes
    pathway_no_predicted_genes_count = 0
    no_pathway_no_predicted_genes_count = 0
    # classify the non-predicted genes by membership in this pathway
    genes_out_of_predictions.each do |geneID|
      query = genes_with_kegg[geneID]
      if query[2].map{|pathway_info| pathway_info.first}.include?(pathway_id)
        pathway_no_predicted_genes_count += 1
      else
        no_pathway_no_predicted_genes_count += 1
      end
    end
    #Fisher => http://www.biostathandbook.com/fishers.html
    no_pathway_predicted_genes_count = no_pathway_predicted_genes.length
    pathway_predicted_genes_count = pathway_predicted_genes.length
    accumulated_prob = 0
    # sum hypergeometric probabilities over the more-extreme tables, shifting
    # counts along the diagonal while they stay non-negative
    pathway_no_predicted_genes_count.times do |n|
      no_pathway_predicted_genes_count_shifted = no_pathway_predicted_genes_count - n
      pathway_predicted_genes_count_shifted = pathway_predicted_genes_count - n
      if no_pathway_predicted_genes_count_shifted >= 0 && pathway_predicted_genes_count_shifted >= 0
        accumulated_prob += compute_hyper_prob(
          n,
          no_pathway_predicted_genes_count_shifted,
          pathway_predicted_genes_count_shifted,
          no_pathway_no_predicted_genes_count + n,
          gene_number
        )
      else
        break
      end
    end
    contigency = [pathway_no_predicted_genes_count, no_pathway_predicted_genes_count, pathway_predicted_genes_count, no_pathway_no_predicted_genes_count]
    stats << [pathway, pathway_predicted_genes, contigency, accumulated_prob]
  end
  return stats
end
421
+
422
# Hypergeometric probability of the 2x2 contingency table [a b; c d] out of a
# total of n items; see http://www.biostathandbook.com/fishers.html
def compute_hyper_prob(a, b, c, d, n)
  numerator = binom(a + b, a) * binom(c + d, c)
  return numerator.fdiv(binom(n, a + c))
end
428
+
429
# Binomial coefficient n-choose-k. Any k outside the open interval (0, n)
# yields 1 (matches the original; callers pass in-range values).
def binom(n,k)
  return 1 unless k > 0 && k < n
  numerator = (1 + n - k..n).inject(:*)
  numerator / (1..k).inject(:*)
end
436
+
437
# Builds consecutive reference windows from all patients' mutation breakpoints:
# every unique start/stop coordinate is collected, sorted, and paired with its
# successor.
# genomic_ranges: [[patientID, mut_start, mut_stop], ...]
def get_reference(genomic_ranges)
  breakpoints = genomic_ranges.flat_map { |gr| [gr[1], gr[2]] }.uniq.sort
  final_reference = breakpoints.each_cons(2).to_a
  return final_reference
end
452
+
453
# For each reference window, lists the unique patients whose mutation range
# overlaps it (containment either way, or an end point falling inside).
def overlap_patients(genomic_ranges, reference)
  overlaps = reference.map do |start, stop|
    hits = genomic_ranges.select do |_pt_id, pt_start, pt_stop|
      (start <= pt_start && stop >= pt_stop) ||
        (start > pt_start && stop < pt_stop) ||
        (stop > pt_start && stop <= pt_stop) ||
        (start >= pt_start && start < pt_stop)
    end
    hits.map(&:first).uniq
  end
  return overlaps
end
469
+
470
# Scans each chromosome for overlap windows shared by more than pat_per_reg
# patients (SORs). Each qualifying window is extended in place with the
# chromosome and a node id "chr.cluster.mutation_type.patient_count".
# Returns [patients_by_cluster (patient => [node_ids]), sors].
def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
  patients_by_cluster = {}
  sors = []
  patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
    reference = get_reference(genomic_ranges) # putative overlap windows
    overlapping_patients = overlap_patients(genomic_ranges, reference) # patients per window
    clust_number = 1
    reference.each_with_index do |ref, i|
      current_patients = overlapping_patients[i]
      next unless current_patients.length > pat_per_reg
      ref << chrm
      node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
      ref << node_identifier
      save_sor(current_patients, node_identifier, patients_by_cluster)
      sors << ref
      clust_number += 1
    end
  end
  return patients_by_cluster, sors
end
492
+
493
# Records node_identifier under every patient in current_patients.
def save_sor(current_patients, node_identifier, patients_by_cluster)
  current_patients.each { |patient| add_record(patients_by_cluster, patient, node_identifier) }
end
498
+
499
# Appends record to hash[key], creating the bucket on first use and skipping
# duplicates.
def add_record(hash, key, record)
  bucket = hash[key]
  if bucket.nil?
    hash[key] = [record]
  elsif !bucket.include?(record)
    bucket << record
  end
end
507
+
508
# Loads the patient cohort table referenced by options[:input_file].
# Columns are selected via get_fields2extract; when options[:header] is set,
# the first line resolves column names to indexes.
# Patient ids are generated ("pat_N") when no id column is configured,
# otherwise made unique with an "_iN" suffix; options[:pat_id_col] is set to
# 'generated' in the first case.
# Returns [patient_data (pat_id => record), patient_number].
# Fix: File.foreach closes the handle deterministically (the original
# File.open(...).each leaked the descriptor).
def load_patient_cohort(options)
  patient_data = {}
  count = 0
  fields2extract = get_fields2extract(options)
  field_numbers = fields2extract.values
  original_ids = []
  File.foreach(options[:input_file]) do |line|
    line.chomp!
    if options[:header] && count == 0
      line.gsub!(/#\s*/,'') # correct comment-like headers
      field_names = line.split("\t")
      get_field_numbers2extract(field_names, fields2extract)
      field_numbers = fields2extract.values
    else
      fields = line.split("\t")
      pat_record = field_numbers.map{|n| fields[n]}
      if fields2extract[:pat_id_col].nil?
        pat_id = "pat_#{count}" # generate ids
      else
        original_id = pat_record.shift
        original_ids << original_id
        pat_id = original_id + "_i#{count}" # make sure that ids are unique
      end
      patient_data[pat_id] = pat_record
    end
    count += 1
  end
  # with generated ids every data line is a patient; otherwise count distinct input ids
  patient_number = fields2extract[:pat_id_col].nil? ? count : original_ids.uniq.length
  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
  return patient_data, patient_number
end
539
+
540
# Selects which cohort columns to read based on the CLI options. Values stay
# as names when a header row will resolve them later, and are coerced to
# integer indexes otherwise. Returns {option_key => column}.
def get_fields2extract(options)
  fields2extract = {}
  [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
    col = options[field]
    next if col.nil?
    col = col.to_i unless options[:header]
    fields2extract[field] = col
  end
  return fields2extract
end
551
+
552
# Replaces the column names stored in fields2extract with their indexes in
# field_names (mutates the hash in place).
def get_field_numbers2extract(field_names, fields2extract)
  fields2extract.each_key do |field|
    fields2extract[field] = field_names.index(fields2extract[field])
  end
end