pets 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,144 @@
1
class Genomic_Feature
  # Container of genomic regions indexed by chromosome.
  # Any parameter named gen_fet is expected to be another Genomic_Feature.

  def initialize(feat_array) # [[chr1, start1, stop1], [chr2, start2, stop2], ...]
    @regions = {}
    @reg_id = -1 # incremented before use, so the first region gets id 0
    load_features(feat_array)
  end

  # Adds [chr, start, stop] triplets to @regions, assigning each region a
  # sequential numeric id stored under :to.
  def load_features(feat_array)
    feat_array.each do |chr, start, stop|
      chr = chr.to_sym
      region = {start: start, stop: stop, to: @reg_id += 1}
      add_record(@regions, chr, region)
    end
  end

  # NOTE: returns the number of chromosomes holding regions, not the total
  # number of regions.
  def length
    return @regions.length
  end

  # Yields every (chr, region) pair.
  def each()
    @regions.each do |chr, regs|
      regs.each do |region|
        yield(chr, region)
      end
    end
  end

  def get_chr
    return @regions.keys
  end

  # Inclusive sizes (stop - start + 1) of all regions.
  def get_sizes
    sizes = []
    each do |chr, region|
      sizes << region[:stop] - region[:start] + 1
    end
    return sizes
  end

  # Returns [[size, count], ...] ordered by count, most frequent first.
  def get_summary_sizes
    sizes = Hash.new(0)
    each do |chr, region|
      sizes[region[:stop] - region[:start] + 1] += 1
    end
    # FIX: the original block was sort!{|s| s[1] <=> s[1]}, which compares an
    # element with itself (always 0) and therefore never sorted anything.
    return sizes.to_a.sort{|s1, s2| s2[1] <=> s1[1]}
  end

  # Imports all regions from another Genomic_Feature. When 'to' is given,
  # every imported region is linked to that id; otherwise each region gets a
  # fresh sequential id.
  def merge(gen_fet, to = nil)
    gen_fet.each do |chr, region|
      region[:to] = to.nil? ? @reg_id += 1 : to # handle custom or default id
      add_record(@regions, chr, region)
    end
  end

  # For each [start, stop] window in 'reference', collects the ids (:to) of
  # the regions in genomic_ranges overlapping it (uniq per window).
  def get_reference_overlaps(genomic_ranges, reference)
    overlaps = []
    reference.each do |start, stop|
      reg_ids = []
      genomic_ranges.each do |reg|
        reg_ids << reg[:to] if coor_overlap?(start, stop, reg)
      end
      overlaps << reg_ids.uniq
    end
    return overlaps
  end

  # Clusters region ids that co-occur in common genome windows.
  # Returns:
  #   ids_by_cluster: region id => [cluster ids]
  #   annotated_full_ref: [start, stop, chr, cluster_id] for every window
  #     that gathered more than ids_per_reg region ids.
  def generate_cluster_regions(meth, tag, ids_per_reg = 1)
    compute_windows(meth) # Get putative genome windows
    ids_by_cluster = {}
    annotated_full_ref = [] # All reference windows with uniq id and chr tagged
    @regions.each do |chr, regs|
      reference = @windows[chr]
      overlaps = get_reference_overlaps(regs, reference)
      clust_numb = 0
      reference.each_with_index do |ref, i|
        current_ids = overlaps[i]
        next if current_ids.length <= ids_per_reg
        clust_id = "#{chr}.#{clust_numb += 1}.#{tag}.#{current_ids.length}"
        current_ids.each do |curr_id|
          add_record(ids_by_cluster, curr_id, clust_id, true)
        end
        annotated_full_ref << ref.dup.concat([chr, clust_id])
      end
    end
    return ids_by_cluster, annotated_full_ref
  end

  # Builds @windows (chr => [[start, stop], ...]) with the strategy selected
  # by 'meth' (currently only :reg_overlap is implemented; other values
  # leave nil windows for every chromosome).
  def compute_windows(meth)
    @windows = {}
    @regions.each do |chr, regs|
      chr_windows = nil
      chr_windows = compute_region_overlap_windows(regs) if meth == :reg_overlap
      @windows[chr] = chr_windows
    end
  end

  private

  # Appends 'record' to hash[key], creating the array on first use.
  # With uniq=true the record is only added when not already present.
  def add_record(hash, key, record, uniq = false)
    query = hash[key]
    if query.nil?
      hash[key] = [record]
    elsif !uniq # repeated entries allowed
      query << record
    elsif !query.include?(record) # keep entries unique
      query << record
    end
  end

  # Builds consecutive [coord, next_coord] windows from the sorted uniq set
  # of all region boundaries.
  def compute_region_overlap_windows(genomic_ranges)
    reference = []
    reference.concat(genomic_ranges.map{|gr| gr[:start]}) # get start
    reference.concat(genomic_ranges.map{|gr| gr[:stop]}) # get stop
    reference.uniq!
    reference.sort!
    # Define overlap ranges
    final_reference = []
    reference.each_with_index do |coord, i|
      next_coord = reference[i + 1]
      final_reference << [coord, next_coord] if !next_coord.nil?
    end
    return final_reference
  end

  # Interval test between [start, stop] and reg's coordinates (boundary
  # handling intentionally kept verbatim from the original clauses).
  def coor_overlap?(start, stop, reg)
    reg_start = reg[:start]
    reg_stop = reg[:stop]
    return (start <= reg_start && stop >= reg_stop) ||
      (start > reg_start && stop < reg_stop) ||
      (stop > reg_start && stop <= reg_stop) ||
      (start >= reg_start && start < reg_stop)
  end
end
data/lib/pets/io.rb ADDED
@@ -0,0 +1,457 @@
1
+ require 'csv'
2
+
3
# Builds an HPO Ontology object either from a raw ontology file or from a
# pre-parsed JSON dump (detected by '.json' in the filename), optionally
# pruning the terms listed in excluded_hpo_file.
# NOTE(review): Ontology is an external class; its API (load_file:,
# removable_terms:, add_removable_terms, remove_removable, build_index) is
# not visible in this file - confirm against the providing gem.
def load_hpo_ontology(hpo_file, excluded_hpo_file)
  hpo = nil
  if !hpo_file.include?('.json')
    # Raw ontology file: exclusions can be handed to the constructor.
    if !excluded_hpo_file.nil?
      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    else
      hpo = Ontology.new(file: hpo_file, load_file: true)
    end
  else
    # JSON dump: load first, then prune and rebuild the index.
    hpo = Ontology.new
    hpo.read(hpo_file)
    if !excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  end
  return hpo
end
22
+
23
# Reads a plain-text file with one HPO code per line and returns the codes
# as an array of chomped strings.
# FIX: File.open without a block leaked the file handle until GC;
# File.foreach iterates and closes the file deterministically.
def read_excluded_hpo_file(file)
  excluded_hpo = []
  File.foreach(file) do |line|
    excluded_hpo << line.chomp
  end
  return excluded_hpo
end
30
+
31
# Serializes 'hash' as TSV: each line is the key followed by its array
# values; an optional header row is emitted when 'header' is non-empty.
def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |out|
    out.puts(header.join("\t")) unless header.empty?
    hash.each do |key, array|
      out.puts "#{key}\t#{array.join("\t")}"
    end
  end
end
39
+
40
# Dumps 'array' to file_path, one record per line; records that are not
# plain Strings are joined with tabs.
def write_array(array, file_path)
  File.open(file_path, 'w') do |out|
    array.each do |record|
      out.puts(record.instance_of?(String) ? record : record.join("\t"))
    end
  end
end
52
+
53
# Writes a matrix as TSV with x_names as the header row and the matching
# y_name prepended to each data row (layout ready for R's read.table).
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |out|
    out.puts(x_names.join("\t"))
    matrix.each_with_index do |row, idx|
      out.puts(([y_names[idx]] + row).join("\t"))
    end
  end
end
61
+
62
# Writes per-cluster IC values as TSV (cluster_id, ic, Plen), stopping
# after 'limit' clusters. Row ids take the form
# "<cluster_size>_<cluster_index>".
def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |out|
    out.puts(%w[cluster_id ic Plen].join("\t"))
    all_ics.each_with_index do |cluster_ics, idx|
      break if idx == limit
      size = cluster_ics.length
      cluster_ics.each_with_index do |ic, pos|
        out.puts "#{size}_#{idx}\t#{ic}\t#{profile_lengths[idx][pos]}"
      end
    end
  end
end
74
+
75
# Writes cluster/chromosome counts as TSV (cluster_id, chr, count).
# Clusters are renumbered sequentially from 0 in input order; output stops
# once 'limit' distinct clusters have been emitted.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |out|
    out.puts(%w[cluster_id chr count].join("\t"))
    index = 0
    last_id = cluster_data.first.first unless cluster_data.empty?
    cluster_data.each do |cluster_id, patient_number, chr, count|
      index += 1 unless cluster_id == last_id
      break if index == limit
      out.puts "#{patient_number}_#{index}\t#{chr}\t#{count}"
      last_id = cluster_id
    end
  end
end
88
+
89
# Writes (chr, position, freq) coverage triplets as a three-column TSV.
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |out|
    coverage_to_plot.each do |chr, position, freq|
      out.puts([chr, position, freq].join("\t"))
    end
  end
end
96
+
97
+
98
# Writes, per patient, a two-column CSV pairing each current phenotype with
# the putative more specific (child) phenotypes suggested for it. Patients
# with fewer than 4 phenotypes get a warning in their header row.
# NOTE(review): summary_stats is accepted but never used in this method;
# confirm whether it was meant to be written here too.
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = nil
      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
      csv << ["PATIENT #{pat_id}", "#{warning}"] # nil warning renders as ""
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |parent, childs|
        parent_code, parent_name = parent
        if childs.empty?
          csv << ["#{parent_name} (#{parent_code})", '-']
        else
          # Emit the parent cell only on the first child row; following rows
          # leave the first column blank.
          parent_writed = false
          childs.each do |child_code, child_name|
            if !parent_writed
              parent_field = "#{parent_name} (#{parent_code})"
              parent_writed = true
            else
              parent_field = ""
            end
            csv << [parent_field, "#{child_name} (#{child_code})"]
          end
        end
      end
      csv << ["", ""] # blank separator row between patients
    end
  end
end
126
+
127
# Writes paired x/y values as a two-column TSV suited for scatter plots.
# Raises if y_axis_value is shorter than x_axis_value.
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |out|
    out.puts "#{x_axis_name}\t#{y_axis_name}"
    x_axis_value.each_with_index do |x_val, idx|
      y_val = y_axis_value[idx]
      raise("The #{idx} position is not presented in y_axis_value") if y_val.nil?
      out.puts("#{x_val}\t#{y_val}")
    end
  end
end
137
+
138
+
139
# Dumps a similarity matrix as TSV, one row per line.
def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
  File.open(similarity_matrix_file, 'w') do |out|
    similarity_matrix.each { |row| out.puts(row.join("\t")) }
  end
end
146
+
147
# Flattens a nested {a => {b => value}} similarity structure into
# three-column TSV lines "a<TAB>b<TAB>value".
def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |out|
    similarity_pairs.each do |pair_a, partners|
      partners.each do |pair_b, value|
        out.puts "#{pair_a}\t#{pair_b}\t#{value}"
      end
    end
  end
end
156
+
157
# Writes the average number of HPO terms per patient-count bin as a
# two-column TSV with a fixed header.
def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |out|
    # Header literals written directly (the original interpolated constant
    # strings, producing the exact same bytes).
    out.puts "PatientsNumber\tHPOAverage"
    average_hp_per_pat_distribution.each do |patient_num, ave|
      out.puts "#{patient_num}\t#{ave}"
    end
  end
end
165
+
166
# Reads "patientID<TAB>clusterID" assignments and gathers each patient's
# HPO profile via patient_data.get_profile.
# Returns:
#   clusters_table: [clusterID, patient_count, patient_ids, profiles]
#   clusters_info: clusterID => {patientID => profile}
# NOTE(review): patient_data is a project object; only get_profile is used.
def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  clusters_table = []
  File.open(clusters_file).each do |line|
    line.chomp!
    patientID, clusterID = line.split("\t")
    patientHPOProfile = patient_data.get_profile(patientID)
    query = clusters_info[clusterID]
    if query.nil?
      clusters_info[clusterID] = {patientID => patientHPOProfile}
    else
      query[patientID] = patientHPOProfile
    end
  end
  # Second pass: summarize each cluster into a flat table row.
  clusters_info.each do |clusterID, patients_info|
    patients_per_cluster = patients_info.keys.length
    clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end
186
+
187
# Loads "id<TAB>hpo1,hpo2,..." lines into {id => [hpo_symbols]}.
# Codes are validated with hpo.check_ids and redundant terms removed with
# hpo.clean_profile; profiles that end up empty are dropped.
# NOTE(review): invalid codes (rejected_hpos) are silently discarded here -
# confirm whether they should be reported like elsewhere in the project.
def load_profiles(file_path, hpo)
  profiles = {}
  File.open(file_path).each do |line|
    id, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
    end
  end
  return profiles
end
201
+
202
# Loads per-profile variant tables (*.tab files with chr<TAB>position rows)
# from variant_folder and wraps each profile's single-base positions as a
# Genomic_Feature keyed by the file basename.
def load_variants(variant_folder)
  variants = {}
  Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
    profile_id = File.basename(path, '.tab')
    vars = File.readlines(path).map do |line|
      chr, pos = line.chomp.split("\t")
      pos = pos.to_i
      [chr, pos, pos] # single-base region: start == stop
    end
    variants[profile_id] = Genomic_Feature.new(vars)
  end
  return variants
end
217
+
218
# Scans evidences_path for coordinate files (*.coords) and evidence profile
# files (*_HP.txt), delegating parsing to load_coordinates and
# load_evidence_profiles.
# Returns:
#   evidences: "<entity>_HP" => {prof: profiles, id2lab: id => label}
#   genomic_coordinates: entity => {id => [chr, start, stop, strand]}
def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
  coord_files.each do |cd_f|
    entity = File.basename(cd_f, '.coords')
    coordinates = load_coordinates(cd_f)
    genomic_coordinates[entity] = coordinates
  end
  evidences = {}
  evidence_files = Dir.glob(File.join(evidences_path, '*_HP.txt'))
  evidence_files.each do |e_f|
    pair = File.basename(e_f, '.txt')
    profiles, id2label = load_evidence_profiles(e_f, hpo)
    evidences[pair] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end
235
+
236
# Parses a TSV *.coords file (entity, chr, strand, start, stop) that starts
# with a header row, mapping entity => [chr, start, stop, strand].
def load_coordinates(file_path)
  coordinates = {}
  File.readlines(file_path).drop(1).each do |line| # drop(1) skips the header
    entity, chr, strand, start, stop = line.chomp.split("\t")
    coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
  end
  return coordinates
end
250
+
251
# Loads "id<TAB>label<TAB>hpo1,hpo2,..." evidence lines.
# Returns [profiles, id2label] where profiles maps id => cleaned HPO
# symbols and id2label maps id => label.
# NOTE(review): id2label is filled whenever the raw profile has any valid
# code, but profiles[id] additionally requires the cleaned profile to be
# non-empty - some ids may get a label and no profile; confirm intended.
def load_evidence_profiles(file_path, hpo)
  profiles = {}
  id2label = {}
  File.open(file_path).each do |line|
    id, label, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
      id2label[id] = label
    end
  end
  return profiles, id2label
end
267
+
268
+ #Common methods for predictors
269
+ #Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
270
+ #1. Indexing by chr (region)
271
# True when [start, stop] overlaps the reference interval
# [ref_start, ref_stop]. Boundary handling is asymmetric (an interval
# ending exactly at ref_stop overlaps, one beginning exactly at ref_stop
# does not); the original clauses are preserved verbatim, only the
# temporary flag variable was folded into a single return expression.
def coor_overlap?(ref_start, ref_stop, start, stop)
  return (stop > ref_start && stop <= ref_stop) ||
    (start >= ref_start && start < ref_stop) ||
    (start <= ref_start && stop >= ref_stop) ||
    (start > ref_start && stop < ref_stop)
end
281
+
282
# Groups training records (parsed by loadFile) by chromosome:
# chr => [[start, stop, nodeID, hpoCode, associationValue], ...].
def load_training_file4regions(training_file)
  training_set = {}
  loadFile(training_file).each do |info|
    chr = info.shift # first column is the chromosome
    (training_set[chr] ||= []) << info
  end
  return training_set
end
296
+
297
+ #2. Indexing by hpo (code)
298
+ #prepare training file for analysis using phenotype2region prediction
299
# Indexes training records by HPO code (column 4 of loadFile's output):
# hpo => [[chr, start, stop, nodeID, associationValue], ...].
# Records with association below 'thresold' are filtered out by loadFile.
def load_training_file4HPO(training_file, thresold = 0)
  training_set = {}
  loadFile(training_file, thresold).each do |info|
    hpo_code = info.delete_at(4) # remove the HPO column and use it as key
    (training_set[hpo_code] ||= []) << info
  end
  return training_set
end
314
+
315
+
316
+ #3. Load training info file:
317
+ #Chr;Start;Stop;HPO;Association;node
318
# Loads a training TSV (Chr, Start, Stop, HPO, Association, node) and
# returns records with association >= threshold as
# [chr, startPos, stopPos, nodeID, hpoCode, associationValue].
# FIX: File.open without a block leaked the handle; File.foreach closes it.
# (Parameter renamed from the original typo 'thresold'; all callers pass it
# positionally, so the rename is backward-compatible.)
def loadFile(file, threshold = 0)
  information = []
  File.foreach(file) do |line|
    fields = line.chomp.split("\t")
    association_value = fields[4].to_f
    next if association_value < threshold
    information << [fields[0], fields[1].to_i, fields[2].to_i, fields[5], fields[3], association_value]
  end
  return information
end
335
+
336
# Reads a TSV of HPO code / information coefficient pairs and returns
# {hpo_symbol => Float}.
def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.open(information_coefficient_file).each do |line|
    code, ci = line.chomp.split("\t")
    hpos_ci_values[code.to_sym] = ci.to_f
  end
  return hpos_ci_values
end
345
+
346
# Reads "pat_id<TAB>cluster_id" lines and groups patient ids by cluster:
# cluster_id => [pat_id, ...].
def load_clustered_patients(file)
  clusters = {}
  File.open(file).each do |line|
    pat_id, cluster_id = line.chomp.split("\t")
    (clusters[cluster_id] ||= []) << pat_id
  end
  return clusters
end
360
+
361
# Parses a gzipped NCBI-style GFF and returns:
#   gene_list: geneID => [geneName, geneSyns, description]
#   gene_location: chr => [[geneID, start, stop], ...]
# FIX 1: URI.unescape was deprecated and removed in Ruby 3.0; GFF3
#   attribute values only use %XX escapes ('+' is literal), so they are
#   decoded explicitly (CGI.unescape would wrongly turn '+' into a space).
# FIX 2: the original leaked both the File and GzipReader handles; the
#   block form of Zlib::GzipReader.open closes them.
def load_gene_data(gene_data_path)
  gene_list = {} # geneID => attr
  gene_location = {} # chr => gene
  current_chr = nil
  genes = []
  Zlib::GzipReader.open(gene_data_path) do |gz|
    gz.each_line do |line|
      line.chomp!
      next if line =~ /^#/
      fields = line.split("\t")
      if fields[8].include?('genome=chromosome')
        # Region record starting a new chromosome: flush genes gathered for
        # the previous one. NOTE(review): the chromosome name is taken from
        # the SECOND attribute pair - this relies on the GFF attribute order
        # used by the expected input; confirm against the source files.
        chr = fields[8].split(';')[1].split('=').last
        gene_location[current_chr] = genes
        genes = []
        current_chr = chr
      elsif fields[2] == 'gene'
        attributes = {}
        fields[8].split(';').each do |pair|
          key, value = pair.split('=')
          attributes[key] = value
        end
        geneName = attributes['gene']
        geneSyns = attributes['gene_synonym'].nil? ? [] : attributes['gene_synonym'].split(',')
        description = attributes['description']
        # GFF3 percent-decoding: only %XX escapes are special.
        description = description.gsub(/%([0-9A-Fa-f]{2})/) { [Regexp.last_match(1)].pack('H2') } if !description.nil?
        # NOTE(review): if Dbxref is missing or lacks GeneID, $1 keeps its
        # previous value (pre-existing behaviour, preserved here).
        attributes['Dbxref'] =~ /GeneID:(\d+)/
        gene_list[$1] = [geneName, geneSyns, description]
        genes << [$1, fields[3].to_i, fields[4].to_i]
      end
    end
  end
  gene_location[current_chr] = genes
  return gene_list, gene_location
end
397
+
398
# Queries the KEGG REST API in batches of 10 genes (the API's multi-entry
# GET limit) and parses the flat-text response into
# kegg_data: geneID => [gene_names, definition, pathways].
# Mutates query_genes (shift) until it is empty.
# NOTE(review): needs net/http and uri to be required by the caller; there
# is no error handling for HTTP failures, and the plain-http endpoint may
# redirect to https (Net::HTTP.get does not follow redirects) - confirm.
def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10)
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    # squeeze(' ') collapses runs of spaces so split(' ', n) is predictable.
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        # Any of these section headers ends the PATHWAY continuation lines.
        parsing_pathway_field = false
      elsif parsing_pathway_field
        # Indented continuation line of the PATHWAY section.
        pathways << line.strip.split(' ', 2)
      elsif line == '///' # end-of-entry marker: store and reset accumulators
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end
435
+
436
# Serializes 'data' to JSON and writes it gzip-compressed to 'path'.
# FIX: in the original, an exception during the write left the GzipWriter
# unfinalized (missing gzip footer); the block form of
# Zlib::GzipWriter.open guarantees the stream is closed.
def write_compressed_plain_file(data, path)
  Zlib::GzipWriter.open(path) do |gz|
    gz.write data.to_json
  end
end
443
+
444
# Reads a gzip-compressed JSON file and returns the parsed object.
# FIX: the original used Kernel#open + GzipReader.new and never closed
# either handle; the block form closes both deterministically.
def read_compressed_json(path)
  object = Zlib::GzipReader.open(path) { |gz| JSON.parse(gz.read) }
  return object
end
450
+
451
# Fetches 'path' from 'ftp_server' via anonymous FTP into local file 'name'.
def download(ftp_server, path, name)
  # Net::FTP.open connects, yields and closes the session automatically.
  Net::FTP.open(ftp_server) do |ftp|
    ftp.login
    ftp.getbinaryfile(path, name)
  end
end
@@ -0,0 +1,106 @@
1
class Cohort_Parser
  # Loads a cohort table described by 'options' into a Cohort object.
  # Returns [cohort, rejected_terms, rejected_recs].
  def self.load(options)
    fields2extract = get_fields2extract(options)
    field_numbers = fields2extract.values
    records = read_records(options, fields2extract, field_numbers)
    cohort, rejected_terms, rejected_recs = create_cohort(records, options)
    return cohort, rejected_terms, rejected_recs
  end

  # Reads options[:input_file] (TSV) and groups rows by record id:
  # id => [[terms_array, chr, start, stop], ...].
  # With options[:header], the first line is used to resolve column indices.
  def self.read_records(options, fields2extract, field_numbers)
    records = {}
    count = 0
    File.open(options[:input_file]).each do |line|
      line.chomp!
      if options[:header] && count == 0
        line.gsub!(/#\s*/,'') # correct comment like headers
        field_names = line.split("\t")
        get_field_numbers2extract(field_names, fields2extract)
        field_numbers = fields2extract.values
      else
        fields = line.split("\t")
        record = field_numbers.map{|n| fields[n]}
        if fields2extract[:id_col].nil?
          id = "rec_#{count}" # generate ids when no id column is configured
        else
          id = record.shift
        end
        # record[0] holds the ontology terms column; split it into an array.
        if !record[0].nil?
          record[0] = record[0].split(options[:separator])
        else
          record[0] = []
        end
        record[2] = record[2].to_i if !options[:start_col].nil?
        record[3] = record[3].to_i if !options[:end_col].nil?
        query = records[id]
        if query.nil?
          records[id] = [record]
        else
          query << record
        end
      end
      count += 1
    end
    return records
  end

  # Collects the configured column identifiers; with no header they are
  # converted to integer indices immediately.
  def self.get_fields2extract(options)
    fields2extract = {}
    [:id_col, :ont_col, :chromosome_col, :start_col, :end_col].each do |field|
      col = options[field]
      if !col.nil?
        col = col.to_i if !options[:header]
        fields2extract[field] = col
      end
    end
    return fields2extract
  end

  # Resolves header column names to numeric indices in place.
  def self.get_field_numbers2extract(field_names, fields2extract)
    fields2extract.each do |field, name|
      fields2extract[field] = field_names.index(name)
    end
  end

  # Builds a Cohort from grouped records, optionally translating term names
  # to codes. Returns [cohort, rejected_terms, rejected_record_ids].
  def self.create_cohort(records, options)
    ont = Cohort.get_ontology(Cohort.act_ont)
    rejected_terms = []
    rejected_recs = []
    cohort = Cohort.new()
    records.each do |id, record|
      rec = record.first
      terms = rec.first
      if options[:names] # The cohort file has the terms by name and not by code
        init_term_number = terms.length
        terms, rec_rejected_terms = ont.translate_names(terms)
        if !rec_rejected_terms.empty?
          STDERR.puts "WARNING: record #{id} has the unknown term NAMES '#{rec_rejected_terms.join(',')}'. Terms removed."
          rejected_terms.concat(rec_rejected_terms)
        end
        if terms.empty? && init_term_number > 0
          rejected_recs << id
          next
        end
      end
      if rec.length > 1 # there is genomic region attributes
        variants = record.map{|v| v[1..3] }
      else
        variants = [] # Not exists genomic region attributes so we create a empty array
      end
      cohort.add_record([id, terms, check_variants(variants)])
    end
    return cohort, rejected_terms.uniq, rejected_recs
  end

  # Drops variants whose chromosome is undefined ('-').
  # FIX: the original built checked_vars but returned the unfiltered 'vars',
  # so invalid variants were warned about yet still kept.
  def self.check_variants(vars)
    checked_vars = []
    vars.each do |var| #[chr, start, stop]
      if var.first == '-' # the chr must be defined
        STDERR.puts "WARNING: variant #{var.join(',')} has been removed"
      else
        checked_vars << var
      end
    end
    return checked_vars
  end
end
data/lib/pets/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pets
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.4"
3
3
  end