pets 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,144 @@
1
class Genomic_Feature
  # Container of genomic regions indexed by chromosome.
  # Any parameter named gen_fet is expected to be a Genomic_Feature object.
  def initialize(feat_array) # [[chr1, start1, stop1],[chr1, start1, stop1]]
    @regions = {} # chr (Symbol) => [{start:, stop:, to:}, ...]
    @reg_id = -1  # auto-incremented internal region id
    load_features(feat_array)
  end

  # Load an array of [chr, start, stop] triplets into the index.
  def load_features(feat_array)
    feat_array.each do |chr, start, stop|
      chr = chr.to_sym
      region = {start: start, stop: stop, to: @reg_id += 1}
      add_record(@regions, chr, region)
    end
  end

  # Number of indexed chromosomes (NOT the total number of regions).
  def length
    return @regions.length
  end

  # Yields [chr, region] for every stored region.
  def each()
    @regions.each do |chr, regs|
      regs.each do |region|
        yield(chr, region)
      end
    end
  end

  # Chromosome identifiers present in the feature set.
  def get_chr
    return @regions.keys
  end

  # Inclusive length of every region, in iteration order.
  def get_sizes
    sizes = []
    each do |chr, region|
      sizes << region[:stop] - region[:start] + 1
    end
    return sizes
  end

  # Region size histogram as [[size, count], ...] sorted by count.
  # BUGFIX: the original comparator was `{|s| s[1] <=> s[1]}` — an arity-1
  # block comparing an element with itself, i.e. always 0, so no effective
  # ordering was applied. Sort by occurrence count instead.
  def get_summary_sizes
    sizes = Hash.new(0)
    each do |chr, region|
      sizes[region[:stop] - region[:start] + 1] += 1
    end
    return sizes.to_a.sort!{|s1, s2| s1[1] <=> s2[1]}
  end

  # Merge the regions of another Genomic_Feature into this one.
  # 'to' forces all incoming regions to be connected "to" the given id;
  # otherwise each gets a fresh internal id.
  # NOTE: incoming region hashes are tagged in place (gen_fet is mutated).
  def merge(gen_fet, to = nil)
    gen_fet.each do |chr, region|
      region[:to] = to.nil? ? (@reg_id += 1) : to # custom or default id
      add_record(@regions, chr, region)
    end
  end

  # For each [start, stop] window in 'reference', collect the uniq :to ids
  # of the regions in 'genomic_ranges' overlapping that window.
  def get_reference_overlaps(genomic_ranges, reference)
    overlaps = []
    reference.each do |start, stop|
      reg_ids = []
      genomic_ranges.each do |reg|
        reg_ids << reg[:to] if coor_overlap?(start, stop, reg)
      end
      overlaps << reg_ids.uniq
    end
    return overlaps
  end

  # Build cluster regions: windows (computed per 'meth') gathering more than
  # 'ids_per_reg' distinct region ids become clusters tagged with 'tag'.
  # Returns [ids_by_cluster, annotated_full_ref]; annotated_full_ref holds
  # every clustered window as [start, stop, chr, cluster_id].
  def generate_cluster_regions(meth, tag, ids_per_reg = 1)
    compute_windows(meth) # Get putative genome windows
    ids_by_cluster = {}
    annotated_full_ref = [] # All reference windows with uniq id and chr tagged
    @regions.each do |chr, regs|
      reference = @windows[chr]
      overlaps = get_reference_overlaps(regs, reference) # ids matching each window
      clust_numb = 0
      reference.each_with_index do |ref, i|
        current_ids = overlaps[i]
        next if current_ids.length <= ids_per_reg
        clust_id = "#{chr}.#{clust_numb += 1}.#{tag}.#{current_ids.length}"
        current_ids.each do |curr_id|
          add_record(ids_by_cluster, curr_id, clust_id, true)
        end
        annotated_full_ref << ref.dup.concat([chr, clust_id])
      end
    end
    return ids_by_cluster, annotated_full_ref
  end

  # Compute per-chromosome reference windows with the given method.
  # Only :reg_overlap is implemented; other methods leave nil windows.
  def compute_windows(meth)
    @windows = {}
    @regions.each do |chr, regs|
      chr_windows = nil
      chr_windows = compute_region_overlap_windows(regs) if meth == :reg_overlap
      @windows[chr] = chr_windows
    end
  end

  private

  # Append 'record' to hash[key], creating the array on first use.
  # With uniq=true repeated entries are skipped.
  def add_record(hash, key, record, uniq=false)
    query = hash[key]
    if query.nil?
      hash[key] = [record]
    elsif !uniq # repeated entries are accepted
      query << record
    elsif !query.include?(record) # only uniq entries
      query << record
    end
  end

  # Split the covered coordinate space into consecutive [coord, next_coord]
  # windows delimited by every distinct start/stop in genomic_ranges.
  def compute_region_overlap_windows(genomic_ranges)
    reference = []
    reference.concat(genomic_ranges.map{|gr| gr[:start]}) # starts
    reference.concat(genomic_ranges.map{|gr| gr[:stop]})  # stops
    reference.uniq!
    reference.sort!
    final_reference = []
    reference.each_with_index do |coord, i|
      next_coord = reference[i + 1]
      final_reference << [coord, next_coord] if !next_coord.nil?
    end
    return final_reference
  end

  # True when [start, stop] overlaps the region hash 'reg'.
  # Boundary semantics are intentionally asymmetric (window convention).
  def coor_overlap?(start, stop, reg)
    reg_start = reg[:start]
    reg_stop = reg[:stop]
    return (start <= reg_start && stop >= reg_stop) ||
      (start > reg_start && stop < reg_stop) ||
      (stop > reg_start && stop <= reg_stop) ||
      (start >= reg_start && start < reg_stop)
  end
end
data/lib/pets/io.rb ADDED
@@ -0,0 +1,457 @@
1
+ require 'csv'
2
+
3
# Build an HPO Ontology object from either an OBO file or a JSON dump.
# When excluded_hpo_file is given, the listed terms are removed from the
# ontology before it is returned.
def load_hpo_ontology(hpo_file, excluded_hpo_file)
  if hpo_file.include?('.json')
    hpo = Ontology.new
    hpo.read(hpo_file)
    unless excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  else
    if excluded_hpo_file.nil?
      hpo = Ontology.new(file: hpo_file, load_file: true)
    else
      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    end
  end
  return hpo
end
22
+
23
# Read a file with one excluded HPO term per line; returns the chomped list.
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def read_excluded_hpo_file(file)
  excluded_hpo = []
  File.foreach(file) do |line|
    excluded_hpo << line.chomp
  end
  return excluded_hpo
end
30
+
31
# Dump a { key => array } hash as TSV: one line per key, values tab-joined.
# An optional header row is written first when non-empty.
def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |handler|
    handler.puts(header.join("\t")) unless header.empty?
    hash.each { |key, array| handler.puts("#{key}\t#{array.join("\t")}") }
  end
end
39
+
40
# Write one record per line; String records go verbatim, any other record
# is tab-joined.
def write_array(array, file_path)
  File.open(file_path, 'w') do |handler|
    array.each do |record|
      line = record.class == String ? record : record.join("\t")
      handler.puts(line)
    end
  end
end
52
+
53
# Write a matrix as TSV readable from R: x_names as the header row and
# y_names as per-row labels.
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |f|
    f.puts(x_names.join("\t"))
    matrix.each_with_index do |row, i|
      f.puts([y_names[i], *row].join("\t"))
    end
  end
end
61
+
62
# Write per-cluster IC values, one row per (cluster, profile) pair, for at
# most 'limit' clusters. Columns: cluster_id (clusterLength_index), ic, Plen.
def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |f|
    f.puts(%w[cluster_id ic Plen].join("\t"))
    all_ics.each_with_index do |cluster_ics, i|
      break if i == limit
      cluster_ics.each_with_index do |clust_ic, j|
        f.puts("#{cluster_ics.length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}")
      end
    end
  end
end
74
+
75
# Write chromosome counts per cluster, re-labelling clusters as
# "patientNumber_index"; stops once 'limit' distinct clusters were emitted.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |f|
    f.puts(%w[cluster_id chr count].join("\t"))
    index = 0
    last_id = cluster_data.first.first unless cluster_data.empty?
    cluster_data.each do |cluster_id, patient_number, chr, count|
      index += 1 unless cluster_id == last_id
      break if index == limit
      f.puts(["#{patient_number}_#{index}", chr, count].join("\t"))
      last_id = cluster_id
    end
  end
end
88
+
89
# Write coverage triplets (chr, position, frequency) as a three-column TSV.
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |f|
    coverage_to_plot.each { |chr, position, freq| f.puts([chr, position, freq].join("\t")) }
  end
end
96
+
97
+
98
+ def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
99
+ CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
100
+ suggested_childs.each do |pat_id, suggestions|
101
+ warning = nil
102
+ warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
103
+ csv << ["PATIENT #{pat_id}", "#{warning}"]
104
+ csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
105
+ suggestions.each do |parent, childs|
106
+ parent_code, parent_name = parent
107
+ if childs.empty?
108
+ csv << ["#{parent_name} (#{parent_code})", '-']
109
+ else
110
+ parent_writed = false
111
+ childs.each do |child_code, child_name|
112
+ if !parent_writed
113
+ parent_field = "#{parent_name} (#{parent_code})"
114
+ parent_writed = true
115
+ else
116
+ parent_field = ""
117
+ end
118
+ csv << [parent_field, "#{child_name} (#{child_code})"]
119
+ end
120
+ end
121
+ end
122
+ csv << ["", ""]
123
+ end
124
+ end
125
+ end
126
+
127
# Write paired x/y values as a two-column TSV with named headers.
# Raises when y_axis_value has no value for some x position.
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |f|
    f.puts("#{x_axis_name}\t#{y_axis_name}")
    x_axis_value.each_with_index do |value, i|
      y_value = y_axis_value[i]
      raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
      f.puts("#{value}\t#{y_value}")
    end
  end
end
137
+
138
+
139
# Write a similarity matrix as plain TSV, one row per line, no headers.
def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
  File.open(similarity_matrix_file, 'w') do |f|
    similarity_matrix.each { |row| f.puts(row.join("\t")) }
  end
end
146
+
147
# Flatten a nested { a => { b => value } } structure into a 3-column TSV.
def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |f|
    similarity_pairs.each do |pairsA, pairsB_and_values|
      pairsB_and_values.each { |pairsB, values| f.puts("#{pairsA}\t#{pairsB}\t#{values}") }
    end
  end
end
156
+
157
# Write the average-HPO-per-patient distribution as a two-column TSV.
def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |f|
    f.puts("PatientsNumber\tHPOAverage")
    average_hp_per_pat_distribution.each do |patient_num, ave|
      f.puts("#{patient_num}\t#{ave}")
    end
  end
end
165
+
166
# Read a patient→cluster TSV and group patient HPO profiles by cluster.
# patient_data must respond to get_profile(patientID).
# Returns [clusters_table, clusters_info]; clusters_table rows are
# [clusterID, n_patients, patient_ids, profiles].
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  clusters_table = []
  File.foreach(clusters_file) do |line|
    patientID, clusterID = line.chomp.split("\t")
    patientHPOProfile = patient_data.get_profile(patientID)
    query = clusters_info[clusterID]
    if query.nil?
      clusters_info[clusterID] = {patientID => patientHPOProfile}
    else
      query[patientID] = patientHPOProfile
    end
  end
  clusters_info.each do |clusterID, patients_info|
    clusters_table << [clusterID, patients_info.keys.length, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end
186
+
187
# Load id → HPO-profile pairs from a TSV (id \t comma-separated HPO codes).
# Profiles are validated via hpo.check_ids and cleaned via hpo.clean_profile;
# records whose cleaned profile is empty are skipped.
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def load_profiles(file_path, hpo)
  profiles = {}
  File.foreach(file_path) do |line|
    id, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
    end
  end
  return profiles
end
201
+
202
# Load per-sample variant files (*.tab: chr \t position) from a folder and
# wrap each sample's variants in a Genomic_Feature (start == stop).
# Returns { sample_basename => Genomic_Feature }.
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def load_variants(variant_folder)
  variants = {}
  Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
    vars = []
    File.foreach(path) do |line|
      fields = line.chomp.split("\t")
      start = fields[1].to_i
      vars << [fields[0], start, start]
    end
    variants[File.basename(path, '.tab')] = Genomic_Feature.new(vars)
  end
  return variants
end
217
+
218
# Collect genomic coordinates (*.coords) and HPO evidence profiles
# (*_HP.txt) from a folder. Returns [evidences, genomic_coordinates] where
# evidences maps the file basename to {prof:, id2lab:}.
def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  Dir.glob(File.join(evidences_path, '*.coords')).each do |cd_f|
    genomic_coordinates[File.basename(cd_f, '.coords')] = load_coordinates(cd_f)
  end
  evidences = {}
  Dir.glob(File.join(evidences_path, '*_HP.txt')).each do |e_f|
    profiles, id2label = load_evidence_profiles(e_f, hpo)
    evidences[File.basename(e_f, '.txt')] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end
235
+
236
# Parse a .coords TSV (entity, chr, strand, start, stop), skipping the first
# (header) line. Returns entity => [chr, start, stop, strand].
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def load_coordinates(file_path)
  coordinates = {}
  header = true
  File.foreach(file_path) do |line|
    if header
      header = false
      next
    end
    entity, chr, strand, start, stop = line.chomp.split("\t")
    coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
  end
  return coordinates
end
250
+
251
# Load id→profile and id→label maps from a TSV (id, label, comma-joined HPO
# codes). Profiles are validated and cleaned via the hpo object; records
# whose checked profile is empty are skipped entirely (no label stored).
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def load_evidence_profiles(file_path, hpo)
  profiles = {}
  id2label = {}
  File.foreach(file_path) do |line|
    id, label, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
      id2label[id] = label
    end
  end
  return profiles, id2label
end
267
+
268
+ #Common methods for predictors
269
+ #Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
270
+ #1. Indexing by chr (region)
271
# True when [start, stop] overlaps the reference [ref_start, ref_stop].
# Boundary semantics are asymmetric (a segment ending exactly at ref_start
# does not count as overlapping), matching the window convention used by
# the region predictors.
def coor_overlap?(ref_start, ref_stop, start, stop)
  return true if start <= ref_start && stop >= ref_stop # engulfs reference
  return true if start > ref_start && stop < ref_stop   # inside reference
  return true if stop > ref_start && stop <= ref_stop   # left-side overlap
  return true if start >= ref_start && start < ref_stop # right-side overlap
  return false
end
281
+
282
# Index training records by chromosome: chr => [[start, stop, node, hpo,
# association], ...]. Delegates parsing to loadFile.
def load_training_file4regions(training_file)
  training_set = {}
  loadFile(training_file).each do |info|
    chr = info.shift
    (training_set[chr] ||= []) << info
  end
  return training_set
end
296
+
297
+ #2. Indexing by hpo (code)
298
+ #prepare training file for analysis using phenotype2region prediction
299
# Index training records by HPO code (extracted from column 4 of each
# loadFile row). Rows below 'thresold' association are filtered by loadFile.
def load_training_file4HPO(training_file, thresold=0)
  training_set = {}
  loadFile(training_file, thresold).each do |info|
    hpoCode = info.delete_at(4)
    (training_set[hpoCode] ||= []) << info
  end
  return training_set
end
314
+
315
+
316
+ #3. Load training info file:
317
+ #Chr;Start;Stop;HPO;Association;node
318
# Load a training TSV (chr, start, stop, HPO, association, node) keeping
# only rows with association value >= thresold [sic].
# Returns rows as [chr, start, stop, nodeID, hpoCode, association].
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def loadFile(file, thresold=0)
  information = []
  File.foreach(file) do |line|
    allInfo = line.chomp.split("\t")
    associationValue = allInfo[4].to_f
    next if associationValue < thresold
    chr, startPos, stopPos, hpoCode, _assoc, nodeID = allInfo
    information << [chr, startPos.to_i, stopPos.to_i, nodeID, hpoCode, associationValue]
  end
  return information
end
335
+
336
# Load HPO code (Symbol) → information-coefficient map from a 2-column TSV.
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.foreach(information_coefficient_file) do |line|
    hpo_code, ci = line.chomp.split("\t")
    hpos_ci_values[hpo_code.to_sym] = ci.to_f
  end
  return hpos_ci_values
end
345
+
346
# Group patient ids by cluster id from a 2-column TSV (patient, cluster).
# Uses File.foreach so the handle is closed (File.open(...).each leaked it).
def load_clustered_patients(file)
  clusters = {}
  File.foreach(file) do |line|
    pat_id, cluster_id = line.chomp.split("\t")
    (clusters[cluster_id] ||= []) << pat_id
  end
  return clusters
end
360
+
361
# Parse a gzipped NCBI annotation file (GFF-like, tab separated) into:
#   gene_list:     Entrez GeneID => [geneName, geneSyns, description]
#   gene_location: chromosome => [[geneID, start, stop], ...]
# NOTE(review): URI.unescape was removed in Ruby 3.0 — confirm the target
# Ruby version or switch to an equivalent %-decoder.
# NOTE(review): the file/gzip handles are never closed; consider block form.
def load_gene_data(gene_data_path)
  gene_list = {} #geneID => attr
  gene_location = {} # chr => gene
  infile = open(gene_data_path)
  gz = Zlib::GzipReader.new(infile)
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
    next if line =~ /^#/ # skip comment headers
    fields = line.split("\t")
    if fields[8].include?('genome=chromosome')
      # region record announcing a new chromosome: flush collected genes
      chr = fields[8].split(';')[1].split('=').last
      gene_location[current_chr] = genes
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      # attribute column 8 is a ';'-separated list of key=value pairs
      attributes = {}
      fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
      end
      geneName = nil
      geneName = attributes['gene'] if !attributes['gene'].nil?
      geneSyns = []
      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
      description = attributes['description']
      description = URI.unescape(description) if !description.nil?
      attributes['Dbxref'] =~ /GeneID:(\d+)/ # sets $1 to the Entrez GeneID
      gene_list[$1] = [geneName, geneSyns, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  gene_location[current_chr] = genes # flush genes of the last chromosome
  return gene_list, gene_location
end
397
+
398
# Query the KEGG REST API in batches of 10 genes and parse the flat-file
# responses. Returns geneID => [gene_names, definition, pathways].
# NOTE(review): performs live HTTP requests with no error handling/retries,
# and consumes query_genes destructively (shift).
def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10)
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    # KEGG flat file: records separated by '///', fields keyed by first word
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        parsing_pathway_field = false # any other section ends the PATHWAY block
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2) # PATHWAY continuation line
      elsif line == '///'
        # end of record: store accumulated data and reset the accumulators
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end
435
+
436
# Serialize 'data' as JSON and write it gzip-compressed to 'path'.
def write_compressed_plain_file(data, path)
  File.open(path, 'w') do |f|
    writer = Zlib::GzipWriter.new(f)
    writer.write(data.to_json)
    writer.close
  end
end
443
+
444
# Read a gzip-compressed JSON file and return the parsed object.
# Uses the block form of GzipReader so both the gzip stream and the
# underlying file are closed (the original left them open).
def read_compressed_json(path)
  object = nil
  Zlib::GzipReader.open(path) do |gz|
    object = JSON.parse(gz.read)
  end
  return object
end
450
+
451
# Fetch 'path' from an anonymous FTP server and save it locally as 'name'.
def download(ftp_server, path, name)
  ftp = Net::FTP.new
  ftp.connect(ftp_server)
  ftp.login
  ftp.getbinaryfile(path, name)
  ftp.close
end
@@ -0,0 +1,106 @@
1
class Cohort_Parser
  # Parse a cohort table file (per 'options') into a Cohort object.
  # Returns [cohort, rejected_terms, rejected_recs].
  def self.load(options)
    fields2extract = get_fields2extract(options)
    field_numbers = fields2extract.values
    records = read_records(options, fields2extract, field_numbers)
    cohort, rejected_terms, rejected_recs = create_cohort(records, options)
    return cohort, rejected_terms, rejected_recs
  end

  # Read the input table and group rows by record id (autogenerated when no
  # id column is configured). Each stored row is [terms, chr, start, stop]
  # minus any columns not requested.
  # Uses File.foreach so the handle is closed (File.open(...).each leaked it).
  def self.read_records(options, fields2extract, field_numbers)
    records = {}
    count = 0
    File.foreach(options[:input_file]) do |line|
      line.chomp!
      if options[:header] && count == 0
        line.gsub!(/#\s*/,'') # correct comment like headers
        field_names = line.split("\t")
        get_field_numbers2extract(field_names, fields2extract)
        field_numbers = fields2extract.values
      else
        fields = line.split("\t")
        record = field_numbers.map{|n| fields[n]}
        if fields2extract[:id_col].nil?
          id = "rec_#{count}" #generate ids
        else
          id = record.shift
        end
        # first payload column carries the ontology terms (separator-joined)
        if !record[0].nil?
          record[0] = record[0].split(options[:separator])
        else
          record[0] = []
        end
        record[2] = record[2].to_i if !options[:start_col].nil?
        record[3] = record[3].to_i if !options[:end_col].nil?
        query = records[id]
        if query.nil?
          records[id] = [record]
        else
          query << record
        end
      end
      count += 1
    end
    return records
  end

  # Collect the requested column specs: integer indexes without a header,
  # or column names (resolved later) when a header is present.
  def self.get_fields2extract(options)
    fields2extract = {}
    [:id_col, :ont_col, :chromosome_col, :start_col, :end_col].each do |field|
      col = options[field]
      if !col.nil?
        col = col.to_i if !options[:header] # positional columns come as strings
        fields2extract[field] = col
      end
    end
    return fields2extract
  end

  # Resolve header names to column indexes, mutating fields2extract in place.
  def self.get_field_numbers2extract(field_names, fields2extract)
    fields2extract.each do |field, name|
      fields2extract[field] = field_names.index(name)
    end
  end

  # Build a Cohort from grouped records, translating term names to codes
  # when options[:names] is set. Records whose whole profile is rejected
  # are collected in rejected_recs.
  def self.create_cohort(records, options)
    ont = Cohort.get_ontology(Cohort.act_ont)
    rejected_terms = []
    rejected_recs = []
    cohort = Cohort.new()
    records.each do |id, record|
      rec = record.first
      terms = rec.first
      if options[:names] # the table carries term NAMES, translate to codes
        init_term_number = terms.length
        terms, rec_rejected_terms = ont.translate_names(terms)
        if !rec_rejected_terms.empty?
          STDERR.puts "WARNING: record #{id} has the unknown term NAMES '#{rec_rejected_terms.join(',')}'. Terms removed."
          rejected_terms.concat(rec_rejected_terms)
        end
        if terms.empty? && init_term_number > 0
          rejected_recs << id
          next
        end
      end
      if rec.length > 1 # there is genomic region attributes
        variants = record.map{|v| v[1..3] }
      else
        variants = [] # Not exists genomic region attributes so we create a empty array
      end
      cohort.add_record([id, terms, check_variants(variants)])
    end
    return cohort, rejected_terms.uniq, rejected_recs
  end

  # Drop variants without a defined chromosome ('-'), warning per removal.
  # BUGFIX: the original returned the unfiltered 'vars' array, so flagged
  # variants were warned about but never actually removed.
  def self.check_variants(vars)
    checked_vars = []
    vars.each do |var| #[chr, start, stop]
      if var.first == '-' # the chr must be defined
        STDERR.puts "WARNING: variant #{var.join(',')} has been removed"
      else
        checked_vars << var
      end
    end
    return checked_vars
  end
end
data/lib/pets/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pets
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.4"
3
3
  end