pets 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
data/lib/pets/io.rb ADDED
@@ -0,0 +1,481 @@
1
+ require 'csv'
2
+ require 'bio-vcf'
3
+
4
# Builds an HPO Ontology object from either an OBO-style file or a JSON dump.
# JSON files are detected by extension; for them, exclusions are applied after
# loading (remove + reindex). For non-JSON files exclusions are passed to the
# constructor as removable_terms.
# @param hpo_file [String] ontology file path (.json or other format)
# @param excluded_hpo_file [String, nil] optional file listing terms to drop
# @return [Ontology] the loaded (and possibly pruned) ontology
def load_hpo_ontology(hpo_file, excluded_hpo_file)
  if hpo_file.include?('.json')
    hpo = Ontology.new
    hpo.read(hpo_file)
    unless excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  else
    if excluded_hpo_file.nil?
      hpo = Ontology.new(file: hpo_file, load_file: true)
    else
      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    end
  end
  return hpo
end
23
+
24
# Reads a plain-text exclusion list of HPO term identifiers, one per line.
# @param file [String] path to the exclusion list
# @return [Array<String>] the chomped lines
def read_excluded_hpo_file(file)
  File.readlines(file).map(&:chomp)
end
31
+
32
# Dumps a hash of key => array-of-values as a tab-separated file,
# one row per key. An optional header row is written first.
# @param hash [Hash] key => Array
# @param file_path [String] output path
# @param header [Array] optional column names (omitted when empty)
def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |out|
    out.puts(header.join("\t")) unless header.empty?
    hash.each { |key, values| out.puts("#{key}\t#{values.join("\t")}") }
  end
end
40
+
41
# Writes an array to file, one record per line. String records are written
# verbatim; any other record is assumed joinable and written tab-separated.
# @param array [Array<String, Array>] records to serialize
# @param file_path [String] output path
def write_array(array, file_path)
  File.open(file_path, 'w') do |out|
    array.each do |record|
      out.puts(record.is_a?(String) ? record : record.join("\t"))
    end
  end
end
53
+
54
# Serializes a matrix in the TSV layout R's read.table expects:
# first line = column names, then one row name followed by the row values.
# @param matrix [Array<Array>] row-major matrix
# @param x_names [Array] column labels
# @param y_names [Array] row labels (parallel to matrix rows)
# @param file [String] output path
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |out|
    out.puts(x_names.join("\t"))
    matrix.each_with_index do |row, idx|
      out.puts(([y_names[idx]] + row).join("\t"))
    end
  end
end
62
+
63
# Writes per-cluster information-content records for plotting.
# Each row is: "<cluster_size>_<cluster_index>"  IC  profile_length.
# Stops once `limit` clusters have been emitted.
# @param all_ics [Array<Array<Numeric>>] ICs per cluster
# @param profile_lengths [Array<Array<Integer>>] lengths parallel to all_ics
# @param cluster_ic_data_file [String] output path
# @param limit [Integer] maximum number of clusters to write
def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |out|
    out.puts(%w[cluster_id ic Plen].join("\t"))
    all_ics.each_with_index do |cluster_ics, clust_idx|
      break if clust_idx == limit
      tag = "#{cluster_ics.length}_#{clust_idx}" # cluster id encodes its size
      cluster_ics.each_with_index do |ic, term_idx|
        out.puts("#{tag}\t#{ic}\t#{profile_lengths[clust_idx][term_idx]}")
      end
    end
  end
end
75
+
76
# Writes a per-cluster chromosome count table for plotting.
# Expects cluster_data rows of [cluster_id, patient_number, chr, count],
# with rows of the same cluster contiguous: the running `index` advances
# each time the cluster id changes, and output stops after `limit`
# distinct clusters. Output row id is "<patient_number>_<index>".
# NOTE(review): relies on the caller pre-sorting cluster_data by cluster —
# confirm at call sites.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |f|
    f.puts %w[cluster_id chr count].join("\t")
    index = 0
    last_id = cluster_data.first.first unless cluster_data.empty? # seed with first cluster id
    cluster_data.each do |cluster_id, patient_number, chr, count|
      index += 1 if cluster_id != last_id # new cluster encountered
      break if index == limit
      f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
      last_id = cluster_id
    end
  end
end
89
+
90
# Writes chromosome coverage triplets (chr, position, frequency) as TSV.
# @param coverage_to_plot [Array<Array>] [chr, position, freq] records
# @param coverage_to_plot_file [String] output path
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |out|
    coverage_to_plot.each do |chr, position, freq|
      out.puts([chr, position, freq].join("\t"))
    end
  end
end
97
+
98
+
99
# Writes, per patient, a CSV section pairing each current HPO term with the
# more specific (child) terms suggested for it. Sections are separated by a
# blank row; a warning is attached when a patient has fewer than 4 phenotypes.
# @param suggested_childs [Hash] patient id => array of [ [parent_code, parent_name], childs ]
#   where childs is an array of [child_code, child_name] pairs (may be empty)
# @param detailed_profile_evaluation_file [String] output CSV path
# @param summary_stats NOTE(review): accepted but never used here — confirm
#   whether callers still need to pass it before removing.
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = nil
      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
      csv << ["PATIENT #{pat_id}", "#{warning}"]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |parent, childs|
        parent_code, parent_name = parent
        if childs.empty?
          # no refinement available for this term
          csv << ["#{parent_name} (#{parent_code})", '-']
        else
          # emit the parent label only on the first child row of the group
          parent_writed = false
          childs.each do |child_code, child_name|
            if !parent_writed
              parent_field = "#{parent_name} (#{parent_code})"
              parent_writed = true
            else
              parent_field = ""
            end
            csv << [parent_field, "#{child_name} (#{child_code})"]
          end
        end
      end
      csv << ["", ""] # blank row separates patients
    end
  end
end
127
+
128
# Writes two parallel arrays as a two-column TSV for scatter plotting.
# @param x_axis_value [Array] x values
# @param y_axis_value [Array] y values, parallel to x
# @param filename [String] output path
# @param x_axis_name [String] first column header
# @param y_axis_name [String] second column header
# @raise [RuntimeError] when y_axis_value is shorter than x_axis_value
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |out|
    out.puts("#{x_axis_name}\t#{y_axis_name}")
    x_axis_value.each_with_index do |x_val, idx|
      y_val = y_axis_value[idx]
      raise("The #{idx} position is not presented in y_axis_value") if y_val.nil?
      out.puts([x_val, y_val].join("\t"))
    end
  end
end
138
+
139
+
140
# Writes a similarity matrix as a plain TSV, one matrix row per line.
# @param similarity_matrix [Array<Array>] row-major matrix
# @param similarity_matrix_file [String] output path
def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
  File.open(similarity_matrix_file, 'w') do |out|
    similarity_matrix.each { |row| out.puts(row.join("\t")) }
  end
end
147
+
148
# Flattens a nested hash of pairwise similarity values
# (idA => {idB => value}) into TSV rows "idA idB value".
# @param similarity_pairs [Hash{Object => Hash}] nested pair => value map
# @param filename [String] output path
def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |out|
    similarity_pairs.each do |pair_a, partners|
      partners.each do |pair_b, value|
        out.puts("#{pair_a}\t#{pair_b}\t#{value}")
      end
    end
  end
end
157
+
158
# Writes the distribution of average HPO terms per patient as a
# two-column TSV (PatientsNumber, HPOAverage).
# @param average_hp_per_pat_distribution [Enumerable] [patient_count, average] pairs
# @param output_file [String] output path
def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |out|
    out.puts(['PatientsNumber', 'HPOAverage'].join("\t"))
    average_hp_per_pat_distribution.each do |patient_num, average|
      out.puts([patient_num, average].join("\t"))
    end
  end
end
166
+
167
# Parses a patient-to-cluster assignment file (TSV: patientID, clusterID) and
# groups the patients' HPO profiles by cluster.
# @param clusters_file [String] TSV path
# @param patient_data [#get_profile] object resolving a patient id to its profile
# @return [Array(Array, Hash)] a summary table of
#   [clusterID, patient_count, patient_ids, profiles] rows, and the
#   clusterID => {patientID => profile} map it was built from
def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  File.foreach(clusters_file) do |line|
    patient_id, cluster_id = line.chomp.split("\t")
    profile = patient_data.get_profile(patient_id)
    (clusters_info[cluster_id] ||= {})[patient_id] = profile
  end
  clusters_table = clusters_info.map do |cluster_id, patients_info|
    [cluster_id, patients_info.keys.length, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end
187
+
188
# Loads phenotype profiles from a TSV file (id TAB comma-separated HPO codes),
# validating and cleaning each profile against the ontology. Profiles that end
# up empty after validation/cleaning are dropped.
# @param file_path [String] TSV path
# @param hpo [#check_ids, #clean_profile] ontology used for validation
# @return [Hash{String => Array<Symbol>}] id => cleaned HPO term list
def load_profiles(file_path, hpo)
  profiles = {}
  File.foreach(file_path) do |line|
    id, profile = line.chomp.split("\t")
    terms = profile.split(',').map(&:to_sym)
    valid_terms, _rejected = hpo.check_ids(terms)
    next if valid_terms.empty?
    cleaned = hpo.clean_profile(valid_terms)
    profiles[id] = cleaned unless cleaned.empty?
  end
  return profiles
end
202
+
203
# Loads per-profile variant sets from a folder of tabular (.tab/.txt) and
# VCF (.vcf/.vcf.gz) files. The file basename up to the first dot is used as
# the profile id.
# @param variant_folder [String] directory to scan
# @return [Hash{String => Genomic_Feature}] profile id => variants
def load_variants(variant_folder)
  variants = {}
  # FIX: the glob previously omitted '*.txt' even though the dispatch below
  # accepted the 'txt' extension, so .txt files were silently ignored.
  Dir.glob(File.join(variant_folder, '*.{tab,txt,vcf,vcf.gz}')).each do |path|
    profile_id, ext = File.basename(path).split('.', 2)
    if ext == 'tab' || ext == 'txt'
      vars = load_tabular_vars(path)
    elsif ext == 'vcf' || ext == 'vcf.gz'
      vars = load_vcf(path, ext)
    else
      next # defensive: never wrap a nil variant list in a Genomic_Feature
    end
    variants[profile_id] = Genomic_Feature.new(vars)
  end
  return variants
end
216
+
217
# Loads variants from a two-column TSV (chromosome, position). The 'chr'
# prefix is stripped and each variant becomes a zero-length [chr, pos, pos]
# interval.
# @param path [String] TSV path
# @return [Array<Array>] [chr, start, start] triplets
def load_tabular_vars(path)
  vars = []
  File.foreach(path) do |line|
    chr, pos = line.chomp.split("\t")
    position = pos.to_i
    vars << [chr.gsub('chr', ''), position, position]
  end
  return vars
end
227
+
228
# Loads variant positions from a VCF file (optionally gzip-compressed).
# Some compressed files are fragmented internally; if so, VCFfile only reads
# the first fragment. Use `zcat original.vcf.gz | gzip > new.vcf.gz` to
# obtain a contiguous file.
# @param path [String] VCF path
# @param ext [String] 'vcf' or 'vcf.gz' (controls gzip handling)
# @return [Array<Array>] [chr, pos, pos] triplets, 'chr' prefix stripped
def load_vcf(path, ext)
  vars = []
  vcf = BioVcf::VCFfile.new(file: path, is_gz: ext == 'vcf.gz')
  vcf.each do |var|
    vars << [var.chrom.gsub('chr', ''), var.pos, var.pos]
  end
  # FIX: removed stray debugging `puts vars.length` that polluted stdout on
  # every load.
  return vars
end
237
+
238
# Scans an evidence folder, loading genomic coordinates from '*.coords'
# files (keyed by entity = basename) and phenotype evidence profiles from
# '*_HP.txt' files (keyed by pair = basename without '.txt').
# @param evidences_path [String] directory to scan
# @param hpo [Object] ontology forwarded to load_evidence_profiles
# @return [Array(Hash, Hash)] evidences (pair => {prof:, id2lab:}) and
#   genomic_coordinates (entity => coordinates)
def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  Dir.glob(File.join(evidences_path, '*.coords')).each do |coord_file|
    entity = File.basename(coord_file, '.coords')
    genomic_coordinates[entity] = load_coordinates(coord_file)
  end
  evidences = {}
  Dir.glob(File.join(evidences_path, '*_HP.txt')).each do |evidence_file|
    pair = File.basename(evidence_file, '.txt')
    profiles, id2label = load_evidence_profiles(evidence_file, hpo)
    evidences[pair] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end
255
+
256
# Loads entity coordinates from a headered TSV
# (entity, chr, strand, start, stop). Records whose chromosome is 'NA' are
# skipped with a warning on STDERR.
# @param file_path [String] TSV path (first line is a header and is skipped)
# @return [Hash{String => Array}] entity => [chr, start, stop, strand]
def load_coordinates(file_path)
  coordinates = {}
  first_line = true
  File.foreach(file_path) do |line|
    if first_line # skip the header row
      first_line = false
      next
    end
    fields = line.chomp.split("\t")
    entity, chr, strand, start, stop = fields
    if chr == 'NA'
      STDERR.puts "Warning: Record #{fields.inspect} is undefined"
      next
    end
    coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
  end
  return coordinates
end
274
+
275
# Loads evidence profiles from a TSV file (id, label, comma-separated HPO
# codes), validating and cleaning each profile against the ontology.
# The label map records every record whose terms passed validation, even if
# the cleaned profile ended up empty (matching historical behavior).
# @param file_path [String] TSV path
# @param hpo [#check_ids, #clean_profile] ontology used for validation
# @return [Array(Hash, Hash)] id => cleaned profile, and id => label
def load_evidence_profiles(file_path, hpo)
  profiles = {}
  id2label = {}
  File.foreach(file_path) do |line|
    id, label, profile = line.chomp.split("\t")
    terms = profile.split(',').map(&:to_sym)
    valid_terms, _rejected = hpo.check_ids(terms)
    next if valid_terms.empty?
    cleaned = hpo.clean_profile(valid_terms)
    profiles[id] = cleaned unless cleaned.empty?
    id2label[id] = label
  end
  return profiles, id2label
end
291
+
292
+ #Common methods for predictors
293
+ #Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
294
+ #1. Indexing by chr (region)
295
# Tests whether the interval [start, stop] overlaps the reference interval
# [ref_start, ref_stop]. The four historical cases are preserved verbatim
# (note the deliberately asymmetric boundary handling: an interval touching
# only at ref_start or ref_stop does not count as overlap unless it fully
# covers the reference).
# @return [Boolean]
def coor_overlap?(ref_start, ref_stop, start, stop)
  right_edge_inside = stop > ref_start && stop <= ref_stop
  left_edge_inside = start >= ref_start && start < ref_stop
  covers_reference = start <= ref_start && stop >= ref_stop
  contained = start > ref_start && stop < ref_stop
  return right_edge_inside || left_edge_inside || covers_reference || contained
end
305
+
306
# Indexes training records by chromosome (region-based lookup).
# Each record from loadFile has its chromosome removed and the remainder
# is grouped under that chromosome.
# @param training_file [String] path passed through to loadFile
# @return [Hash{String => Array<Array>}] chr => records (without the chr field)
def load_training_file4regions(training_file)
  training_set = {}
  loadFile(training_file).each do |info|
    chr = info.shift
    (training_set[chr] ||= []) << info
  end
  return training_set
end
320
+
321
+ #2. Indexing by hpo (code)
322
+ #prepare training file for analysis using phenotype2region prediction
323
# Indexes training records by HPO code, preparing the training file for
# phenotype-to-region prediction. The HPO code (position 4) is removed from
# each record and used as the grouping key.
# @param training_file [String] path passed through to loadFile
# @param thresold [Numeric] minimum association value (name kept for
#   backward compatibility)
# @return [Hash{String => Array<Array>}] hpo code => records (without the code)
def load_training_file4HPO(training_file, thresold=0)
  training_set = {}
  loadFile(training_file, thresold).each do |info|
    hpo_code = info.delete_at(4)
    (training_set[hpo_code] ||= []) << info
  end
  return training_set
end
338
+
339
+
340
+ #3. Load training info file:
341
+ #Chr;Start;Stop;HPO;Association;node
342
# Loads training records from a TSV file with columns
# Chr, Start, Stop, HPO, Association, node. Records whose association value
# falls below `thresold` are discarded.
# @param file [String] TSV path
# @param thresold [Numeric] minimum association value (name kept for
#   backward compatibility)
# @return [Array<Array>] [chr, start, stop, node, hpo, association] records
def loadFile(file, thresold=0)
  information = []
  File.foreach(file) do |line|
    chr, start_pos, stop_pos, hpo_code, assoc, node_id = line.chomp.split("\t")
    association_value = assoc.to_f
    next if association_value < thresold
    information << [chr, start_pos.to_i, stop_pos.to_i, node_id, hpo_code, association_value]
  end
  return information
end
359
+
360
# Loads per-term information coefficients from a TSV file
# (HPO code, coefficient). Codes are symbolized for ontology lookups.
# @param information_coefficient_file [String] TSV path
# @return [Hash{Symbol => Float}] hpo code => information coefficient
def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.foreach(information_coefficient_file) do |line|
    hpo_code, ci = line.chomp.split("\t")
    hpos_ci_values[hpo_code.to_sym] = ci.to_f
  end
  return hpos_ci_values
end
369
+
370
# Groups patient ids by cluster from a TSV file (patient_id, cluster_id).
# @param file [String] TSV path
# @return [Hash{String => Array<String>}] cluster id => patient ids
def load_clustered_patients(file)
  clusters = {}
  File.foreach(file) do |line|
    pat_id, cluster_id = line.chomp.split("\t")
    (clusters[cluster_id] ||= []) << pat_id
  end
  return clusters
end
384
+
385
# Parses a gzip-compressed NCBI-style annotation file, collecting gene
# attributes and per-chromosome gene locations. Chromosome context is
# tracked from 'genome=chromosome' marker lines; gene rows (column 2 ==
# 'gene') contribute to both outputs.
# @param gene_data_path [String] path to a .gz annotation file
# @return [Array(Hash, Hash)] gene_list (GeneID => [name, synonyms, description])
#   and gene_location (chr => [[GeneID, start, stop], ...])
# NOTE(review): depends on Zlib and URI being required elsewhere — neither is
# required at the top of this file.
def load_gene_data(gene_data_path)
  gene_list = {} #geneID => attr
  gene_location = {} # chr => gene
  infile = open(gene_data_path)
  gz = Zlib::GzipReader.new(infile)
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
    next if line =~ /^#/
    fields = line.split("\t")
    # A 'genome=chromosome' region row: flush genes collected so far under the
    # previous chromosome and switch context to the new one.
    if fields[8].include?('genome=chromosome')
      chr = fields[8].split(';')[1].split('=').last
      gene_location[current_chr] = genes
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      # Column 8 holds 'key=value' attribute pairs separated by ';'.
      attributes = {}
      fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
      end
      geneName = nil
      geneName = attributes['gene'] if !attributes['gene'].nil?
      geneSyns = []
      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
      description = attributes['description']
      # NOTE(review): URI.unescape was deprecated and removed in Ruby 3.0 —
      # this line breaks on modern Rubies; consider
      # URI.decode_www_form_component or CGI.unescape (verify '+' handling).
      description = URI.unescape(description) if !description.nil?
      # Extracts the numeric GeneID from the Dbxref attribute into $1.
      attributes['Dbxref'] =~ /GeneID:(\d+)/
      gene_list[$1] = [geneName, geneSyns, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  gene_location[current_chr] = genes # flush the final chromosome
  return gene_list, gene_location
end
421
+
422
# Fetches gene annotations from the KEGG REST API in batches of 10 and
# parses the flat-file response into per-gene records.
# @param query_genes [Array<String>] gene ids; CONSUMED destructively (shift)
# @return [Hash{String => Array}] geneID => [gene_names, definition, pathways]
# NOTE(review): performs live HTTP requests; depends on Net::HTTP and URI
# being required elsewhere. Records are only committed on the '///' entry
# terminator, so a truncated response silently drops the last entry.
def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10) # KEGG allows up to 10 ids per GET
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    # KEGG flat-file format: uppercase field tags in column 1, continuation
    # lines indented. squeeze(' ') collapses the alignment padding.
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true # continuation lines belong to PATHWAY
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        parsing_pathway_field = false # any other tag ends the PATHWAY block
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2)
      elsif line == '///'
        # entry terminator: commit the accumulated record and reset state
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end
459
+
460
# Serializes an object to JSON and writes it gzip-compressed to `path`.
# @param data [Object] anything responding to #to_json
# @param path [String] output path
def write_compressed_plain_file(data, path)
  Zlib::GzipWriter.open(path) { |gz| gz.write(data.to_json) }
end
467
+
468
# Reads a gzip-compressed JSON file and returns the parsed object.
# FIX: the reader (and its underlying file handle) is now closed
# deterministically via the block form — the original leaked the handle
# until garbage collection.
# @param path [String] path to a gzipped JSON file
# @return [Object] the parsed JSON structure
def read_compressed_json(path)
  object = nil
  Zlib::GzipReader.open(path) do |gz|
    object = JSON.parse(gz.read)
  end
  return object
end
474
+
475
# Downloads a remote file over anonymous FTP, saving it locally as `name`.
# Uses the block form so the connection is closed even if the transfer fails.
# @param ftp_server [String] FTP host
# @param path [String] remote file path
# @param name [String] local destination filename
def download(ftp_server, path, name)
  Net::FTP.open(ftp_server) do |ftp|
    ftp.login
    ftp.getbinaryfile(path, name)
  end
end
@@ -0,0 +1,111 @@
1
# Parses tabular cohort files (one row per record/variant) into a Cohort
# object, translating term names, validating variants and collecting
# rejected items along the way.
class Cohort_Parser
  # Entry point: reads the file described by `options` and builds a Cohort.
  # @param options [Hash] expects :input_file, :header, :separator and the
  #   *_col keys consumed by get_fields2extract; :extracted_fields is set here.
  # @return [Array(Cohort, Array, Array)] cohort, unique rejected term names,
  #   ids of records rejected for having no valid terms
  def self.load(options)
    fields2extract = get_fields2extract(options)
    field_numbers = fields2extract.values
    records = read_records(options, fields2extract, field_numbers)
    options[:extracted_fields] = fields2extract.keys
    cohort, rejected_terms, rejected_recs = create_cohort(records, options)
    return cohort, rejected_terms, rejected_recs
  end

  # Reads the raw file into id => [record, ...]. With :header, column indexes
  # are resolved from the header row; otherwise they come in as integers.
  # Term column (record[0]) is split on options[:separator]; start/end
  # columns are coerced to Integer when configured.
  def self.read_records(options, fields2extract, field_numbers)
    records = {}
    count = 0
    File.open(options[:input_file]).each do |line|
      line.chomp!
      if options[:header] && count == 0
        line.gsub!(/#\s*/,'') # correct comment like headers
        field_names = line.split("\t")
        get_field_numbers2extract(field_names, fields2extract)
        field_numbers = fields2extract.values
      else
        fields = line.split("\t")
        record = field_numbers.map{|n| fields[n]}
        if fields2extract[:id_col].nil?
          id = "rec_#{count}" #generate ids
        else
          id = record.shift
        end
        if !record[0].nil?
          record[0] = record[0].split(options[:separator])
        else
          record[0] = []
        end
        record[2] = record[2].to_i if !options[:start_col].nil?
        record[3] = record[3].to_i if !options[:end_col].nil?
        query = records[id]
        if query.nil?
          records[id] = [record]
        else
          query << record
        end
      end
      count += 1
    end
    return records
  end

  # Maps configured column options to the columns to extract. Without a
  # header the values are numeric strings and are coerced to Integer here.
  def self.get_fields2extract(options)
    fields2extract = {}
    [:id_col, :ont_col, :chromosome_col, :start_col, :end_col, :sex_col].each do |field|
      col = options[field]
      if !col.nil?
        col = col.to_i if !options[:header]
        fields2extract[field] = col
      end
    end
    return fields2extract
  end

  # Resolves header column NAMES to their indexes, in place.
  def self.get_field_numbers2extract(field_names, fields2extract)
    fields2extract.each do |field, name|
      fields2extract[field] = field_names.index(name)
    end
  end

  # Builds the Cohort from parsed records: optionally translates term names
  # to codes, collects genomic variants and extra attributes (sex).
  def self.create_cohort(records, options)
    ont = Cohort.get_ontology(Cohort.act_ont)
    rejected_terms = []
    rejected_recs = []
    cohort = Cohort.new()
    records.each do |id, record|
      rec = record.first
      terms = rec.first
      if options[:names] # Translate hpo names 2 codes
        init_term_number = terms.length
        terms, rec_rejected_terms = ont.translate_names(terms)
        if !rec_rejected_terms.empty?
          STDERR.puts "WARNING: record #{id} has the unknown term NAMES '#{rec_rejected_terms.join(',')}'. Terms removed."
          rejected_terms.concat(rec_rejected_terms)
        end
        if terms.empty? && init_term_number > 0
          rejected_recs << id
          next
        end
      end
      if rec.length > 1 # there is genomic region attributes
        variants = record.map{|v| v[1..3] }
      else
        variants = [] # Not exists genomic region attributes so we create a empty array
      end
      other_attr = {}
      if options[:extracted_fields].include?(:sex_col) # Check for additional attributes. -1 is applied to ignore :id in extracted fields
        other_attr[:sex] = record.first[options[:extracted_fields].index(:sex_col) -1]
      end
      cohort.add_record([id, terms, check_variants(variants)], other_attr)
    end
    return cohort, rejected_terms.uniq, rejected_recs
  end

  # Drops variants whose chromosome is undefined ('-'), warning on STDERR.
  # @param vars [Array<Array>] [chr, start, stop] triplets
  # @return [Array<Array>] only the variants with a defined chromosome
  def self.check_variants(vars)
    checked_vars = []
    vars.each do |var| #[chr, start, stop]
      if var.first == '-' # the chr must be defined
        STDERR.puts "WARNING: variant #{var.join(',')} has been removed"
      else
        checked_vars << var
      end
    end
    # BUG FIX: previously returned the unfiltered `vars`, so invalid variants
    # were warned about but still kept.
    return checked_vars
  end
end
@@ -0,0 +1,39 @@
1
+ require 'genomic_features'
2
# Loads genomic reference annotation files into Genomic_Feature objects.
# Currently only GTF is supported.
class Reference_parser

  # Loads a reference file, auto-detecting the format from the file name
  # when not given explicitly.
  # @param file_path [String] annotation file path
  # @param file_format [String, nil] e.g. 'gtf'; derived from the file name when nil
  # @param feature_type [String, nil] keep only rows of this GTF feature (e.g. 'gene')
  # @return [Genomic_Feature]
  # @raise [ArgumentError] for unsupported formats (previously a nil region
  #   list was passed silently to Genomic_Feature)
  def self.load(file_path, file_format: nil, feature_type: nil)
    # BUG FIX: derive the format from the file NAME only. Splitting the whole
    # path meant a dot anywhere in a directory name broke detection.
    file_format = File.basename(file_path).split('.', 2).last if file_format.nil?
    if file_format == 'gtf'
      regions, all_attrs = parse_gtf(file_path, feature_type: feature_type)
    else
      raise ArgumentError, "Unsupported reference format: #{file_format}"
    end

    return Genomic_Feature.new(regions, annotations: all_attrs)
  end

  # Parses a GTF file (https://www.ensembl.org/info/website/upload/gff.html).
  # @param file_path [String] GTF path; '#' comment lines are skipped
  # @param feature_type [String, nil] when set, only matching feature rows are kept
  # @return [Array(Array, Hash)] [chr, start, stop, gene_id] rows (chr without
  #   the 'chr' prefix) and gene_id => attributes (incl. 'source'/'feature')
  def self.parse_gtf(file_path, feature_type: nil)
    features = []
    all_attrs = {}
    File.open(file_path).each do |line|
      next if /^#/ =~ line
      seqname, source, feature, start, stop, score, strand, frame, attribute = line.chomp.split("\t")
      if feature_type.nil? || feature_type == feature
        attrs = process_attrs(attribute, ';', ' ')
        attrs['source'] = source
        attrs['feature'] = feature
        id = attrs['gene_id']
        features << [seqname.gsub('chr',''), start.to_i, stop.to_i, id]
        all_attrs[id] = attrs
      end
    end
    return features, all_attrs
  end

  private
  # Splits a GTF attribute string ('key "value"; ...') into a Hash,
  # stripping the surrounding quotes from values.
  def self.process_attrs(attributes, tuple_sep, field_sep)
    return attributes.split(tuple_sep).map{|attr_pair|
      tuple = attr_pair.strip.split(field_sep, 2)
      tuple.last.gsub!('"','')
      tuple
    }.to_h
  end
end
data/lib/pets/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pets
2
- VERSION = "0.2.3"
2
+ VERSION = "0.2.5"
3
3
  end
data/lib/pets.rb CHANGED
@@ -1,4 +1,13 @@
1
1
  require "pets/version"
2
+ require "pets/constants"
3
+ require "pets/parsers/cohort_parser"
4
+ require "pets/parsers/reference_parser"
5
+ require "pets/coPatReporterMethods"
6
+ require "pets/generalMethods"
7
+ require "pets/io"
8
+ require "pets/phen2reg_methods"
9
+ require "pets/cohort"
10
+ require "pets/genomic_features"
2
11
 
3
12
  module Pets
4
13
  class Error < StandardError; end
data/pets.gemspec CHANGED
@@ -38,14 +38,18 @@ Gem::Specification.new do |spec|
38
38
 
39
39
  spec.add_development_dependency "bundler", "~> 2.0"
40
40
  spec.add_development_dependency "rake", "~> 13.0.3"
41
- spec.add_development_dependency "rspec", "~> 3.10.0"
41
+ spec.add_development_dependency "rspec", "~> 3.11.0"
42
42
  spec.add_dependency "statistics2"
43
43
  spec.add_dependency "terminal-table"
44
44
  spec.add_dependency "semtools", "~> 0.1.0"
45
+ spec.add_dependency "NetAnalyzer"
45
46
  spec.add_dependency "report_html"
46
47
  spec.add_dependency "numo-narray"
47
48
  spec.add_dependency "npy"
48
- spec.add_dependency "parallel", "~> 1.20.1"
49
-
49
+ spec.add_dependency "expcalc"
50
+ spec.add_dependency "bio-vcf"
51
+ spec.add_dependency "parallel", "~> 1.20.1"
52
+ spec.add_runtime_dependency 'net-ftp'
53
+ spec.add_runtime_dependency 'net-http'
50
54
  end
51
55