pets 0.2.3 → 0.2.5

Files changed (42)
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
data/lib/pets/io.rb ADDED
@@ -0,0 +1,481 @@
+ require 'csv'
+ require 'bio-vcf'
+
+ def load_hpo_ontology(hpo_file, excluded_hpo_file)
+   hpo = nil
+   if !hpo_file.include?('.json')
+     if !excluded_hpo_file.nil?
+       hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
+     else
+       hpo = Ontology.new(file: hpo_file, load_file: true)
+     end
+   else
+     hpo = Ontology.new
+     hpo.read(hpo_file)
+     if !excluded_hpo_file.nil?
+       hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
+       hpo.remove_removable()
+       hpo.build_index()
+     end
+   end
+   return hpo
+ end
+
+ def read_excluded_hpo_file(file)
+   excluded_hpo = []
+   File.open(file).each do |line|
+     excluded_hpo << line.chomp
+   end
+   return excluded_hpo
+ end
+
+ def write_hash(hash, file_path, header = [])
+   File.open(file_path, 'w') do |handler|
+     handler.puts header.join("\t") if !header.empty?
+     hash.each do |key, array|
+       handler.puts "#{key}\t#{array.join("\t")}"
+     end
+   end
+ end
+
+ def write_array(array, file_path)
+   File.open(file_path, 'w') do |handler|
+     array.each do |record|
+       if record.class == String
+         line = record
+       else
+         line = record.join("\t")
+       end
+       handler.puts line
+     end
+   end
+ end
+
+ def write_matrix_for_R(matrix, x_names, y_names, file)
+   File.open(file, 'w') do |f|
+     f.puts x_names.join("\t")
+     matrix.each_with_index do |row, i|
+       f.puts [y_names[i]].concat(row).join("\t")
+     end
+   end
+ end
+
+ def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
+   File.open(cluster_ic_data_file, 'w') do |f|
+     f.puts %w[cluster_id ic Plen].join("\t")
+     all_ics.each_with_index do |cluster_ics, i|
+       break if i == limit
+       cluster_length = cluster_ics.length
+       cluster_ics.each_with_index do |clust_ic, j|
+         f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
+       end
+     end
+   end
+ end
+
+ def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
+   File.open(cluster_chromosome_data_file, 'w') do |f|
+     f.puts %w[cluster_id chr count].join("\t")
+     index = 0
+     last_id = cluster_data.first.first unless cluster_data.empty?
+     cluster_data.each do |cluster_id, patient_number, chr, count|
+       index += 1 if cluster_id != last_id
+       break if index == limit
+       f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
+       last_id = cluster_id
+     end
+   end
+ end
+
+ def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
+   File.open(coverage_to_plot_file, 'w') do |f|
+     coverage_to_plot.each do |chr, position, freq|
+       f.puts "#{chr}\t#{position}\t#{freq}"
+     end
+   end
+ end
+
+ def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
+   CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
+     suggested_childs.each do |pat_id, suggestions|
+       warning = nil
+       warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
+       csv << ["PATIENT #{pat_id}", "#{warning}"]
+       csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
+       suggestions.each do |parent, childs|
+         parent_code, parent_name = parent
+         if childs.empty?
+           csv << ["#{parent_name} (#{parent_code})", '-']
+         else
+           parent_writed = false
+           childs.each do |child_code, child_name|
+             if !parent_writed
+               parent_field = "#{parent_name} (#{parent_code})"
+               parent_writed = true
+             else
+               parent_field = ""
+             end
+             csv << [parent_field, "#{child_name} (#{child_code})"]
+           end
+         end
+       end
+       csv << ["", ""]
+     end
+   end
+ end
+
+ def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
+   File.open(filename, 'w') do |f|
+     f.puts "#{x_axis_name}\t#{y_axis_name}"
+     x_axis_value.each_with_index do |value,i|
+       y_value = y_axis_value[i]
+       raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
+       f.puts [value, y_value].join("\t")
+     end
+   end
+ end
+
+ def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
+   File.open(similarity_matrix_file, 'w') do |f|
+     similarity_matrix.each do |row|
+       f.puts row.join("\t")
+     end
+   end
+ end
+
+ def write_profile_pairs(similarity_pairs, filename)
+   File.open(filename, 'w') do |f|
+     similarity_pairs.each do |pairsA, pairsB_and_values|
+       pairsB_and_values.each do |pairsB, values|
+         f.puts "#{pairsA}\t#{pairsB}\t#{values}"
+       end
+     end
+   end
+ end
+
+ def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
+   File.open(output_file, 'w') do |f|
+     f.puts "#{'PatientsNumber'}\t#{'HPOAverage'}"
+     average_hp_per_pat_distribution.each do |patient_num, ave|
+       f.puts "#{patient_num}\t#{ave}"
+     end
+   end
+ end
+
+ def parse_clusters_file(clusters_file, patient_data)
+   clusters_info = {}
+   clusters_table = []
+   File.open(clusters_file).each do |line|
+     line.chomp!
+     patientID, clusterID = line.split("\t")
+     patientHPOProfile = patient_data.get_profile(patientID)
+     query = clusters_info[clusterID]
+     if query.nil?
+       clusters_info[clusterID] = {patientID => patientHPOProfile}
+     else
+       query[patientID] = patientHPOProfile
+     end
+   end
+   clusters_info.each do |clusterID, patients_info|
+     patients_per_cluster = patients_info.keys.length
+     clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
+   end
+   return clusters_table, clusters_info
+ end
+
+ def load_profiles(file_path, hpo)
+   profiles = {}
+   #count = 0
+   File.open(file_path).each do |line|
+     id, profile = line.chomp.split("\t")
+     hpos = profile.split(',').map{|a| a.to_sym}
+     hpos, rejected_hpos = hpo.check_ids(hpos)
+     if !hpos.empty?
+       hpos = hpo.clean_profile(hpos)
+       profiles[id] = hpos if !hpos.empty?
+     end
+   end
+   return profiles
+ end
+
+ def load_variants(variant_folder)
+   variants = {}
+   Dir.glob(File.join(variant_folder, '*.{tab,vcf,vcf.gz}')).each do |path|
+     profile_id, ext = File.basename(path).split(".", 2)
+     if ext == 'tab' || ext == 'txt'
+       vars = load_tabular_vars(path)
+     elsif ext == 'vcf' || ext == 'vcf.gz'
+       vars = load_vcf(path, ext)
+     end
+     variants[profile_id] = Genomic_Feature.new(vars)
+   end
+   return variants
+ end
+
+ def load_tabular_vars(path)
+   vars = []
+   File.open(path).each do |line|
+     fields = line.chomp.split("\t")
+     chr = fields[0].gsub('chr','')
+     start = fields[1].to_i
+     vars << [chr, start, start]
+   end
+   return vars
+ end
+
+ def load_vcf(path, ext) # Some compressed files are fragmented internally. If so, VCFfile only reads first fragment
+   vars = [] # Use zcat original.vcf.gz | gzip > new.vcf.gz to obtain a contigous file
+   vcf = BioVcf::VCFfile.new(file: path, is_gz: ext == 'vcf.gz' ? true : false )
+   vcf.each do |var|
+     vars << [var.chrom.gsub('chr',''), var.pos, var.pos]
+   end
+   puts vars.length
+   return vars
+ end
+
+ def load_evidences(evidences_path, hpo)
+   genomic_coordinates = {}
+   coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
+   coord_files.each do |cd_f|
+     entity = File.basename(cd_f, '.coords')
+     coordinates = load_coordinates(cd_f)
+     genomic_coordinates[entity] = coordinates
+   end
+   evidences = {}
+   evidence_files = Dir.glob(File.join(evidences_path, '*_HP.txt'))
+   evidence_files.each do |e_f|
+     pair = File.basename(e_f, '.txt')
+     profiles, id2label = load_evidence_profiles(e_f, hpo)
+     evidences[pair] = {prof: profiles, id2lab: id2label}
+   end
+   return evidences, genomic_coordinates
+ end
+
+ def load_coordinates(file_path)
+   coordinates = {}
+   header = true
+   File.open(file_path).each do |line|
+     fields = line.chomp.split("\t")
+     if header
+       header = false
+     else
+       entity, chr, strand, start, stop = fields
+       if chr == 'NA'
+         STDERR.puts "Warning: Record #{fields.inspect} is undefined"
+         next
+       end
+       coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
+     end
+   end
+   return coordinates
+ end
+
+ def load_evidence_profiles(file_path, hpo)
+   profiles = {}
+   id2label = {}
+   #count = 0
+   File.open(file_path).each do |line|
+     id, label, profile = line.chomp.split("\t")
+     hpos = profile.split(',').map{|a| a.to_sym}
+     hpos, rejected_hpos = hpo.check_ids(hpos)
+     if !hpos.empty?
+       hpos = hpo.clean_profile(hpos)
+       profiles[id] = hpos if !hpos.empty?
+       id2label[id] = label
+     end
+   end
+   return profiles, id2label
+ end
+
+ #Common methods for predictors
+ #Training file example = 9  131371492  131375954  HP:0010974  2.41161970596  9.3.A.5
+ #1. Indexing by chr (region)
+ def coor_overlap?(ref_start, ref_stop, start, stop)
+   overlap = false
+   if (stop > ref_start && stop <= ref_stop) ||
+      (start >= ref_start && start < ref_stop) ||
+      (start <= ref_start && stop >= ref_stop) ||
+      (start > ref_start && stop < ref_stop)
+     overlap = true
+   end
+   return overlap
+ end
+
+ def load_training_file4regions(training_file)
+   training_set = {}
+   posInfo = loadFile(training_file)
+   posInfo.each do |info|
+     chr = info.shift
+     query = training_set[chr]
+     if query.nil?
+       training_set[chr] = [info]
+     else
+       query << info
+     end
+   end
+   return training_set
+ end
+
+ #2. Indexing by hpo (code)
+ #prepare training file for analysis using phenotype2region prediction
+ def load_training_file4HPO(training_file, thresold=0)
+   training_set = {}
+   information = loadFile(training_file, thresold)
+   information.each do |info|
+     hpoCode = info.delete_at(4)
+     query = training_set[hpoCode]
+     if query.nil?
+       training_set[hpoCode] = [info]
+     else
+       query << info
+     end
+   end
+   # STDERR.puts training_set.keys.inspect
+   return training_set
+ end
+
+ #3. Load training info file:
+ #Chr;Start;Stop;HPO;Association;node
+ def loadFile(file, thresold=0)
+   information = []
+   File.open(file).each do |line|
+     line.chomp!
+     allInfo = line.split("\t")
+     associationValue = allInfo[4].to_f
+     if associationValue >= thresold
+       chr = allInfo[0]
+       startPos = allInfo[1].to_i
+       stopPos = allInfo[2].to_i
+       hpoCode = allInfo[3]
+       nodeID = allInfo[5]
+       information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
+     end
+   end
+   return information
+ end
+
+ def load_hpo_ci_values(information_coefficient_file)
+   hpos_ci_values = {}
+   File.open(information_coefficient_file).each do |line|
+     line.chomp!
+     hpo_code, ci = line.split("\t")
+     hpos_ci_values[hpo_code.to_sym] = ci.to_f
+   end
+   return hpos_ci_values
+ end
+
+ def load_clustered_patients(file)
+   clusters = {}
+   File.open(file).each do |line|
+     line.chomp!
+     pat_id, cluster_id = line.split("\t")
+     query = clusters[cluster_id]
+     if query.nil?
+       clusters[cluster_id] = [pat_id]
+     else
+       query << pat_id
+     end
+   end
+   return clusters
+ end
+
+ def load_gene_data(gene_data_path)
+   gene_list = {} #geneID => attr
+   gene_location = {} # chr => gene
+   infile = open(gene_data_path)
+   gz = Zlib::GzipReader.new(infile)
+   current_chr = nil
+   genes = []
+   gz.each_line do |line|
+     line.chomp!
+     next if line =~ /^#/
+     fields = line.split("\t")
+     if fields[8].include?('genome=chromosome')
+       chr = fields[8].split(';')[1].split('=').last
+       gene_location[current_chr] = genes
+       genes = []
+       current_chr = chr
+     elsif fields[2] == 'gene'
+       attributes = {}
+       fields[8].split(';').each do |pair|
+         key, value = pair.split('=')
+         attributes[key] = value
+       end
+       geneName = nil
+       geneName = attributes['gene'] if !attributes['gene'].nil?
+       geneSyns = []
+       geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
+       description = attributes['description']
+       description = URI.unescape(description) if !description.nil?
+       attributes['Dbxref'] =~ /GeneID:(\d+)/
+       gene_list[$1] = [geneName, geneSyns, description]
+       genes << [$1, fields[3].to_i, fields[4].to_i]
+     end
+   end
+   gene_location[current_chr] = genes
+   return gene_list, gene_location
+ end
+
+ def parse_kegg_data(query_genes)
+   kegg_data = {} #gene => attb
+   while !query_genes.empty?
+     gene_set = query_genes.shift(10)
+     url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
+     uri = URI(url)
+     response = Net::HTTP.get(uri)
+     geneID = nil
+     gene_names = []
+     definition = nil
+     pathways = []
+     parsing_pathway_field = false
+     response.squeeze(' ').each_line do |line|
+       line.chomp!
+       if line =~ /^ENTRY/
+         geneID = line.split(' ')[1]
+       elsif line =~ /^NAME/
+         gene_names = line.split(' ', 2).last.split(', ')
+       elsif line =~ /^DEFINITION/
+         definition = line.split(' ', 2)[1]
+       elsif line =~ /^PATHWAY/
+         pathways << line.split(' ', 3)[1..2]
+         parsing_pathway_field = true
+       elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
+         parsing_pathway_field = false
+       elsif parsing_pathway_field
+         pathways << line.strip.split(' ', 2)
+       elsif line == '///'
+         parsing_pathway_field = false
+         kegg_data[geneID] = [gene_names, definition, pathways]
+         pathways = []
+         gene_names = []
+       end
+     end
+   end
+   return kegg_data
+ end
+
+ def write_compressed_plain_file(data, path)
+   File.open(path, 'w') do |f|
+     gz = Zlib::GzipWriter.new(f)
+     gz.write data.to_json
+     gz.close
+   end
+ end
+
+ def read_compressed_json(path)
+   infile = open(path)
+   gz = Zlib::GzipReader.new(infile)
+   object = JSON.parse(gz.read)
+   return object
+ end
+
+ def download(ftp_server, path, name)
+   ftp = Net::FTP.new()
+   ftp.connect(ftp_server)
+   ftp.login
+   ftp.getbinaryfile(path, name)
+   ftp.close
+ end
data/lib/pets/parsers/cohort_parser.rb ADDED
@@ -0,0 +1,111 @@
+ class Cohort_Parser
+   def self.load(options)
+     fields2extract = get_fields2extract(options)
+     field_numbers = fields2extract.values
+     records = read_records(options, fields2extract, field_numbers)
+     options[:extracted_fields] = fields2extract.keys
+     cohort, rejected_terms, rejected_recs = create_cohort(records, options)
+     return cohort, rejected_terms, rejected_recs
+   end
+
+   def self.read_records(options, fields2extract, field_numbers)
+     records = {}
+     count = 0
+     File.open(options[:input_file]).each do |line|
+       line.chomp!
+       if options[:header] && count == 0
+         line.gsub!(/#\s*/,'') # correct comment like headers
+         field_names = line.split("\t")
+         get_field_numbers2extract(field_names, fields2extract)
+         field_numbers = fields2extract.values
+       else
+         fields = line.split("\t")
+         record = field_numbers.map{|n| fields[n]}
+         if fields2extract[:id_col].nil?
+           id = "rec_#{count}" #generate ids
+         else
+           id = record.shift
+         end
+         if !record[0].nil?
+           record[0] = record[0].split(options[:separator])
+         else
+           record[0] = []
+         end
+         record[2] = record[2].to_i if !options[:start_col].nil?
+         record[3] = record[3].to_i if !options[:end_col].nil?
+         query = records[id]
+         if query.nil?
+           records[id] = [record]
+         else
+           query << record
+         end
+       end
+       count += 1
+     end
+     return records
+   end
+
+   def self.get_fields2extract(options)
+     fields2extract = {}
+     [:id_col, :ont_col, :chromosome_col, :start_col, :end_col, :sex_col].each do |field|
+       col = options[field]
+       if !col.nil?
+         col = col.to_i if !options[:header]
+         fields2extract[field] = col
+       end
+     end
+     return fields2extract
+   end
+
+   def self.get_field_numbers2extract(field_names, fields2extract)
+     fields2extract.each do |field, name|
+       fields2extract[field] = field_names.index(name)
+     end
+   end
+
+   def self.create_cohort(records, options)
+     ont = Cohort.get_ontology(Cohort.act_ont)
+     rejected_terms = []
+     rejected_recs = []
+     cohort = Cohort.new()
+     records.each do |id, record|
+       rec = record.first
+       terms = rec.first
+       if options[:names] # Translate hpo names 2 codes
+         init_term_number = terms.length
+         terms, rec_rejected_terms = ont.translate_names(terms)
+         if !rec_rejected_terms.empty?
+           STDERR.puts "WARNING: record #{id} has the unknown term NAMES '#{rec_rejected_terms.join(',')}'. Terms removed."
+           rejected_terms.concat(rec_rejected_terms)
+         end
+         if terms.empty? && init_term_number > 0
+           rejected_recs << id
+           next
+         end
+       end
+       if rec.length > 1 # there is genomic region attributes
+         variants = record.map{|v| v[1..3] }
+       else
+         variants = [] # Not exists genomic region attributes so we create a empty array
+       end
+       other_attr = {}
+       if options[:extracted_fields].include?(:sex_col) # Check for additional attributes. -1 is applied to ignore :id in extracted fields
+         other_attr[:sex] = record.first[options[:extracted_fields].index(:sex_col) -1]
+       end
+       cohort.add_record([id, terms, check_variants(variants)], other_attr)
+     end
+     return cohort, rejected_terms.uniq, rejected_recs
+   end
+
+   def self.check_variants(vars)
+     checked_vars = []
+     vars.each do |var| #[chr, start, stop]
+       if var.first == '-' # the chr must be defined
+         STDERR.puts "WARNING: variant #{var.join(',')} has been removed"
+       else
+         checked_vars << var
+       end
+     end
+     return vars
+   end
+ end
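A hypothetical call to the parser, with the option keys inferred from read_records and get_fields2extract above (column names are used when :header is true; otherwise 0-based column indices are expected). Note that create_cohort expects an ontology to have been registered on Cohort beforehand (see data/lib/pets/cohort.rb, not shown here):

    options = {
      input_file: 'cohort.tsv',   # hypothetical input, one row per record
      header: true,
      id_col: 'patient_id',
      ont_col: 'hpo_terms',
      chromosome_col: 'chr',
      start_col: 'start',
      end_col: 'stop',
      separator: ',',             # delimiter between terms inside ont_col
      names: false                # true if ont_col carries term names instead of codes
    }
    cohort, rejected_terms, rejected_recs = Cohort_Parser.load(options)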
data/lib/pets/parsers/reference_parser.rb ADDED
@@ -0,0 +1,39 @@
+ require 'genomic_features'
+ class Reference_parser
+
+   def self.load(file_path, file_format: nil, feature_type: nil)
+     file_format = file_path.split('.', 2).last if file_format.nil?
+     if file_format == 'gtf'
+       regions, all_attrs = parse_gtf(file_path, feature_type: feature_type)
+     end
+
+     return Genomic_Feature.new(regions, annotations: all_attrs)
+   end
+
+   def self.parse_gtf(file_path, feature_type: nil) # https://www.ensembl.org/info/website/upload/gff.html
+     features = []
+     all_attrs = {}
+     File.open(file_path).each do |line|
+       next if /^#/ =~ line
+       seqname, source, feature, start, stop, score, strand, frame, attribute = line.chomp.split("\t")
+       if feature_type.nil? || feature_type == feature
+         attrs = process_attrs(attribute, ';', ' ')
+         attrs['source'] = source
+         attrs['feature'] = feature
+         id = attrs['gene_id']
+         features << [seqname.gsub('chr',''), start.to_i, stop.to_i, id]
+         all_attrs[id] = attrs
+       end
+     end
+     return features, all_attrs
+   end
+
+   private
+   def self.process_attrs(attributes, tuple_sep, field_sep)
+     return attributes.split(tuple_sep).map{|attr_pair|
+       tuple = attr_pair.strip.split(field_sep, 2)
+       tuple.last.gsub!('"','')
+       tuple
+     }.to_h
+   end
+ end
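A short usage sketch, assuming an Ensembl-style GTF on disk (the file name is illustrative); regions come back wrapped in a Genomic_Feature from data/lib/pets/genomic_features.rb:

    # Load only 'gene' features; the parsed attributes (gene_id, source,
    # feature, ...) are kept per gene_id as annotations on the result.
    genes = Reference_parser.load('Homo_sapiens.GRCh37.gtf', feature_type: 'gene')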
data/lib/pets/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Pets
-   VERSION = "0.2.3"
+   VERSION = "0.2.5"
  end
data/lib/pets.rb CHANGED
@@ -1,4 +1,13 @@
  require "pets/version"
+ require "pets/constants"
+ require "pets/parsers/cohort_parser"
+ require "pets/parsers/reference_parser"
+ require "pets/coPatReporterMethods"
+ require "pets/generalMethods"
+ require "pets/io"
+ require "pets/phen2reg_methods"
+ require "pets/cohort"
+ require "pets/genomic_features"

  module Pets
    class Error < StandardError; end
data/pets.gemspec CHANGED
@@ -38,14 +38,18 @@ Gem::Specification.new do |spec|

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 13.0.3"
- spec.add_development_dependency "rspec", "~> 3.10.0"
+ spec.add_development_dependency "rspec", "~> 3.11.0"
  spec.add_dependency "statistics2"
  spec.add_dependency "terminal-table"
  spec.add_dependency "semtools", "~> 0.1.0"
+ spec.add_dependency "NetAnalyzer"
  spec.add_dependency "report_html"
  spec.add_dependency "numo-narray"
  spec.add_dependency "npy"
- spec.add_dependency "parallel", "~> 1.20.1"
-
+ spec.add_dependency "expcalc"
+ spec.add_dependency "bio-vcf"
+ spec.add_dependency "parallel", "~> 1.20.1"
+ spec.add_runtime_dependency 'net-ftp'
+ spec.add_runtime_dependency 'net-http'
  end