pets 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +68 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +102 -150
- data/bin/get_gen_features.rb +146 -0
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +8 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +86 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +16 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +172 -424
- data/lib/pets/cohort.rb +309 -0
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +8 -0
- data/lib/pets/generalMethods.rb +29 -319
- data/lib/pets/genomic_features.rb +240 -0
- data/lib/pets/io.rb +481 -0
- data/lib/pets/parsers/cohort_parser.rb +111 -0
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +9 -0
- data/pets.gemspec +7 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/cohort_report.erb +5 -7
- data/templates/evidence_profile.erb +20 -4
- data/templates/patient_report.erb +1 -1
- metadata +96 -5
data/lib/pets/io.rb
ADDED
@@ -0,0 +1,481 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'bio-vcf'
|
3
|
+
|
4
|
+
# Build an HPO Ontology object from either a raw ontology file or a
# preprocessed JSON dump, optionally pruning the terms listed in
# excluded_hpo_file. Returns the ready-to-use Ontology instance.
def load_hpo_ontology(hpo_file, excluded_hpo_file)
  if hpo_file.include?('.json')
    # Preprocessed ontology dump: load it, then prune excluded terms if given.
    hpo = Ontology.new
    hpo.read(hpo_file)
    unless excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  else
    # Raw ontology file: exclusions can be handed to the constructor directly.
    hpo = if excluded_hpo_file.nil?
      Ontology.new(file: hpo_file, load_file: true)
    else
      Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    end
  end
  hpo
end

# Read a plain-text file with one HPO term per line.
# Returns the terms as an array of chomped strings.
def read_excluded_hpo_file(file)
  excluded = []
  File.open(file).each { |line| excluded << line.chomp }
  excluded
end
|
31
|
+
|
32
|
+
# Dump a hash of key => array rows as a TSV file. An optional header row
# is written first when provided.
def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |out|
    out.puts header.join("\t") unless header.empty?
    hash.each { |key, values| out.puts "#{key}\t#{values.join("\t")}" }
  end
end

# Dump an array as a text file, one record per line. String records are
# written verbatim; any other record is serialized as tab-separated fields.
def write_array(array, file_path)
  File.open(file_path, 'w') do |out|
    array.each do |record|
      out.puts(record.class == String ? record : record.join("\t"))
    end
  end
end

# Write a numeric matrix as a TSV table consumable from R: the first line
# holds the column (x) names, each following line a row name followed by
# the row values.
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |out|
    out.puts x_names.join("\t")
    matrix.each_with_index do |row, row_idx|
      out.puts [y_names[row_idx]].concat(row).join("\t")
    end
  end
end
|
62
|
+
|
63
|
+
# Write per-cluster IC values as a long-format TSV (cluster_id, ic, Plen).
# Only the first `limit` clusters are exported; each cluster id is encoded
# as "<cluster size>_<cluster index>".
def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |out|
    out.puts %w[cluster_id ic Plen].join("\t")
    all_ics.each_with_index do |cluster_ics, clust_idx|
      break if clust_idx == limit
      size = cluster_ics.length
      cluster_ics.each_with_index do |ic_value, member_idx|
        out.puts "#{size}_#{clust_idx}\t#{ic_value}\t#{profile_lengths[clust_idx][member_idx]}"
      end
    end
  end
end

# Write chromosome hit counts per cluster as a TSV (cluster_id, chr, count).
# Consecutive input rows sharing a cluster id belong to the same cluster; a
# new id advances the cluster index, and output stops after `limit` clusters.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |out|
    out.puts %w[cluster_id chr count].join("\t")
    cluster_index = 0
    previous_id = cluster_data.first.first unless cluster_data.empty?
    cluster_data.each do |cluster_id, patient_number, chr, count|
      cluster_index += 1 unless cluster_id == previous_id
      break if cluster_index == limit
      out.puts ["#{patient_number}_#{cluster_index}", chr, count].join("\t")
      previous_id = cluster_id
    end
  end
end

# Write genome coverage points as a headerless TSV (chr, position, frequency).
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |out|
    coverage_to_plot.each do |chr, position, freq|
      out.puts "#{chr}\t#{position}\t#{freq}"
    end
  end
end
|
97
|
+
|
98
|
+
|
99
|
+
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
|
100
|
+
CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
|
101
|
+
suggested_childs.each do |pat_id, suggestions|
|
102
|
+
warning = nil
|
103
|
+
warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
|
104
|
+
csv << ["PATIENT #{pat_id}", "#{warning}"]
|
105
|
+
csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
|
106
|
+
suggestions.each do |parent, childs|
|
107
|
+
parent_code, parent_name = parent
|
108
|
+
if childs.empty?
|
109
|
+
csv << ["#{parent_name} (#{parent_code})", '-']
|
110
|
+
else
|
111
|
+
parent_writed = false
|
112
|
+
childs.each do |child_code, child_name|
|
113
|
+
if !parent_writed
|
114
|
+
parent_field = "#{parent_name} (#{parent_code})"
|
115
|
+
parent_writed = true
|
116
|
+
else
|
117
|
+
parent_field = ""
|
118
|
+
end
|
119
|
+
csv << [parent_field, "#{child_name} (#{child_code})"]
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
csv << ["", ""]
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
# Write two paired value arrays as a two-column TSV for scatter plotting.
# Raises when y_axis_value has no value for some position of x_axis_value.
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |out|
    out.puts "#{x_axis_name}\t#{y_axis_name}"
    x_axis_value.each_with_index do |x_val, idx|
      y_val = y_axis_value[idx]
      raise("The #{idx} position is not presented in y_axis_value") if y_val.nil?
      out.puts [x_val, y_val].join("\t")
    end
  end
end

# Write a pairwise similarity matrix as a headerless TSV, one row per line.
def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
  File.open(similarity_matrix_file, 'w') do |out|
    similarity_matrix.each { |row| out.puts row.join("\t") }
  end
end

# Flatten a nested {a => {b => value}} similarity structure into a
# three-column TSV (a, b, value).
def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |out|
    similarity_pairs.each do |pair_a, partners|
      partners.each { |pair_b, value| out.puts "#{pair_a}\t#{pair_b}\t#{value}" }
    end
  end
end

# Write the distribution of average HPO terms per patient as a TSV with a
# PatientsNumber/HPOAverage header.
def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |out|
    out.puts "PatientsNumber\tHPOAverage"
    average_hp_per_pat_distribution.each do |patient_num, avg|
      out.puts "#{patient_num}\t#{avg}"
    end
  end
end
|
166
|
+
|
167
|
+
# Parse a patient->cluster assignment file (TSV: patientID, clusterID),
# resolving each patient's HPO profile through patient_data.get_profile.
# Returns [clusters_table, clusters_info] where clusters_info maps
# clusterID => {patientID => profile} and each clusters_table row is
# [clusterID, patient_count, patient_ids, profiles].
def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  File.open(clusters_file).each do |line|
    patient_id, cluster_id = line.chomp.split("\t")
    profile = patient_data.get_profile(patient_id)
    (clusters_info[cluster_id] ||= {})[patient_id] = profile
  end
  clusters_table = clusters_info.map do |cluster_id, patients_info|
    [cluster_id, patients_info.keys.length, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end

# Load HPO profiles from a TSV file (id, comma-separated HPO codes).
# Codes are validated and cleaned against the given ontology; profiles left
# empty after cleaning are dropped.
def load_profiles(file_path, hpo)
  profiles = {}
  File.open(file_path).each do |line|
    id, profile = line.chomp.split("\t")
    codes = profile.split(',').map(&:to_sym)
    codes, _rejected = hpo.check_ids(codes)
    next if codes.empty?
    codes = hpo.clean_profile(codes)
    profiles[id] = codes unless codes.empty?
  end
  profiles
end
|
202
|
+
|
203
|
+
# Load genomic variants for every profile stored in variant_folder.
# Accepted files: *.tab / *.txt (tabular: chr, position) and *.vcf / *.vcf.gz.
# The file basename up to the first dot is used as the profile id.
# Returns profile_id => Genomic_Feature.
def load_variants(variant_folder)
  variants = {}
  # FIX: the glob previously omitted '*.txt' although the tabular branch below
  # explicitly accepted the 'txt' extension, so .txt files were never loaded.
  Dir.glob(File.join(variant_folder, '*.{tab,txt,vcf,vcf.gz}')).each do |path|
    profile_id, ext = File.basename(path).split(".", 2)
    if ext == 'tab' || ext == 'txt'
      vars = load_tabular_vars(path)
    elsif ext == 'vcf' || ext == 'vcf.gz'
      vars = load_vcf(path, ext)
    else
      # FIX: previously an unexpected extension (e.g. 'sample.v1.vcf.gz' splits
      # to ext 'v1.vcf.gz') left vars == nil and failed obscurely inside
      # Genomic_Feature; skip such files with a warning instead.
      STDERR.puts "Warning: variant file #{path} has an unsupported extension, skipped"
      next
    end
    variants[profile_id] = Genomic_Feature.new(vars)
  end
  return variants
end

# Parse a tabular variant file: first column chromosome (with or without a
# 'chr' prefix), second column 1-based position. Each variant becomes a
# single-base region [chr, pos, pos].
def load_tabular_vars(path)
  vars = []
  File.open(path).each do |line|
    fields = line.chomp.split("\t")
    chr = fields[0].gsub('chr', '')
    start = fields[1].to_i
    vars << [chr, start, start]
  end
  return vars
end

# Parse a VCF (optionally gzipped) into single-base regions [chr, pos, pos].
# Some compressed files are fragmented internally; BioVcf then only reads the
# first fragment. Use `zcat original.vcf.gz | gzip > new.vcf.gz` to obtain a
# contiguous file.
def load_vcf(path, ext)
  vars = []
  vcf = BioVcf::VCFfile.new(file: path, is_gz: ext == 'vcf.gz')
  vcf.each do |var|
    vars << [var.chrom.gsub('chr', ''), var.pos, var.pos]
  end
  # FIX: removed stray debug output (`puts vars.length`) that polluted stdout.
  return vars
end
|
237
|
+
|
238
|
+
# Scan evidences_path for entity coordinate files ('*.coords') and evidence
# profile files ('*_HP.txt').
# Returns [evidences, genomic_coordinates]:
#   evidences: pair_name => {prof: profiles, id2lab: id => label}
#   genomic_coordinates: entity_name => {id => [chr, start, stop, strand]}
def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  Dir.glob(File.join(evidences_path, '*.coords')).each do |coord_file|
    entity = File.basename(coord_file, '.coords')
    genomic_coordinates[entity] = load_coordinates(coord_file)
  end
  evidences = {}
  Dir.glob(File.join(evidences_path, '*_HP.txt')).each do |evidence_file|
    pair = File.basename(evidence_file, '.txt')
    profiles, id2label = load_evidence_profiles(evidence_file, hpo)
    evidences[pair] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end

# Load a coordinates TSV (entity, chr, strand, start, stop), skipping the
# first (header) line. Records with chr == 'NA' are reported to STDERR and
# dropped. Returns entity => [chr, start, stop, strand].
def load_coordinates(file_path)
  coordinates = {}
  header = true
  File.open(file_path).each do |line|
    fields = line.chomp.split("\t")
    if header
      header = false
      next
    end
    entity, chr, strand, start, stop = fields
    if chr == 'NA'
      STDERR.puts "Warning: Record #{fields.inspect} is undefined"
      next
    end
    coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
  end
  coordinates
end

# Load evidence profiles from a TSV (id, label, comma-separated HPO codes).
# Codes are validated and cleaned against the ontology; a record keeps its
# label whenever its validated code list was non-empty before cleaning.
def load_evidence_profiles(file_path, hpo)
  profiles = {}
  id2label = {}
  File.open(file_path).each do |line|
    id, label, profile = line.chomp.split("\t")
    codes = profile.split(',').map(&:to_sym)
    codes, _rejected = hpo.check_ids(codes)
    next if codes.empty?
    codes = hpo.clean_profile(codes)
    profiles[id] = codes unless codes.empty?
    id2label[id] = label
  end
  return profiles, id2label
end
|
291
|
+
|
292
|
+
#Common methods for predictors
|
293
|
+
#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
|
294
|
+
#1. Indexing by chr (region)
|
295
|
+
# Return true when the interval [start, stop] overlaps the reference
# interval [ref_start, ref_stop]. Boundary rules match the original:
# touching only at ref_start (stop == ref_start) or only at ref_stop
# (start == ref_stop) does NOT count as overlap.
# Assumes well-formed intervals (start <= stop) — TODO confirm with callers.
def coor_overlap?(ref_start, ref_stop, start, stop)
  # FIX: dropped the redundant fourth clause (start > ref_start && stop < ref_stop);
  # for well-formed intervals it is fully implied by the first clause, so it
  # was an unreachable branch. Behavior is unchanged.
  return (stop > ref_start && stop <= ref_stop) ||
         (start >= ref_start && start < ref_stop) ||
         (start <= ref_start && stop >= ref_stop)
end
|
305
|
+
|
306
|
+
# Common methods for predictors.
# Training file example: 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5

# 1. Index training records by chromosome (region).
# Returns chr => [[start, stop, nodeID, hpoCode, associationValue], ...].
def load_training_file4regions(training_file)
  training_set = {}
  loadFile(training_file).each do |info|
    chr = info.shift
    (training_set[chr] ||= []) << info
  end
  training_set
end

# 2. Index training records by HPO code, for phenotype2region prediction.
# Records with association value below `thresold` are discarded.
# Returns hpoCode => [[chr, start, stop, nodeID, associationValue], ...].
def load_training_file4HPO(training_file, thresold = 0)
  training_set = {}
  loadFile(training_file, thresold).each do |info|
    hpo_code = info.delete_at(4)
    (training_set[hpo_code] ||= []) << info
  end
  # STDERR.puts training_set.keys.inspect
  training_set
end

# 3. Load a training info file (TSV: Chr, Start, Stop, HPO, Association, node),
# keeping only records whose association value reaches `thresold`.
# Returns rows as [chr, start, stop, nodeID, hpoCode, associationValue].
def loadFile(file, thresold = 0)
  information = []
  File.open(file).each do |line|
    fields = line.chomp.split("\t")
    association_value = fields[4].to_f
    next if association_value < thresold
    information << [fields[0], fields[1].to_i, fields[2].to_i, fields[5], fields[3], association_value]
  end
  information
end
|
359
|
+
|
360
|
+
# Load an information-coefficient table (TSV: HPO code, CI value).
# Returns hpo_code_symbol => Float CI value.
def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.open(information_coefficient_file).each do |line|
    hpo_code, ci = line.chomp.split("\t")
    hpos_ci_values[hpo_code.to_sym] = ci.to_f
  end
  hpos_ci_values
end

# Load patient->cluster assignments (TSV: patient id, cluster id).
# Returns cluster_id => [patient ids].
def load_clustered_patients(file)
  clusters = {}
  File.open(file).each do |line|
    pat_id, cluster_id = line.chomp.split("\t")
    (clusters[cluster_id] ||= []) << pat_id
  end
  clusters
end
|
384
|
+
|
385
|
+
# Parse a gzipped gene annotation file (GFF-style, tab separated, attribute
# column 9). Region records carrying 'genome=chromosome' switch the current
# chromosome; 'gene' records are collected under it.
# Returns [gene_list, gene_location]:
#   gene_list: GeneID => [gene name, synonyms, description]
#   gene_location: chromosome => [[GeneID, start, stop], ...]
# NOTE(review): relies on URI.unescape, which was removed in Ruby 3.0 —
# confirm the supported Ruby version or migrate before upgrading.
def load_gene_data(gene_data_path)
  gene_list = {} # geneID => attributes
  gene_location = {} # chromosome => genes
  gz = Zlib::GzipReader.new(open(gene_data_path))
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
    next if line =~ /^#/
    fields = line.split("\t")
    if fields[8].include?('genome=chromosome')
      # Region record announcing a new chromosome: flush collected genes.
      chr = fields[8].split(';')[1].split('=').last
      gene_location[current_chr] = genes
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      attributes = {}
      fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
      end
      gene_name = attributes['gene'].nil? ? nil : attributes['gene']
      gene_synonyms = attributes['gene_synonym'].nil? ? [] : attributes['gene_synonym'].split(',')
      description = attributes['description']
      description = URI.unescape(description) unless description.nil?
      attributes['Dbxref'] =~ /GeneID:(\d+)/
      gene_list[$1] = [gene_name, gene_synonyms, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  gene_location[current_chr] = genes
  return gene_list, gene_location
end
|
421
|
+
|
422
|
+
# Query the KEGG REST API for the given gene ids (in batches of 10) and parse
# the flat-file response into gene attributes.
# Returns geneID => [gene names, definition, [[pathway id, pathway name], ...]].
# NOTE: performs network I/O against http://rest.kegg.jp and consumes
# (mutates) query_genes via shift, as the original did.
def parse_kegg_data(query_genes)
  kegg_data = {} # gene => attributes
  while !query_genes.empty?
    gene_set = query_genes.shift(10)
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    response = Net::HTTP.get(URI(url))
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        parsing_pathway_field = false
      elsif line == '///'
        # FIX: the record terminator is now checked BEFORE the pathway
        # continuation branch; previously, when PATHWAY was the last parsed
        # section, '///' was swallowed as a pathway line and the record was
        # never stored in kegg_data.
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2)
      end
    end
  end
  return kegg_data
end
|
459
|
+
|
460
|
+
# Serialize `data` to JSON and write it gzip-compressed to `path`.
def write_compressed_plain_file(data, path)
  File.open(path, 'w') do |f|
    gz = Zlib::GzipWriter.new(f)
    gz.write data.to_json
    gz.close
  end
end

# Read a gzip-compressed JSON file and return the parsed object.
def read_compressed_json(path)
  infile = open(path)
  gz = Zlib::GzipReader.new(infile)
  begin
    object = JSON.parse(gz.read)
  ensure
    # FIX: the reader (and its underlying file handle) was previously never
    # closed, leaking one file descriptor per call.
    gz.close
  end
  return object
end

# Download `path` from the given FTP server (anonymous login) and store it
# locally under `name`.
def download(ftp_server, path, name)
  ftp = Net::FTP.new()
  ftp.connect(ftp_server)
  ftp.login
  ftp.getbinaryfile(path, name)
  ftp.close
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# Parses a cohort file (one TSV record per line: id, ontology terms and
# optional genomic coordinates / sex) into a Cohort object.
class Cohort_Parser
  # Entry point: read options[:input_file] and build a Cohort.
  # Also stores the extracted field names in options[:extracted_fields].
  # Returns [cohort, rejected_terms, rejected_recs].
  def self.load(options)
    fields2extract = get_fields2extract(options)
    field_numbers = fields2extract.values
    records = read_records(options, fields2extract, field_numbers)
    options[:extracted_fields] = fields2extract.keys
    cohort, rejected_terms, rejected_recs = create_cohort(records, options)
    return cohort, rejected_terms, rejected_recs
  end

  # Read raw TSV records, grouping lines by record id. When no id column is
  # configured, sequential ids 'rec_<n>' are generated. With a header, column
  # names are resolved to indices from the first line.
  def self.read_records(options, fields2extract, field_numbers)
    records = {}
    count = 0
    File.open(options[:input_file]).each do |line|
      line.chomp!
      if options[:header] && count == 0
        line.gsub!(/#\s*/,'') # correct comment like headers
        field_names = line.split("\t")
        get_field_numbers2extract(field_names, fields2extract)
        field_numbers = fields2extract.values
      else
        fields = line.split("\t")
        record = field_numbers.map{|n| fields[n]}
        if fields2extract[:id_col].nil?
          id = "rec_#{count}" #generate ids
        else
          id = record.shift
        end
        # First remaining field holds the ontology terms (separator-joined).
        if !record[0].nil?
          record[0] = record[0].split(options[:separator])
        else
          record[0] = []
        end
        record[2] = record[2].to_i if !options[:start_col].nil?
        record[3] = record[3].to_i if !options[:end_col].nil?
        query = records[id]
        if query.nil?
          records[id] = [record]
        else
          query << record
        end
      end
      count += 1
    end
    return records
  end

  # Collect the configured column specifiers (index or, with a header, name)
  # for each supported field. Returns {field_symbol => column}.
  def self.get_fields2extract(options)
    fields2extract = {}
    [:id_col, :ont_col, :chromosome_col, :start_col, :end_col, :sex_col].each do |field|
      col = options[field]
      if !col.nil?
        col = col.to_i if !options[:header]
        fields2extract[field] = col
      end
    end
    return fields2extract
  end

  # Resolve header column names to their numeric indices, in place.
  def self.get_field_numbers2extract(field_names, fields2extract)
    fields2extract.each do |field, name|
      fields2extract[field] = field_names.index(name)
    end
  end

  # Build the Cohort from grouped records: translate term names to codes when
  # options[:names] is set, collect genomic variants and extra attributes.
  # Returns [cohort, unique rejected term names, rejected record ids].
  def self.create_cohort(records, options)
    ont = Cohort.get_ontology(Cohort.act_ont)
    rejected_terms = []
    rejected_recs = []
    cohort = Cohort.new()
    records.each do |id, record|
      rec = record.first
      terms = rec.first
      if options[:names] # Translate hpo names 2 codes
        init_term_number = terms.length
        terms, rec_rejected_terms = ont.translate_names(terms)
        if !rec_rejected_terms.empty?
          STDERR.puts "WARNING: record #{id} has the unknown term NAMES '#{rec_rejected_terms.join(',')}'. Terms removed."
          rejected_terms.concat(rec_rejected_terms)
        end
        # Drop records whose whole profile was unknown names.
        if terms.empty? && init_term_number > 0
          rejected_recs << id
          next
        end
      end
      if rec.length > 1 # there is genomic region attributes
        variants = record.map{|v| v[1..3] }
      else
        variants = [] # Not exists genomic region attributes so we create a empty array
      end
      other_attr = {}
      if options[:extracted_fields].include?(:sex_col) # Check for additional attributes. -1 is applied to ignore :id in extracted fields
        other_attr[:sex] = record.first[options[:extracted_fields].index(:sex_col) -1]
      end
      cohort.add_record([id, terms, check_variants(variants)], other_attr)
    end
    return cohort, rejected_terms.uniq, rejected_recs
  end

  # Drop variants without a defined chromosome ('-'), warning on STDERR for
  # each removed variant. Returns only the valid variants.
  def self.check_variants(vars)
    checked_vars = []
    vars.each do |var| #[chr, start, stop]
      if var.first == '-' # the chr must be defined
        STDERR.puts "WARNING: variant #{var.join(',')} has been removed"
      else
        checked_vars << var
      end
    end
    # FIX: previously returned the unfiltered `vars`, so invalid variants were
    # warned about but still kept in the cohort.
    return checked_vars
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'genomic_features'
|
2
|
+
# Loads genomic reference annotation files (currently GTF only) into
# Genomic_Feature objects.
class Reference_parser

  # Load file_path and return a Genomic_Feature with the parsed regions.
  # file_format defaults to everything after the first dot of the file name.
  # feature_type, when given, keeps only records of that GTF feature type.
  # Raises ArgumentError for unsupported formats.
  def self.load(file_path, file_format: nil, feature_type: nil)
    file_format = file_path.split('.', 2).last if file_format.nil?
    if file_format == 'gtf'
      regions, all_attrs = parse_gtf(file_path, feature_type: feature_type)
    else
      # FIX: previously an unsupported format fell through with nil regions
      # and failed obscurely inside Genomic_Feature; fail fast instead.
      raise ArgumentError, "Unsupported reference file format: #{file_format}"
    end

    return Genomic_Feature.new(regions, annotations: all_attrs)
  end

  # Parse a GTF file (https://www.ensembl.org/info/website/upload/gff.html).
  # Returns [features, all_attrs]: features rows are [chr, start, stop, gene_id]
  # (chr stripped of any 'chr' prefix) and all_attrs maps gene_id => attribute
  # hash, augmented with the record's 'source' and 'feature' columns.
  def self.parse_gtf(file_path, feature_type: nil)
    features = []
    all_attrs = {}
    File.open(file_path).each do |line|
      next if /^#/ =~ line
      seqname, source, feature, start, stop, _score, _strand, _frame, attribute = line.chomp.split("\t")
      next unless feature_type.nil? || feature_type == feature
      attrs = process_attrs(attribute, ';', ' ')
      attrs['source'] = source
      attrs['feature'] = feature
      id = attrs['gene_id']
      features << [seqname.gsub('chr', ''), start.to_i, stop.to_i, id]
      all_attrs[id] = attrs
    end
    return features, all_attrs
  end

  private

  # Split a GTF attribute string ('key "value"; key2 "value2"') into a hash,
  # stripping the quotes around values.
  def self.process_attrs(attributes, tuple_sep, field_sep)
    return attributes.split(tuple_sep).map{|attr_pair|
      tuple = attr_pair.strip.split(field_sep, 2)
      tuple.last.gsub!('"', '')
      tuple
    }.to_h
  end
end
|
data/lib/pets/version.rb
CHANGED
data/lib/pets.rb
CHANGED
@@ -1,4 +1,13 @@
|
|
1
1
|
require "pets/version"
|
2
|
+
require "pets/constants"
|
3
|
+
require "pets/parsers/cohort_parser"
|
4
|
+
require "pets/parsers/reference_parser"
|
5
|
+
require "pets/coPatReporterMethods"
|
6
|
+
require "pets/generalMethods"
|
7
|
+
require "pets/io"
|
8
|
+
require "pets/phen2reg_methods"
|
9
|
+
require "pets/cohort"
|
10
|
+
require "pets/genomic_features"
|
2
11
|
|
3
12
|
module Pets
|
4
13
|
class Error < StandardError; end
|
data/pets.gemspec
CHANGED
@@ -38,14 +38,18 @@ Gem::Specification.new do |spec|
|
|
38
38
|
|
39
39
|
spec.add_development_dependency "bundler", "~> 2.0"
|
40
40
|
spec.add_development_dependency "rake", "~> 13.0.3"
|
41
|
-
spec.add_development_dependency "rspec", "~> 3.
|
41
|
+
spec.add_development_dependency "rspec", "~> 3.11.0"
|
42
42
|
spec.add_dependency "statistics2"
|
43
43
|
spec.add_dependency "terminal-table"
|
44
44
|
spec.add_dependency "semtools", "~> 0.1.0"
|
45
|
+
spec.add_dependency "NetAnalyzer"
|
45
46
|
spec.add_dependency "report_html"
|
46
47
|
spec.add_dependency "numo-narray"
|
47
48
|
spec.add_dependency "npy"
|
48
|
-
spec.add_dependency "
|
49
|
-
|
49
|
+
spec.add_dependency "expcalc"
|
50
|
+
spec.add_dependency "bio-vcf"
|
51
|
+
spec.add_dependency "parallel", "~> 1.20.1"
|
52
|
+
spec.add_runtime_dependency 'net-ftp'
|
53
|
+
spec.add_runtime_dependency 'net-http'
|
50
54
|
end
|
51
55
|
|