pets 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +68 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +102 -150
- data/bin/get_gen_features.rb +146 -0
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +8 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +86 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +16 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +172 -424
- data/lib/pets/cohort.rb +309 -0
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +8 -0
- data/lib/pets/generalMethods.rb +29 -319
- data/lib/pets/genomic_features.rb +240 -0
- data/lib/pets/io.rb +481 -0
- data/lib/pets/parsers/cohort_parser.rb +111 -0
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +9 -0
- data/pets.gemspec +7 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/cohort_report.erb +5 -7
- data/templates/evidence_profile.erb +20 -4
- data/templates/patient_report.erb +1 -1
- metadata +96 -5
data/lib/pets/cohort.rb
ADDED
@@ -0,0 +1,309 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'semtools'
|
3
|
+
|
4
|
+
# Collection of records (e.g. patients), each one with an ontology term profile
# and, optionally, a set of genomic features (variants) and extra attributes.
# Ontology dependent operations are delegated to the ontology registered under
# the key stored in Cohort.act_ont.
class Cohort
  # Loaded ontologies, keyed by the name given to Cohort.load_ontology.
  @@ont = {}
  class << self # https://www.ruby-forum.com/t/attr-accessor-for-class-variable/136693
    attr_accessor :act_ont # Which ontology use for ont related operations
  end

  # Hash of record id => array of terms (stored as symbols by add_record).
  attr_accessor :profiles

  # Returns the ontology registered as +ont_id+ (nil when it was never loaded).
  def self.get_ontology(ont_id)
    @@ont[ont_id]
  end

  # Loads an ontology and registers it under +ont_name+.
  # ont_file:: path to the ontology. Paths containing '.json' are loaded with
  #            Ontology#read, any other path through the Ontology constructor.
  # excluded_terms_file:: optional file with one term per line, handed to the
  #                       constructor as removable_terms (ignored for '.json' files).
  # Returns the loaded ontology.
  def self.load_ontology(ont_name, ont_file, excluded_terms_file = nil)
    if ont_file.include?('.json')
      ont = Ontology.new
      ont.read(ont_file)
    elsif excluded_terms_file.nil?
      ont = Ontology.new(file: ont_file, load_file: true)
    else
      excluded_terms = read_excluded_ont_file(excluded_terms_file)
      ont = Ontology.new(file: ont_file, load_file: true, removable_terms: excluded_terms)
    end
    @@ont[ont_name] = ont
  end

  # Reads +file+ and returns its lines, without line terminator, as an array.
  def self.read_excluded_ont_file(file)
    # File.foreach closes the file once the enumeration ends (no leaked handle).
    File.foreach(file).map(&:chomp)
  end

  def initialize()
    @profiles = {}
    @vars = {}
    @extra_attr = {}
    @var_idx = Genomic_Feature.new([])
  end

  # Adds a record to the cohort.
  # rec:: [id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
  #       profile and variants may be nil, in which case they are not stored.
  # extra_attr:: optional attributes for the record (export_phenopackets reads :sex).
  def add_record(rec, extra_attr = nil)
    id, profile, vars = rec
    @profiles[id] = profile.map(&:to_sym) unless profile.nil?
    @extra_attr[id] = extra_attr unless extra_attr.nil?
    add_gen_feat(id, vars) unless vars.nil?
  end

  # Removes the profile and the variants stored for +id+.
  def delete(id)
    @profiles.delete(id)
    @vars.delete(id)
  end

  # Keeps only the records whose (id, profile) make the block return true.
  # Variants of discarded records are removed too.
  def select_by_profile!
    @profiles.select! { |id, profile| yield(id, profile) }
    @vars.select! { |id, _var| @profiles.key?(id) }
  end

  # Keeps only the records whose (id, variants) make the block return true.
  # Profiles of discarded records are removed too.
  def select_by_var!
    @vars.select! { |id, var| yield(id, var) }
    @profiles.select! { |id, _profile| @vars.key?(id) }
  end

  # Discards the records whose profile has fewer than +n_terms+ terms.
  def filter_by_term_number(n_terms)
    select_by_profile! { |_id, profile| profile.length >= n_terms }
  end

  # Removes the records that lack variants or phenotypes.
  def remove_incomplete_records
    @profiles.select! { |id, _profile| @vars.key?(id) && @vars[id].length > 0 }
    # Only ids with a profile AND at least one variant survived the previous step.
    @vars.select! { |id, _var| @profiles.key?(id) }
  end

  # Stores the genomic features of record +id+.
  # feat_array:: [[chr1, start1, stop1],[chr1, start1, stop1]]
  def add_gen_feat(id, feat_array)
    @vars[id] = Genomic_Feature.new(feat_array)
  end

  def get_profile(id)
    @profiles[id]
  end

  def get_vars(id)
    @vars[id]
  end

  # Yields (id, profile) for every record with a profile.
  def each_profile()
    @profiles.each do |id, profile|
      yield(id, profile)
    end
  end

  # Yields (id, variants) for every record with variants.
  def each_var()
    @vars.each do |id, var_info|
      yield(id, var_info)
    end
  end

  # Builds a profile with the terms present in at least the fraction +thr+ of
  # the records and cleans it with the active ontology (clean_profile_hard).
  def get_general_profile(thr=0) # TODO move funcionality to semtools
    term_count = Hash.new(0)
    each_profile do |_id, prof|
      prof.each { |term| term_count[term] += 1 }
    end
    records = @profiles.length
    general_profile = term_count.select { |_term, count| count.fdiv(records) >= thr }.keys
    active_ontology.clean_profile_hard(general_profile)
  end

  # Validates every profile against the active ontology (OLD format_patient_data).
  # hard:: when true profiles are cleaned with clean_profile_hard, otherwise the
  #        terms are checked with check_ids and the unknown ones are reported on STDERR.
  # Records left without terms are removed from the cohort.
  # Returns [unique rejected terms, ids of the rejected records].
  def check(hard=false)
    ont = active_ontology
    rejected_terms = []
    rejected_recs = []
    @profiles.each do |id, terms|
      if hard
        terms = ont.clean_profile_hard(terms)
        rejec_terms = []
      else
        terms, rejec_terms = ont.check_ids(terms)
      end
      unless rejec_terms.empty?
        STDERR.puts "WARNING: record #{id} has the unknown CODES '#{rejec_terms.join(',')}'. Codes removed."
        rejected_terms.concat(rejec_terms)
      end
      if terms.empty?
        rejected_recs << id
      else
        @profiles[id] = terms # existing key, so it is safe while iterating
      end
    end
    rejected_recs.each { |id| delete(id) }
    return rejected_terms.uniq, rejected_recs
  end

  # Loads the cohort profiles into the ontology registered as +ont_id+.
  def link2ont(ont_id)
    @@ont[ont_id].load_profiles(@profiles)
  end

  # The following methods delegate to the method of the same name in the active
  # ontology; they operate on whatever profiles that ontology holds
  # (presumably the ones loaded with link2ont, confirm in semtools).

  def get_profile_redundancy
    profile_sizes, parental_terms_per_profile = active_ontology.get_profile_redundancy
    return profile_sizes, parental_terms_per_profile
  end

  def get_profiles_terms_frequency(options={})
    # Explicit ** needed since Ruby 3.0:
    # https://www.ruby-lang.org/en/news/2019/12/12/separation-of-positional-and-keyword-arguments-in-ruby-3-0/
    active_ontology.get_profiles_terms_frequency(**options)
  end

  def compute_term_list_and_childs()
    active_ontology.compute_term_list_and_childs()
  end

  # Returns the ontology level tables with a header row prepended to each one.
  def get_profile_ontology_distribution_tables()
    ontology_levels, distribution_percentage = active_ontology.get_profile_ontology_distribution_tables
    ontology_levels.unshift(["level", "ontology", "cohort"])
    distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
    return ontology_levels, distribution_percentage
  end

  def get_ic_analysis()
    ont = active_ontology
    onto_ic, freq_ic = ont.get_observed_ics_by_onto_and_freq # IC for TERMS
    onto_ic_profile, freq_ic_profile = ont.get_profiles_resnik_dual_ICs # IC for PROFILES
    return onto_ic, freq_ic, onto_ic_profile, freq_ic_profile
  end

  def get_profiles_mean_size
    active_ontology.get_profiles_mean_size
  end

  def get_profile_length_at_percentile(perc=50, increasing_sort: false)
    active_ontology.get_profile_length_at_percentile(perc, increasing_sort: increasing_sort)
  end

  def get_dataset_specifity_index(type)
    active_ontology.get_dataset_specifity_index(type)
  end

  def compare_profiles(options={})
    active_ontology.compare_profiles(**options)
  end

  # Merges the variants of every record into the cohort level index
  # (equivalent to process_patient_data).
  def index_vars
    each_var do |id, var|
      @var_idx.merge(var, id)
    end
  end

  # Sizes of the indexed variants; summarized when +summary+ is true.
  def get_vars_sizes(summary=false)
    if summary
      @var_idx.get_summary_sizes
    else
      @var_idx.get_sizes
    end
  end

  def generate_cluster_regions(meth, tag, lim)
    @var_idx.generate_cluster_regions(meth, tag, lim)
  end

  # Writes the cohort to +output_file+ as tab separated text, one line per
  # record and variant ('-' placeholders when a record has no variants).
  # mode:: :default writes "id, terms, chr, start, stop" without header;
  #        :paco writes "id, chr, start, stop, terms" preceded by a header line.
  #        Symbols and strings are both accepted.
  # translate:: when true the terms are converted with the active ontology's
  #             translate_ids before writing.
  # Aborts the program when +mode+ is not default or paco.
  def save(output_file, mode = :default, translate = false)
    save_mode = mode.to_s
    # Validate before opening so a wrong mode never truncates the output file.
    abort('Wrong save mode option, please try default or paco') unless %w[default paco].include?(save_mode)
    ont = active_ontology
    File.open(output_file, 'w') do |f|
      f.puts "id\tchr\tstart\tstop\tterms" if save_mode == 'paco'
      @profiles.each do |id, terms|
        terms, _rejected = ont.translate_ids(terms) if translate
        variant_rows(id).each do |var|
          if save_mode == 'paco'
            f.puts "#{id}\t#{var.join("\t")}\t#{terms.join('|')}"
          else
            f.puts "#{id}\t#{terms.join('|')}\t#{var.join("\t")}"
          end
        end
      end
    end
  end

  # Writes one "<id>.json" file per record in +output_folder+ with the record
  # terms as phenotypic features.
  # genome_assembly:: value stored in the genomeAssembly field of the VCF entry.
  # vcf_index:: optional hash of record id => VCF path; records found in it get
  #             an htsFiles entry pointing to that file.
  def export_phenopackets(output_folder, genome_assembly, vcf_index: nil)
    ont = active_ontology
    metaData = {
      "createdBy" => "PETS",
      "resources" => [{
        "id" => "hp",
        "name" => "human phenotype ontology",
        "namespacePrefix" => "HP",
        "url" => "http://purl.obolibrary.org/obo/hp.owl",
        # "version" => "2018-03-08",
        "iriPrefix" => "http://purl.obolibrary.org/obo/HP_"
      }]
    }

    @profiles.each do |id, terms|
      phenopacket = {metaData: metaData}
      query_sex = @extra_attr.dig(id, :sex)
      sex = query_sex.nil? ? 'UNKNOWN_SEX' : query_sex
      phenopacket[:subject] = {
        id: id,
        sex: sex
      }
      phenopacket[:phenotypicFeatures] = terms.map do |term|
        {
          type: { id: term, label: ont.translate_id(term)},
          classOfOnset: {"id" => "HP:0003577", "label" => "Congenital onset"}
        }
      end
      if !vcf_index.nil? && vcf_index.include?(id)
        phenopacket[:htsFiles] = [{
          "uri" => "file:/" + vcf_index[id],
          "description" => id,
          "htsFormat" => "VCF",
          "genomeAssembly" => genome_assembly,
          "individualToSampleIdentifiers" => { "patient1" => id }
        }]
      end
      File.open(File.join(output_folder, id.to_s + ".json"), "w") { |f| f.write JSON.pretty_generate(phenopacket) }
    end
  end

  private

  # Ontology currently selected through Cohort.act_ont (nil if not loaded).
  def active_ontology
    @@ont[Cohort.act_ont]
  end

  # Rows [chr, start, stop] for the variants of +id+, or a single row of '-'
  # placeholders when the record has no variants.
  def variant_rows(id)
    id_variants = @vars[id]
    return [['-', '-', '-']] if id_variants.nil? || id_variants.length == 0
    rows = []
    id_variants.each do |chr, reg|
      rows << [chr, reg[:start], reg[:stop]]
    end
    rows
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Command line options shared by several scripts.
# This fragment expects two names to be already defined where it is evaluated:
# `options` (hash that receives the parsed values) and `opts` (the option
# parser, presumably an OptionParser instance - confirm in the calling script).

options[:chromosome_col] = nil
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
  options[:chromosome_col] = data
end

options[:id_col] = nil
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
  options[:id_col] = data
end

options[:end_col] = nil
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
  options[:end_col] = data
end

options[:genome_assembly] = 'hg38'
opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
  options[:genome_assembly] = data
end

# A header line is expected by default; passing -H switches it OFF.
options[:header] = true
#chr\tstart\tstop
opts.on("-H", "--header", "Set if the file has NO line header. By default a line header is expected") do
  options[:header] = false
end

options[:sex_col] = nil
opts.on("-x", "--sex_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient sex") do |data|
  options[:sex_col] = data
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# The file requiring this one must define the ROOT_PATH constant beforehand.
# Every location below is resolved relative to the parent folder of ROOT_PATH.
from_parent = ->(*segments) { File.expand_path(File.join(ROOT_PATH, '..', *segments)) }

COMMON_OPTPARSE = from_parent.call('lib', 'pets', 'common_optparse.rb')
REPORT_FOLDER = from_parent.call('templates')
EXTERNAL_DATA = from_parent.call('external_data')
EXTERNAL_CODE = from_parent.call('external_code')

# Data files located inside EXTERNAL_DATA.
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
|