pets 0.2.3 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +68 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +102 -150
- data/bin/get_gen_features.rb +146 -0
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +8 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +86 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +16 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +172 -424
- data/lib/pets/cohort.rb +309 -0
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +8 -0
- data/lib/pets/generalMethods.rb +29 -319
- data/lib/pets/genomic_features.rb +240 -0
- data/lib/pets/io.rb +481 -0
- data/lib/pets/parsers/cohort_parser.rb +111 -0
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +9 -0
- data/pets.gemspec +7 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/cohort_report.erb +5 -7
- data/templates/evidence_profile.erb +20 -4
- data/templates/patient_report.erb +1 -1
- metadata +96 -5
data/lib/pets/cohort.rb
ADDED
@@ -0,0 +1,309 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'semtools'
|
3
|
+
|
4
|
+
# A set of records (e.g. patients), each identified by an id and optionally
# carrying an ontology-term profile, a set of genomic features and a hash of
# extra attributes. Ontology-related operations are delegated to the ontology
# registered under Cohort.act_ont.
#
# Ontology and Genomic_Feature are not defined in this class; they must be
# loaded by the requiring code before the methods that use them are called.
class Cohort
  # Loaded ontologies keyed by the name given to Cohort.load_ontology.
  # Shared by every Cohort instance.
  @@ont = {}
  class << self # https://www.ruby-forum.com/t/attr-accessor-for-class-variable/136693
    attr_accessor :act_ont # Key of the ontology used for ontology-related operations
  end

  # Hash: record id => array of ontology terms (stored as symbols by add_record).
  attr_accessor :profiles

  # Returns the ontology registered under ont_id, or nil when none is loaded.
  def self.get_ontology(ont_id)
    @@ont[ont_id]
  end

  # Loads an ontology and registers it under ont_name.
  #
  # ont_file            - path to the ontology. When the path contains '.json'
  #                       an empty Ontology is created and filled with #read;
  #                       otherwise the file is passed to Ontology.new.
  # excluded_terms_file - optional path with one term per line, handed to
  #                       Ontology.new as removable_terms (ignored for '.json').
  #
  # Returns the loaded ontology.
  def self.load_ontology(ont_name, ont_file, excluded_terms_file = nil)
    if ont_file.include?('.json')
      ont = Ontology.new
      ont.read(ont_file)
    else
      ont_args = { file: ont_file, load_file: true }
      ont_args[:removable_terms] = read_excluded_ont_file(excluded_terms_file) unless excluded_terms_file.nil?
      ont = Ontology.new(**ont_args)
    end
    @@ont[ont_name] = ont
  end

  # Reads a file with one term per line and returns the lines without their
  # line terminators. File.foreach closes the file once it has been consumed.
  def self.read_excluded_ont_file(file)
    File.foreach(file).map(&:chomp)
  end

  def initialize()
    @profiles = {}
    @vars = {}        # record id => Genomic_Feature
    @extra_attr = {}  # record id => hash of extra attributes (e.g. :sex)
    @var_idx = Genomic_Feature.new([]) # filled by #index_vars
  end

  # Adds one record.
  #
  # rec        - [id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]];
  #              the profile and the feature list may each be nil, in which
  #              case that part is left untouched.
  # extra_attr - optional attributes stored for the id.
  def add_record(rec, extra_attr = nil)
    id, profile, vars = rec
    @profiles[id] = profile.map(&:to_sym) unless profile.nil?
    @extra_attr[id] = extra_attr unless extra_attr.nil?
    add_gen_feat(id, vars) unless vars.nil?
  end

  # Removes everything stored for id (profile, extra attributes and features).
  # Returns the removed Genomic_Feature, or nil if the id had none.
  def delete(id)
    @profiles.delete(id)
    @extra_attr.delete(id) # avoid stale attributes if the id is added again later
    @vars.delete(id)
  end

  # Keeps only the records whose (id, profile) pair makes the block return a
  # truthy value; features of discarded records are dropped as well.
  def select_by_profile!
    @profiles.select! { |id, profile| yield(id, profile) }
    @vars.select! { |id, _var| @profiles.key?(id) }
  end

  # Keeps only the records whose (id, features) pair makes the block return a
  # truthy value; profiles of discarded records are dropped as well.
  def select_by_var!
    @vars.select! { |id, var| yield(id, var) }
    @profiles.select! { |id, _profile| @vars.key?(id) }
  end

  # Keeps the records whose profile has at least n_terms terms.
  def filter_by_term_number(n_terms)
    select_by_profile! { |_id, profile| profile.length >= n_terms }
  end

  # Removes the records that lack a profile or lack genomic features.
  def remove_incomplete_records
    @profiles.select! do |id, _profile|
      regs = @vars[id]
      !regs.nil? && regs.length > 0
    end
    # Every surviving profile has features, so the complete records are
    # exactly the feature sets whose id is still among the profiles.
    @vars.select! { |id, _var| @profiles.key?(id) }
  end

  # Stores the genomic features of a record.
  # feat_array - [[chr1, start1, stop1],[chr1, start1, stop1]]
  def add_gen_feat(id, feat_array)
    @vars[id] = Genomic_Feature.new(feat_array)
  end

  # Returns the profile stored for id, or nil.
  def get_profile(id)
    @profiles[id]
  end

  # Returns the Genomic_Feature stored for id, or nil.
  def get_vars(id)
    @vars[id]
  end

  # Yields (id, profile) for every stored profile.
  def each_profile()
    @profiles.each { |id, profile| yield(id, profile) }
  end

  # Yields (id, genomic features) for every record with features.
  def each_var()
    @vars.each { |id, var_info| yield(id, var_info) }
  end

  # Collects the terms whose frequency across profiles (occurrences divided by
  # the number of profiles) is at least thr, and returns the result of passing
  # them through the active ontology's clean_profile_hard.
  def get_general_profile(thr=0) # TODO move funcionality to semtools
    term_count = Hash.new(0)
    each_profile { |_id, prof| prof.each { |term| term_count[term] += 1 } }
    records = @profiles.length
    general_profile = term_count.select { |_term, count| count.fdiv(records) >= thr }.keys
    active_ontology.clean_profile_hard(general_profile)
  end

  # Validates every profile against the active ontology (OLD format_patient_data).
  #
  # hard - when true, profiles are replaced by ont.clean_profile_hard(terms) and
  #        no rejected terms are reported; otherwise ont.check_ids splits each
  #        profile into accepted and rejected terms and a warning is written to
  #        STDERR for the rejected ones.
  #
  # Records left without terms are removed together with their features.
  # Returns [unique rejected terms, ids of the removed records].
  def check(hard=false)
    ont = active_ontology
    rejected_terms = []
    rejected_recs = []
    @profiles.each do |id, terms|
      if hard
        terms = ont.clean_profile_hard(terms)
        rejec_terms = []
      else
        terms, rejec_terms = ont.check_ids(terms)
      end
      unless rejec_terms.empty?
        STDERR.puts "WARNING: record #{id} has the unknown CODES '#{rejec_terms.join(',')}'. Codes removed."
        rejected_terms.concat(rejec_terms)
      end
      if terms.empty?
        rejected_recs << id
      else
        @profiles[id] = terms # existing key, so safe while iterating
      end
    end
    rejected_recs.each do |id|
      @profiles.delete(id)
      @vars.delete(id)
    end
    return rejected_terms.uniq, rejected_recs
  end

  # Hands the cohort profiles to the ontology registered under ont_id.
  def link2ont(ont_id)
    @@ont[ont_id].load_profiles(@profiles)
  end

  # Delegates to the active ontology; returns [profile_sizes, parental_terms_per_profile].
  def get_profile_redundancy
    profile_sizes, parental_terms_per_profile = active_ontology.get_profile_redundancy
    return profile_sizes, parental_terms_per_profile
  end

  # Delegates to the active ontology, forwarding options as keyword arguments.
  def get_profiles_terms_frequency(options={})
    active_ontology.get_profiles_terms_frequency(**options) #https://www.ruby-lang.org/en/news/2019/12/12/separation-of-positional-and-keyword-arguments-in-ruby-3-0/
  end

  # Delegates to the active ontology and returns its result unchanged.
  def compute_term_list_and_childs()
    active_ontology.compute_term_list_and_childs()
  end

  # Returns the two tables produced by the active ontology, each with a header
  # row prepended.
  def get_profile_ontology_distribution_tables()
    ontology_levels, distribution_percentage = active_ontology.get_profile_ontology_distribution_tables
    ontology_levels.unshift(["level", "ontology", "cohort"])
    distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
    return ontology_levels, distribution_percentage
  end

  # Returns [onto_ic, freq_ic, onto_ic_profile, freq_ic_profile] as computed by
  # the active ontology.
  def get_ic_analysis()
    ont = active_ontology
    onto_ic, freq_ic = ont.get_observed_ics_by_onto_and_freq # IC for TERMS
    onto_ic_profile, freq_ic_profile = ont.get_profiles_resnik_dual_ICs # IC for PROFILES
    return onto_ic, freq_ic, onto_ic_profile, freq_ic_profile
  end

  # Delegates to the active ontology.
  def get_profiles_mean_size
    active_ontology.get_profiles_mean_size
  end

  # Delegates to the active ontology.
  def get_profile_length_at_percentile(perc=50, increasing_sort: false)
    active_ontology.get_profile_length_at_percentile(perc, increasing_sort: increasing_sort)
  end

  # Delegates to the active ontology.
  def get_dataset_specifity_index(type)
    active_ontology.get_dataset_specifity_index(type)
  end

  # Delegates to the active ontology, forwarding options as keyword arguments.
  def compare_profiles(options={})
    active_ontology.compare_profiles(**options)
  end

  # Merges the features of every record into the cohort-wide index
  # (equivalent to process_patient_data).
  def index_vars
    each_var { |id, var| @var_idx.merge(var, id) }
  end

  # Returns the sizes reported by the cohort-wide feature index; the summary
  # variant when summary is truthy. Call #index_vars first to fill the index.
  def get_vars_sizes(summary=false)
    summary ? @var_idx.get_summary_sizes : @var_idx.get_sizes
  end

  # Delegates to the cohort-wide feature index.
  def generate_cluster_regions(meth, tag, lim)
    @var_idx.generate_cluster_regions(meth, tag, lim)
  end

  # Writes the cohort as a tab-separated file, one line per genomic feature
  # (records without features get a single line with '-' placeholders).
  #
  # output_file - path of the file to write.
  # mode        - :default writes "id, terms, chr, start, stop" rows without a
  #               header; :paco writes a header plus "id, chr, start, stop,
  #               terms" rows. Strings ('default', 'paco') are accepted too.
  # translate   - when truthy, terms are passed through ont.translate_ids first.
  #
  # Aborts before touching output_file when mode is not recognised.
  def save(output_file, mode = :default, translate = false)
    mode = mode.to_s.to_sym
    abort('Wrong save mode option, please try default or paco') unless %i[default paco].include?(mode)
    ont = active_ontology
    File.open(output_file, 'w') do |f|
      f.puts "id\tchr\tstart\tstop\tterms" if mode == :paco
      @profiles.each do |id, terms|
        terms, _rejected = ont.translate_ids(terms) if translate
        term_field = terms.join('|')
        variant_rows(id).each do |var|
          coords = var.join("\t")
          if mode == :paco
            f.puts "#{id}\t#{coords}\t#{term_field}"
          else
            f.puts "#{id}\t#{term_field}\t#{coords}"
          end
        end
      end
    end
  end

  # Writes one "<id>.json" file per profile into output_folder.
  #
  # genome_assembly - value stored in the htsFiles entry.
  # vcf_index       - optional collection answering include?(id) and [id];
  #                   when it holds the id, an htsFiles entry pointing to
  #                   vcf_index[id] is added.
  #
  # Sex is taken from the record's extra attributes (:sex) and defaults to
  # 'UNKNOWN_SEX'. Every term is labelled with ont.translate_id.
  def export_phenopackets(output_folder, genome_assembly, vcf_index: nil)
    ont = active_ontology
    metaData = {
      "createdBy" => "PETS",
      "resources" => [{
        "id" => "hp",
        "name" => "human phenotype ontology",
        "namespacePrefix" => "HP",
        "url" => "http://purl.obolibrary.org/obo/hp.owl",
        # "version" => "2018-03-08",
        "iriPrefix" => "http://purl.obolibrary.org/obo/HP_"
      }]
    }

    @profiles.each do |id, terms|
      phenopacket = {metaData: metaData}
      query_sex = @extra_attr.dig(id, :sex)
      sex = query_sex.nil? ? 'UNKNOWN_SEX' : query_sex
      phenopacket[:subject] = {
        id: id,
        sex: sex
      }
      phenopacket[:phenotypicFeatures] = terms.map do |term|
        {
          type: { id: term, label: ont.translate_id(term)},
          classOfOnset: {"id" => "HP:0003577", "label" => "Congenital onset"}
        }
      end
      if !vcf_index.nil? && vcf_index.include?(id)
        phenopacket[:htsFiles] = [{
          "uri" => "file:/" + vcf_index[id],
          "description" => id,
          "htsFormat" => "VCF",
          "genomeAssembly" => genome_assembly,
          "individualToSampleIdentifiers" => { "patient1" => id }
        }]
      end
      File.open(File.join(output_folder, id.to_s + ".json"), "w") { |f| f.write JSON.pretty_generate(phenopacket) }
    end
  end

  private

  # Ontology currently selected through Cohort.act_ont (nil when not loaded).
  def active_ontology
    @@ont[Cohort.act_ont]
  end

  # Returns the [chr, start, stop] rows of a record, or one placeholder row
  # when the record has no genomic features.
  def variant_rows(id)
    id_variants = @vars[id]
    return [['-', '-', '-']] if id_variants.nil? || id_variants.length == 0
    rows = []
    id_variants.each { |chr, reg| rows << [chr, reg[:start], reg[:stop]] }
    rows
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Command-line options shared by several scripts.
# This fragment defines neither `opts` nor `options`: it expects the evaluating
# scope to provide `opts` (it must respond to #on, as an OptionParser does) and
# `options` (the hash that receives the parsed values).
# Column options are stored exactly as given on the command line (strings).

options[:chromosome_col] = nil
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
  options[:chromosome_col] = data
end

options[:id_col] = nil
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
  options[:id_col] = data
end

options[:end_col] = nil
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
  options[:end_col] = data
end

options[:genome_assembly] = 'hg38'
opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
  options[:genome_assembly] = data
end

# The header is assumed to be present unless -H is given: the flag switches
# options[:header] to false, so its help text must describe the "no header" case.
options[:header] = true
#chr\tstart\tstop
opts.on("-H", "--header", "Set if the file has NO header line. By default the file is assumed to have one") do
  options[:header] = false
end

options[:sex_col] = nil
opts.on("-x", "--sex_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient sex") do |data|
  options[:sex_col] = data
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# The file requiring this one must define the ROOT_PATH constant beforehand.
# Every location below is resolved relative to the parent directory of ROOT_PATH.
pets_path = ->(*parts) { File.expand_path(File.join(ROOT_PATH, '..', *parts)) }

COMMON_OPTPARSE = pets_path.call('lib', 'pets', 'common_optparse.rb')
REPORT_FOLDER = pets_path.call('templates')
EXTERNAL_DATA = pets_path.call('external_data')
EXTERNAL_CODE = pets_path.call('external_code')

# Data files looked up inside EXTERNAL_DATA.
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
|