pets 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
@@ -0,0 +1,309 @@
1
+ require 'json'
2
+ require 'semtools'
3
+
4
# Container for a cohort of records. Each record id may have an ontology term
# profile, a set of genomic variants and a hash of extra attributes. Most
# analysis methods simply delegate to the ontology registered under
# Cohort.act_ont, so that ontology must be loaded before they are called.
class Cohort
  # Ontologies loaded so far, keyed by the name passed to Cohort.load_ontology.
  @@ont = {}
  class << self # https://www.ruby-forum.com/t/attr-accessor-for-class-variable/136693
    attr_accessor :act_ont # Which ontology use for ont related operations
  end

  attr_accessor :profiles

  # Returns the ontology stored under ont_id, or nil when none was loaded.
  def self.get_ontology(ont_id)
    @@ont[ont_id]
  end

  # Builds an Ontology from ont_file and registers it under ont_name.
  # File names containing '.json' are loaded with Ontology#read; anything else
  # is handed to the Ontology constructor, optionally together with the terms
  # listed in excluded_terms_file as removable terms.
  # Returns the registered ontology.
  def self.load_ontology(ont_name, ont_file, excluded_terms_file = nil)
    if ont_file.include?('.json')
      ont = Ontology.new
      ont.read(ont_file)
    else
      ont_args = { file: ont_file, load_file: true }
      ont_args[:removable_terms] = read_excluded_ont_file(excluded_terms_file) unless excluded_terms_file.nil?
      ont = Ontology.new(**ont_args)
    end
    @@ont[ont_name] = ont
  end

  # Reads one excluded term per line and returns them as an array of strings
  # with the line terminator removed. File.readlines closes the file handle,
  # which the previous File.open(...).each form never did.
  def self.read_excluded_ont_file(file)
    File.readlines(file).map(&:chomp)
  end

  def initialize()
    @profiles = {}   # id => array of ontology terms (symbols)
    @vars = {}       # id => Genomic_Feature with the record variants
    @extra_attr = {} # id => extra attributes given to add_record
    @var_idx = Genomic_Feature.new([])
  end

  # Adds one record.
  # rec: [id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
  # Profile and variants may be nil, in which case they are not stored.
  # extra_attr: optional per-record attributes (export_phenopackets reads :sex).
  def add_record(rec, extra_attr = nil)
    id, profile, vars = rec
    @profiles[id] = profile.map(&:to_sym) unless profile.nil?
    @extra_attr[id] = extra_attr unless extra_attr.nil?
    add_gen_feat(id, vars) unless vars.nil?
  end

  # Removes every piece of data stored for id (profile, extra attributes and
  # variants). Returns the removed variants, or nil when there were none.
  def delete(id)
    @profiles.delete(id)
    @extra_attr.delete(id)
    @vars.delete(id)
  end

  # Keeps only the profiles for which the block returns a truthy value, then
  # drops the variants of every record that lost its profile.
  def select_by_profile!
    @profiles.select! { |id, profile| yield(id, profile) }
    @vars.select! { |id, _var| @profiles.key?(id) }
  end

  # Keeps only the variant sets for which the block returns a truthy value,
  # then drops the profile of every record that lost its variants.
  def select_by_var!
    @vars.select! { |id, var| yield(id, var) }
    @profiles.select! { |id, _profile| @vars.key?(id) }
  end

  # Keeps the records whose profile has at least n_terms terms.
  def filter_by_term_number(n_terms)
    select_by_profile! { |_id, profile| profile.length >= n_terms }
  end

  # Removes the records that lack a profile or lack variants
  # (a variant set of length 0 counts as lacking variants).
  def remove_incomplete_records
    @profiles.select! do |id, _prof|
      regs = @vars[id]
      !regs.nil? && regs.length > 0
    end
    # At this point @profiles holds exactly the complete ids.
    @vars.select! { |id, _var| @profiles.key?(id) }
  end

  # Stores the variants of a record.
  # feat_array: [[chr1, start1, stop1],[chr1, start1, stop1]]
  def add_gen_feat(id, feat_array)
    @vars[id] = Genomic_Feature.new(feat_array)
  end

  def get_profile(id)
    @profiles[id]
  end

  def get_vars(id)
    @vars[id]
  end

  # Yields (id, profile) for every stored profile.
  def each_profile()
    @profiles.each do |id, profile|
      yield(id, profile)
    end
  end

  # Yields (id, variants) for every stored variant set.
  def each_var()
    @vars.each do |id, var_info|
      yield(id, var_info)
    end
  end

  # Returns the terms present in at least a fraction thr of the profiles,
  # passed through clean_profile_hard of the active ontology.
  def get_general_profile(thr = 0) # TODO move funcionality to semtools
    term_count = Hash.new(0)
    each_profile do |_id, prof|
      prof.each { |term| term_count[term] += 1 }
    end
    records = @profiles.length
    general_profile = term_count.select { |_term, count| count.fdiv(records) >= thr }.keys
    @@ont[Cohort.act_ont].clean_profile_hard(general_profile)
  end

  # Validates every profile against the active ontology (OLD format_patient_data).
  # hard = true  -> profiles are replaced by ont.clean_profile_hard(terms).
  # hard = false -> profiles are replaced by the first value of
  #                 ont.check_ids(terms); the second value is reported on STDERR
  #                 as unknown codes.
  # Records left without terms are removed from the cohort.
  # Returns [unique rejected terms, ids of removed records].
  def check(hard = false)
    ont = @@ont[Cohort.act_ont]
    rejected_terms = []
    rejected_recs = []
    @profiles.each do |id, terms|
      if hard
        terms = ont.clean_profile_hard(terms)
        rejec_terms = []
      else
        terms, rejec_terms = ont.check_ids(terms)
      end
      unless rejec_terms.empty?
        STDERR.puts "WARNING: record #{id} has the unknown CODES '#{rejec_terms.join(',')}'. Codes removed."
        rejected_terms.concat(rejec_terms)
      end
      if terms.empty?
        rejected_recs << id
      else
        @profiles[id] = terms # reassigning an existing key is safe while iterating
      end
    end
    rejected_recs.each { |id| delete(id) }
    return rejected_terms.uniq, rejected_recs
  end

  # Loads the cohort profiles into the ontology registered as ont_id.
  def link2ont(ont_id)
    @@ont[ont_id].load_profiles(@profiles)
  end

  def get_profile_redundancy
    ont = @@ont[Cohort.act_ont]
    profile_sizes, parental_terms_per_profile = ont.get_profile_redundancy
    return profile_sizes, parental_terms_per_profile
  end

  # options is forwarded as keyword arguments to the ontology method.
  def get_profiles_terms_frequency(options = {})
    # https://www.ruby-lang.org/en/news/2019/12/12/separation-of-positional-and-keyword-arguments-in-ruby-3-0/
    @@ont[Cohort.act_ont].get_profiles_terms_frequency(**options)
  end

  def compute_term_list_and_childs()
    @@ont[Cohort.act_ont].compute_term_list_and_childs()
  end

  # Returns the two distribution tables of the active ontology with a header
  # row prepended to each of them.
  def get_profile_ontology_distribution_tables()
    ont = @@ont[Cohort.act_ont]
    ontology_levels, distribution_percentage = ont.get_profile_ontology_distribution_tables
    ontology_levels.unshift(["level", "ontology", "cohort"])
    distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
    return ontology_levels, distribution_percentage
  end

  def get_ic_analysis()
    ont = @@ont[Cohort.act_ont]
    onto_ic, freq_ic = ont.get_observed_ics_by_onto_and_freq # IC for TERMS
    onto_ic_profile, freq_ic_profile = ont.get_profiles_resnik_dual_ICs # IC for PROFILES
    return onto_ic, freq_ic, onto_ic_profile, freq_ic_profile
  end

  def get_profiles_mean_size
    @@ont[Cohort.act_ont].get_profiles_mean_size
  end

  def get_profile_length_at_percentile(perc = 50, increasing_sort: false)
    # perc is passed positionally; the former `perc=perc` was an accidental
    # assignment inside the argument list.
    @@ont[Cohort.act_ont].get_profile_length_at_percentile(perc, increasing_sort: increasing_sort)
  end

  def get_dataset_specifity_index(type)
    @@ont[Cohort.act_ont].get_dataset_specifity_index(type)
  end

  # options is forwarded as keyword arguments to the ontology method.
  def compare_profiles(options = {})
    @@ont[Cohort.act_ont].compare_profiles(**options)
  end

  # Merges the variants of every record into the cohort-wide index
  # (equivalent to process_patient_data).
  def index_vars
    each_var do |id, var|
      @var_idx.merge(var, id)
    end
  end

  def get_vars_sizes(summary = false)
    summary ? @var_idx.get_summary_sizes : @var_idx.get_sizes
  end

  def generate_cluster_regions(meth, tag, lim)
    @var_idx.generate_cluster_regions(meth, tag, lim)
  end

  # Writes the cohort as a tab separated file with one line per variant.
  # Records without variants get a single line with '-' coordinates.
  # mode: :default -> id, terms, chr, start, stop (no header line)
  #       :paco    -> id, chr, start, stop, terms (with header line)
  #       Strings are accepted as well. Any other value aborts the program
  #       before the output file is touched.
  # translate: when true the terms are replaced by the first value returned by
  #            translate_ids of the active ontology.
  def save(output_file, mode = :default, translate = false)
    mode = mode.to_sym if mode.respond_to?(:to_sym)
    abort('Wrong save mode option, please try default or paco') unless %i[default paco].include?(mode)
    ont = @@ont[Cohort.act_ont]
    File.open(output_file, 'w') do |f|
      # The header used to be compared against the String 'paco' and was
      # therefore never written for the :paco mode.
      f.puts "id\tchr\tstart\tstop\tterms" if mode == :paco
      @profiles.each do |id, terms|
        terms, _rejected = ont.translate_ids(terms) if translate
        joined_terms = terms.join('|')
        variant_rows(id).each do |var|
          coords = var.join("\t")
          if mode == :default
            f.puts "#{id}\t#{joined_terms}\t#{coords}"
          else
            f.puts "#{id}\t#{coords}\t#{joined_terms}"
          end
        end
      end
    end
  end

  # Writes one JSON file per record ("<id>.json") into output_folder.
  # Each file holds the metadata, the subject (id and sex, 'UNKNOWN_SEX' when
  # no :sex extra attribute was stored) and one phenotypic feature per term,
  # labelled with translate_id of the active ontology.
  # vcf_index: optional id => path mapping; records found in it also get an
  #            htsFiles entry tagged with genome_assembly.
  def export_phenopackets(output_folder, genome_assembly, vcf_index: nil)
    ont = @@ont[Cohort.act_ont]
    metaData = {
      "createdBy" => "PETS",
      "resources" => [{
        "id" => "hp",
        "name" => "human phenotype ontology",
        "namespacePrefix" => "HP",
        "url" => "http://purl.obolibrary.org/obo/hp.owl",
        # "version" => "2018-03-08",
        "iriPrefix" => "http://purl.obolibrary.org/obo/HP_"
      }]
    }

    @profiles.each do |id, terms|
      phenopacket = { metaData: metaData }
      query_sex = @extra_attr.dig(id, :sex)
      phenopacket[:subject] = {
        id: id,
        sex: query_sex.nil? ? 'UNKNOWN_SEX' : query_sex
      }
      phenopacket[:phenotypicFeatures] = terms.map do |term|
        {
          type: { id: term, label: ont.translate_id(term) },
          classOfOnset: { "id" => "HP:0003577", "label" => "Congenital onset" }
        }
      end
      if !vcf_index.nil? && vcf_index.include?(id)
        phenopacket[:htsFiles] = [{
          "uri" => "file:/" + vcf_index[id],
          "description" => id,
          "htsFormat" => "VCF",
          "genomeAssembly" => genome_assembly,
          "individualToSampleIdentifiers" => { "patient1" => id }
        }]
      end
      File.write(File.join(output_folder, id.to_s + ".json"), JSON.pretty_generate(phenopacket))
    end
  end

  private

  # Returns the variants of a record as [chr, start, stop] rows, or a single
  # ['-', '-', '-'] row when the record has no variants.
  def variant_rows(id)
    id_variants = @vars[id]
    return [['-', '-', '-']] if id_variants.nil? || id_variants.length == 0

    rows = []
    id_variants.each do |chr, reg|
      rows << [chr, reg[:start], reg[:stop]]
    end
    rows
  end
end
@@ -0,0 +1,30 @@
1
# Command-line options shared by several scripts.
# This fragment defines nothing by itself: it expects an `options` hash and an
# `opts` object responding to `on` to be in scope where it is evaluated
# (presumably an OptionParser block - confirm against the scripts loading it).
# Each option stores its default in `options` first and then registers the
# switch that overrides it.

# Column holding the chromosome of each mutation.
options[:chromosome_col] = nil
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
  options[:chromosome_col] = data
end

# Column holding the patient id.
options[:id_col] = nil
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
  options[:id_col] = data
end

# Column holding the end coordinate of each mutation.
options[:end_col] = nil
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
  options[:end_col] = data
end

options[:genome_assembly] = 'hg38'
opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
  options[:genome_assembly] = data
end

# A header line is expected by default; passing -H switches that off.
# The help text used to claim the opposite of what the flag does.
options[:header] = true
opts.on("-H", "--header", "Set if the file has no line header. By default a line header is expected") do
  options[:header] = false
end

# Column holding the patient sex.
options[:sex_col] = nil
opts.on("-x", "--sex_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient sex") do |data|
  options[:sex_col] = data
end
@@ -0,0 +1,8 @@
1
# The file requiring this one must define the ROOT_PATH constant beforehand.
# Every location below is resolved against the directory one level above ROOT_PATH.
COMMON_OPTPARSE = File.expand_path(File.join('..', 'lib', 'pets', 'common_optparse.rb'), ROOT_PATH)
REPORT_FOLDER = File.expand_path(File.join('..', 'templates'), ROOT_PATH)
EXTERNAL_DATA = File.expand_path(File.join('..', 'external_data'), ROOT_PATH)
EXTERNAL_CODE = File.expand_path(File.join('..', 'external_code'), ROOT_PATH)
# Data files looked up inside EXTERNAL_DATA.
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')