pets 0.2.3 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
@@ -0,0 +1,309 @@
1
+ require 'json'
2
+ require 'semtools'
3
+
4
+ class Cohort
5
+ @@ont = {}
6
+ class << self # https://www.ruby-forum.com/t/attr-accessor-for-class-variable/136693
7
+ attr_accessor :act_ont # Which ontology use for ont related operations
8
+ end
9
+
10
+ attr_accessor :profiles
11
+
12
+ def self.get_ontology(ont_id)
13
+ return @@ont[ont_id]
14
+ end
15
+
16
+ def self.load_ontology(ont_name, ont_file, excluded_terms_file = nil) # Build an Ontology from ont_file and register it in @@ont under ont_name
17
+ ont = nil
18
+ if !ont_file.include?('.json') # substring test: any path containing '.json' takes the else branch
19
+ if !excluded_terms_file.nil?
20
+ ont = Ontology.new(file: ont_file, load_file: true, removable_terms: read_excluded_ont_file(excluded_terms_file))
21
+ else
22
+ ont = Ontology.new(file: ont_file, load_file: true)
23
+ end
24
+ else
25
+ ont = Ontology.new # NOTE(review): excluded_terms_file is not used on this branch
26
+ ont.read(ont_file)
27
+ end
28
+ @@ont[ont_name] = ont # replaces any ontology previously stored under the same name
29
+ end
30
+
31
+ def self.read_excluded_ont_file(file)
32
+ excluded_hpo = []
33
+ File.open(file).each do |line|
34
+ excluded_hpo << line.chomp
35
+ end
36
+ return excluded_hpo
37
+ end
38
+
39
+ def initialize()
40
+ @profiles = {}
41
+ @vars = {}
42
+ @extra_attr = {}
43
+ @var_idx = Genomic_Feature.new([])
44
+ end
45
+
46
+ def add_record(rec, extra_attr = nil) #[id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
47
+ id, profile, vars = rec
48
+ @profiles[id] = profile.map{|t| t.to_sym} if !profile.nil?
49
+ @extra_attr[id] = extra_attr if !extra_attr.nil?
50
+ add_gen_feat(id, vars) if !vars.nil?
51
+ end
52
+
53
+ def delete(id)
54
+ @profiles.delete(id)
55
+ @vars.delete(id)
56
+ end
57
+
58
+ def select_by_profile!
59
+ @profiles.select!{|id, profile| yield(id, profile)}
60
+ current_ids = @profiles.keys
61
+ @vars.select!{|id, var| current_ids.include?(id)}
62
+ end
63
+
64
+ def select_by_var!
65
+ @vars.select!{|id, profile| yield(id, profile)}
66
+ current_ids = @vars.keys
67
+ @profiles.select!{|id, var| current_ids.include?(id)}
68
+ end
69
+
70
+ def filter_by_term_number(n_terms)
71
+ select_by_profile!{|id, profile| profile.length >= n_terms}
72
+ end
73
+
74
+ def remove_incomplete_records # Remove records that lack either variants or phenotype terms
75
+ ids_with_terms = @profiles.keys
76
+ ids_with_vars = []
77
+ @vars.each{|id, regs| ids_with_vars << id if regs.length > 0} # an id with an empty feature set counts as having no variants
78
+ full_ids = ids_with_vars & ids_with_terms # ids present in both collections
79
+ @profiles.select!{|id, prof| full_ids.include?(id)}
80
+ @vars.select!{|id, var| full_ids.include?(id)}
81
+ end
82
+
83
+ def add_gen_feat(id, feat_array) # [[chr1, start1, stop1],[chr1, start1, stop1]]
84
+ @vars[id] = Genomic_Feature.new(feat_array)
85
+ end
86
+
87
+ def get_profile(id)
88
+ return @profiles[id]
89
+ end
90
+
91
+ def get_vars(id)
92
+ return @vars[id]
93
+ end
94
+
95
+ def each_profile()
96
+ @profiles.each do |id, profile|
97
+ yield(id, profile)
98
+ end
99
+ end
100
+
101
+ def each_var()
102
+ @vars.each do |id, var_info|
103
+ yield(id, var_info)
104
+ end
105
+ end
106
+
107
+ def get_general_profile(thr=0) # Terms whose count divided by the number of profiles is >= thr, cleaned by the active ontology. TODO: move functionality to semtools
108
+ term_count = Hash.new(0)
109
+ each_profile do |id, prof|
110
+ prof.each do |term|
111
+ term_count[term] += 1 # counts every occurrence, so a term repeated inside one profile is counted more than once
112
+ end
113
+ end
114
+ records = @profiles.length
115
+ general_profile = []
116
+ term_count.each do |term, count|
117
+ general_profile << term if count.fdiv(records) >= thr # with the default thr of 0 every observed term is kept
118
+ end
119
+ ont = @@ont[Cohort.act_ont] # ontology selected through Cohort.act_ont; must have been loaded beforehand
120
+ return ont.clean_profile_hard(general_profile)
121
+ end
122
+
123
+ def check(hard=false) # Clean every profile with the active ontology and drop records left without terms. Returns [unique rejected terms, rejected record ids]. OLD format_patient_data
124
+ ont = @@ont[Cohort.act_ont]
125
+ rejected_terms = []
126
+ rejected_recs = []
127
+ @profiles.each do |id, terms|
128
+ if hard
129
+ terms = ont.clean_profile_hard(terms)
130
+ rejec_terms = [] # the hard path does not report which terms were discarded
131
+ else
132
+ terms, rejec_terms = ont.check_ids(terms) # kept terms and the codes the ontology rejected
133
+ end
134
+ if !rejec_terms.empty?
135
+ STDERR.puts "WARNING: record #{id} has the unknown CODES '#{rejec_terms.join(',')}'. Codes removed."
136
+ rejected_terms.concat(rejec_terms)
137
+ end
138
+ if terms.empty?
139
+ rejected_recs << id # removed after the loop so the hash is not resized while iterating
140
+ else
141
+ @profiles[id] = terms # existing key, only its value is replaced
142
+ end
143
+ end
144
+ @profiles.select!{|id, record| !rejected_recs.include?(id)}
145
+ @vars.select!{|id, record| !rejected_recs.include?(id)} # keep variants in sync with the surviving profiles
146
+ return rejected_terms.uniq, rejected_recs
147
+ end
148
+
149
+ def link2ont(ont_id)
150
+ @@ont[ont_id].load_profiles(@profiles)
151
+ end
152
+
153
+ def get_profile_redundancy
154
+ ont = @@ont[Cohort.act_ont]
155
+ profile_sizes, parental_terms_per_profile = ont.get_profile_redundancy
156
+ return profile_sizes, parental_terms_per_profile
157
+ end
158
+
159
+ def get_profiles_terms_frequency(options={})
160
+ ont = @@ont[Cohort.act_ont]
161
+ term_stats = ont.get_profiles_terms_frequency(**options) #https://www.ruby-lang.org/en/news/2019/12/12/separation-of-positional-and-keyword-arguments-in-ruby-3-0/
162
+ return term_stats
163
+ end
164
+
165
+ def compute_term_list_and_childs()
166
+ ont = @@ont[Cohort.act_ont]
167
+ suggested_childs, term_with_childs_ratio = ont.compute_term_list_and_childs()
168
+ end
169
+
170
+ def get_profile_ontology_distribution_tables()
171
+ ont = @@ont[Cohort.act_ont]
172
+ ontology_levels, distribution_percentage = ont.get_profile_ontology_distribution_tables
173
+ ontology_levels.unshift(["level", "ontology", "cohort"])
174
+ distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
175
+ return ontology_levels, distribution_percentage
176
+ end
177
+
178
+ def get_ic_analysis()
179
+ ont = @@ont[Cohort.act_ont]
180
+ onto_ic, freq_ic = ont.get_observed_ics_by_onto_and_freq # IC for TERMS
181
+ onto_ic_profile, freq_ic_profile = ont.get_profiles_resnik_dual_ICs # IC for PROFILES
182
+ return onto_ic, freq_ic, onto_ic_profile, freq_ic_profile
183
+ end
184
+
185
+ def get_profiles_mean_size
186
+ ont = @@ont[Cohort.act_ont]
187
+ profile_mean_size = ont.get_profiles_mean_size
188
+ return profile_mean_size
189
+ end
190
+
191
+ def get_profile_length_at_percentile(perc=50, increasing_sort: false)
192
+ ont = @@ont[Cohort.act_ont]
193
+ length_percent = ont.get_profile_length_at_percentile(perc=perc, increasing_sort: increasing_sort)
194
+ return length_percent
195
+ end
196
+
197
+ def get_dataset_specifity_index(type)
198
+ ont = @@ont[Cohort.act_ont]
199
+ dsi = ont.get_dataset_specifity_index(type)
200
+ return dsi
201
+ end
202
+
203
+ def compare_profiles(options={})
204
+ ont = @@ont[Cohort.act_ont]
205
+ similarities = ont.compare_profiles(**options)
206
+ return similarities
207
+ end
208
+
209
+ def index_vars # equivalent to process_patient_data
210
+ each_var do |id, var|
211
+ @var_idx.merge(var, id)
212
+ end
213
+ end
214
+
215
+ def get_vars_sizes(summary=false)
216
+ if summary
217
+ return @var_idx.get_summary_sizes
218
+ else
219
+ return @var_idx.get_sizes
220
+ end
221
+ end
222
+
223
+ def generate_cluster_regions(meth, tag, lim)
224
+ @var_idx.generate_cluster_regions(meth, tag, lim)
225
+ end
226
+
227
+ def save(output_file, mode = :default, translate = false)
228
+ File.open(output_file, 'w') do |f|
229
+ f.puts "id\tchr\tstart\tstop\tterms" if mode == 'paco'
230
+ ont = @@ont[Cohort.act_ont]
231
+ @profiles.each do |id, terms|
232
+ terms, rejected = ont.translate_ids(terms) if translate
233
+ id_variants = @vars[id]
234
+ variants = []
235
+ if id_variants.nil? || id_variants.length == 0
236
+ variants << ['-', '-', '-']
237
+ else
238
+ id_variants.each do |chr, reg|
239
+ variants << [chr, reg[:start], reg[:stop]]
240
+ end
241
+ end
242
+ variants.each do |var|
243
+ if mode == :default
244
+ f.puts "#{id}\t#{terms.join('|')}\t#{var.join("\t")}"
245
+ elsif mode == :paco
246
+ f.puts "#{id}\t#{var.join("\t")}\t#{terms.join('|')}"
247
+ else
248
+ abort('Wrong save mode option, please try default or paco')
249
+ end
250
+ end
251
+ end
252
+ end
253
+ end
254
+
255
+ def export_phenopackets(output_folder, genome_assembly, vcf_index: nil) # Write one <id>.json file per profile into output_folder; vcf_index, when given, is looked up by record id for the VCF path
256
+ ont = @@ont[Cohort.act_ont]
257
+ metaData = { # the same hash object is attached to every phenopacket
258
+ "createdBy" => "PETS",
259
+ "resources" => [{
260
+ "id" => "hp",
261
+ "name" => "human phenotype ontology",
262
+ "namespacePrefix" => "HP",
263
+ "url" => "http://purl.obolibrary.org/obo/hp.owl",
264
+ # "version" => "2018-03-08",
265
+ "iriPrefix" => "http://purl.obolibrary.org/obo/HP_"
266
+ }]
267
+ }
268
+
269
+ @profiles.each do |id, terms|
270
+ phenopacket = {metaData: metaData}
271
+ query_sex = @extra_attr.dig(id, :sex) # nil when the record has no extra attributes or no :sex entry
272
+ sex = query_sex.nil? ? 'UNKNOWN_SEX' : query_sex
273
+ phenopacket[:subject] = {
274
+ id: id,
275
+ sex: sex
276
+ }
277
+ phenotypicFeatures = []
278
+ terms.each do |term|
279
+ term_name = ont.translate_id(term)
280
+ phenotypicFeatures << {
281
+ type: { id: term, label: term_name},
282
+ classOfOnset: {"id" => "HP:0003577", "label" => "Congenital onset"} # the same onset is written for every term
283
+ }
284
+ end
285
+ phenopacket[:phenotypicFeatures] = phenotypicFeatures
286
+ if !vcf_index.nil? && vcf_index.include?(id) # htsFiles is only added for records listed in vcf_index
287
+ htsFiles = []
288
+ htsFiles << {
289
+ "uri" => "file:/" + vcf_index[id], # presumably vcf_index[id] is a String path; verify against caller
290
+ "description" => id,
291
+ "htsFormat" => "VCF",
292
+ "genomeAssembly" => genome_assembly,
293
+ "individualToSampleIdentifiers" => { "patient1" => id } # the key is the literal "patient1" for every record
294
+ }
295
+ phenopacket[:htsFiles] = htsFiles
296
+ end
297
+ File.open(File.join(output_folder, id.to_s + ".json"), "w") { |f| f.write JSON.pretty_generate(phenopacket) }
298
+ id_variants = @vars[id] # NOTE(review): from here to the end of the loop, variants is built but never read or written anywhere
299
+ variants = []
300
+ if id_variants.nil? || id_variants.length == 0
301
+ variants << ['-', '-', '-']
302
+ else
303
+ id_variants.each do |chr, reg|
304
+ variants << [chr, reg[:start], reg[:stop]]
305
+ end
306
+ end
307
+ end
308
+ end
309
+ end
@@ -0,0 +1,30 @@
1
+ options[:chromosome_col] = nil # this fragment relies on `options` and `opts` already being in scope where it is loaded
2
+ opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
3
+ options[:chromosome_col] = data # stored as received from the parser; no conversion is done here
4
+ end
5
+
6
+ options[:id_col] = nil
7
+ opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
8
+ options[:id_col] = data # the --pat_id_col flag is stored under :id_col
9
+ end
10
+
11
+ options[:end_col] = nil
12
+ opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
13
+ options[:end_col] = data
14
+ end
15
+
16
+ options[:genome_assembly] = 'hg38'
17
+ opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
18
+ options[:genome_assembly] = data # the value is not checked against the assemblies listed in the help text
19
+ end
20
+
21
+ options[:header] = true
22
+ #chr\tstart\tstop
23
+ opts.on("-H", "--header", "Set if the file has a line header. Default true") do # NOTE(review): passing -H sets options[:header] to false, although the help text reads as if the flag enables the header; confirm the intended wording
24
+ options[:header] = false
25
+ end
26
+
27
+ options[:sex_col] = nil
28
+ opts.on("-x", "--sex_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient sex") do |data|
29
+ options[:sex_col] = data
30
+ end
@@ -0,0 +1,8 @@
1
+ # ROOT_PATH must be defined by the file that requires this one; every path below is resolved from it
2
+ COMMON_OPTPARSE = File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets', 'common_optparse.rb')) # path to the shared option definitions
3
+ REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
4
+ EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
5
+ EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
6
+ HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json') # the three data files below live inside EXTERNAL_DATA
7
+ MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
8
+ IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')