pets 0.2.3 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +68 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +102 -150
- data/bin/get_gen_features.rb +146 -0
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +8 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +86 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +16 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +172 -424
- data/lib/pets/cohort.rb +309 -0
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +8 -0
- data/lib/pets/generalMethods.rb +29 -319
- data/lib/pets/genomic_features.rb +240 -0
- data/lib/pets/io.rb +481 -0
- data/lib/pets/parsers/cohort_parser.rb +111 -0
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +9 -0
- data/pets.gemspec +7 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/cohort_report.erb +5 -7
- data/templates/evidence_profile.erb +20 -4
- data/templates/patient_report.erb +1 -1
- metadata +96 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f1d5c3ad0cb57b26b2c67e02b38a282139965472ded083acf0d1fcae48c0fec
|
4
|
+
data.tar.gz: 8b34f2440afe74f0b9c0e6024c2a05daee4a7be0efd0c6a3d80aef49673c7c7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3e9bc8559bb3f3e0c9a7ce1e0658645f54afc83fc49e7415cc3177576af975e4f7268a1f37e017d83fd88042197f102a4290cc29d7b0a16e12ec4964feea39d
|
7
|
+
data.tar.gz: a2aa8fe161b52d2f3e86e0d04f2a1a762298de95582098e553287ad905ff7b97eae6f96893de8bd78676c01b412a16973b1b53cf6ebbd9cb48e49c929c7f1d74
|
data/Gemfile
CHANGED
@@ -4,3 +4,5 @@ source "https://rubygems.org"
|
|
4
4
|
gemspec
|
5
5
|
semtools_dev_path = File.expand_path('~/dev_gems/semtools')
|
6
6
|
gem "semtools", github: "seoanezonjic/semtools", branch: "master" if Dir.exists?(semtools_dev_path)
|
7
|
+
expcalc_dev_path = File.expand_path('~/dev_gems/expcalc')
|
8
|
+
gem "expcalc", github: "seoanezonjic/expcalc", branch: "master" if Dir.exist?(expcalc_dev_path)
|
data/README.md
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
-
#
|
1
|
+
# PETS
|
2
2
|
|
3
|
-
|
3
|
+
PETS (Patient Exploration Tools Suite) includes three different tools for the analysis of cohorts of patients with pathological phenotypes described in terms of the Human Phenotype Ontology (HPO) and the clinically determined positions of their genomic variants.
|
4
4
|
|
5
|
-
|
5
|
+
It can (1) determine the quality of information within a patient cohort with Cohort Analyzer (coPatReporter.rb); (2) associate genomic regions with their pathological phenotypes based on the cohort data with Reg2Phen (reg2phen.rb), and (3) predict the possible genetic variants that cause the clinically observed pathological phenotypes using phenotype-genotype association values with Phen2Reg (phen2reg.rb).
|
6
|
+
|
7
|
+
This tool has been developed to be used by the clinical community, to facilitate patient characterisation, help identify where data quality can be improved within a cohort and help diagnose patients with complex disease. Please cite us as Rojano E., Seoane-Zonjic P., Jabato F.M., Perkins J.R., Ranea J.A.G. (2020) Comprehensive Analysis of Patients with Undiagnosed Genetic Diseases Using the Patient Exploration Tools Suite (PETS). In: Rojas I., Valenzuela O., Rojas F., Herrera L., Ortuño F. (eds) Bioinformatics and Biomedical Engineering. IWBBIO 2020. Lecture Notes in Computer Science, vol 12108. Springer, Cham. https://doi.org/10.1007/978-3-030-45385-5_69.
|
6
8
|
|
7
|
-
Associations between pathological phenotypes and genomic regions (using genomic coordinates from GRCh37 human assembly) are previously calculated using NetAnalyzer (https://rubygems.org/gems/NetAnalyzer). Please cite us as Rojano E. et al (2017). Revealing the Relationship Between Human Genome Regions and Pathological Phenotypes Through Network Analysis. LNCS, 10208:197-207.
|
8
9
|
|
9
10
|
## Installation
|
10
11
|
|
@@ -22,9 +23,82 @@ Or install it yourself as:
|
|
22
23
|
|
23
24
|
$ gem install pets
|
24
25
|
|
26
|
+
|
27
|
+
After installing the PETS gem, its R dependencies must also be installed. To do so, run the following command:
|
28
|
+
|
29
|
+
$ install_deps.rb
|
30
|
+
|
25
31
|
## Usage
|
26
32
|
|
27
|
-
|
33
|
+
### 1) Cohort Analyzer
|
34
|
+
|
35
|
+
Cohort Analyzer measures the phenotyping quality of patient and disease cohorts by calculating multiple statistics to give a general overview of the cohort status in terms of the depth and breadth of phenotyping. It can work with cohorts defined exclusively with HPO terms or with both HPO terms and genomic coordinates.
|
36
|
+
|
37
|
+
#### Basic usage of Cohort Analyzer:
|
38
|
+
|
39
|
+
We provide an example of use of Cohort Analyzer with a dataset from Vulto-van Silfhout, A.T.; Hehir-Kwa, J.Y.; van Bon, B.W.M.; Schuurs-Hoeijmakers, J.H.M.; Meader, S.; Hellebrekers, C.J.M.; Thoonen, I.J.M.; de Brouwer, A.P.M.; Brunner, H.G.; Webber, C.; Pfundt, R.; de Leeuw, N.; De Vries, B.B.A. Clinical Significance of De Novo and Inherited Copy-Number Variation. Human Mutation 2013, 34, 1679–1687. doi:10.1002/humu.22442.
|
40
|
+
|
41
|
+
This dataset includes de novo and inherited CNVs associated with phenotypes related to intellectual disability/developmental delay occurring alongside multiple congenital anomalies. An example input file is available in the example_datasets folder within this repository, and the command to run its analysis is provided below:
|
42
|
+
|
43
|
+
```
|
44
|
+
coPatReporter.rb -i hummu_congenital_full_dataset.txt -o results -p phenotypes -c chr -d patient_id -s start -e stop -m lin
|
45
|
+
```
|
46
|
+
|
47
|
+
Where:
|
48
|
+
|
49
|
+
- -i -> Input cohort, a tab-separated file with patient identifiers and the list of HPO terms characterised for each patient.
|
50
|
+
- -o -> Output path.
|
51
|
+
- -p -> Column name with phenotypes.
|
52
|
+
- -c -> Column name with chromosomes.
|
53
|
+
- -d -> Column name with patient identifiers.
|
54
|
+
- -s -> Column name with start genomic coordinate.
|
55
|
+
- -e -> Column name with final genomic coordinate.
|
56
|
+
- -m -> Semantic similarity measure method.
|
57
|
+
- -C -> Maximum number of clusters to display.
|
58
|
+
|
59
|
+
Further information about all the Cohort Analyzer options can be displayed as follows:
|
60
|
+
|
61
|
+
```
|
62
|
+
coPatReporter.rb --help
|
63
|
+
```
|
64
|
+
|
65
|
+
### 2) Reg2Phen
|
66
|
+
|
67
|
+
This tool is a search engine that finds phenotypes associated with genomic regions or genes of interest. It uses two input files: one with previously calculated phenotype-genotype associations, and one with a list of genomic coordinates or gene identifiers for which the associated HPO terms are to be found. We provide an example of use in the example_datasets folder within this repository, and the command to run its analysis is provided below:
|
68
|
+
|
69
|
+
```
|
70
|
+
reg2phen.rb -t associations_file.txt -p genes.txt -b hpo_file -P -g -H -o results/patient1Genes.txt -F $current/results/patient1Genes.html
|
71
|
+
```
|
72
|
+
Where:
|
73
|
+
|
74
|
+
- -t -> Input phenotype-genotype associations file.
|
75
|
+
- -p -> List of genes for which to find the associated HPO terms.
|
76
|
+
- -b -> HPO obo file.
|
77
|
+
- -P -> Transform association values into P-values.
|
78
|
+
- -g -> Set if gene identifiers are provided instead of genomic coordinates.
|
79
|
+
- -H -> Activate HTML reporting.
|
80
|
+
- -o -> Output file (results/patient1Genes.txt in the example above).
|
81
|
+
- -F -> Path of the HTML report file (used together with -H).
|
82
|
+
|
83
|
+
Associations between pathological phenotypes and genomic regions provided in this example were calculated with NetAnalyzer (https://rubygems.org/gems/NetAnalyzer, Rojano E. et al (2017). Revealing the Relationship Between Human Genome Regions and Pathological Phenotypes Through Network Analysis. LNCS, 10208:197-207) using randomised DECIPHER data (coordinates in the GRCh37 human genome assembly) and the hypergeometric association method.
|
84
|
+
|
85
|
+
### 3) Phen2Reg
|
86
|
+
|
87
|
+
Phen2Reg analyses the pathological phenotypes observed in a patient and predicts putative causal genomic regions. As in the case of Reg2Phen, it uses phenotype-genotype associations previously calculated. We provide an example of use in the example_datasets folder within this repository and the code to execute its analysis is provided below:
|
88
|
+
|
89
|
+
```
|
90
|
+
phen2reg.rb -t associations_file.txt -p example_patient_hpos.txt -i hpo2ci.txt -f hpo_file -T -Q > single_phens.txt
|
91
|
+
```
|
92
|
+
Where:
|
93
|
+
|
94
|
+
- -t -> Input phenotype-genotype associations file.
|
95
|
+
- -p -> List of HPOs characterised for a patient.
|
96
|
+
- -i -> HPO information coefficients (IC) file.
|
97
|
+
- -f -> HPO obo file.
|
98
|
+
- -T -> Deactivate HTML reporting.
|
99
|
+
- -Q -> Deactivate quality control.
|
100
|
+
|
101
|
+
Results are saved in the single_phens.txt output file.
|
28
102
|
|
29
103
|
## Development
|
30
104
|
|
data/bin/coPatReporter.rb
CHANGED
@@ -1,45 +1,13 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
|
3
3
|
ROOT_PATH = File.dirname(__FILE__)
|
4
|
-
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
|
5
|
-
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
|
6
|
-
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
|
7
|
-
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
|
8
|
-
IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
|
9
4
|
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
10
5
|
|
11
6
|
require 'benchmark'
|
12
7
|
require 'parallel'
|
13
8
|
require 'optparse'
|
14
|
-
require 'csv'
|
15
|
-
require 'npy'
|
16
|
-
require 'generalMethods.rb'
|
17
|
-
require 'coPatReporterMethods.rb'
|
18
9
|
require 'report_html'
|
19
|
-
require '
|
20
|
-
|
21
|
-
#Expand class (semtools modifications if necessary):
|
22
|
-
class Ontology
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
##########################
|
27
|
-
# FUNCTIONS
|
28
|
-
##########################
|
29
|
-
|
30
|
-
def translate_codes(clusters, hpo)
|
31
|
-
translated_clusters = []
|
32
|
-
clusters.each do |clusterID, num_of_pats, patientIDs_ary, patient_hpos_ary|
|
33
|
-
translate_codes = patient_hpos_ary.map{|patient_hpos| patient_hpos.map{|hpo_code| hpo.translate_id(hpo_code)}}
|
34
|
-
translated_clusters << [clusterID,
|
35
|
-
num_of_pats,
|
36
|
-
patientIDs_ary,
|
37
|
-
patient_hpos_ary,
|
38
|
-
translate_codes
|
39
|
-
]
|
40
|
-
end
|
41
|
-
return translated_clusters
|
42
|
-
end
|
10
|
+
require 'pets'
|
43
11
|
|
44
12
|
##########################
|
45
13
|
#OPT-PARSER
|
@@ -69,9 +37,14 @@ OptionParser.new do |opts|
|
|
69
37
|
options[:chromosome_col] = data
|
70
38
|
end
|
71
39
|
|
72
|
-
options[:
|
40
|
+
options[:id_col] = nil
|
73
41
|
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
|
74
|
-
options[:
|
42
|
+
options[:id_col] = data
|
43
|
+
end
|
44
|
+
|
45
|
+
options[:detailed_clusters] = false
|
46
|
+
opts.on("-D", "--detailed_clusters", "Show detiled cluster comparation using heatmaps. Default false") do
|
47
|
+
options[:detailed_clusters] = true
|
75
48
|
end
|
76
49
|
|
77
50
|
options[:excluded_hpo] = nil
|
@@ -120,9 +93,9 @@ OptionParser.new do |opts|
|
|
120
93
|
options[:clustering_methods] = data.split(',')
|
121
94
|
end
|
122
95
|
|
123
|
-
options[:
|
96
|
+
options[:names] = false
|
124
97
|
opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
|
125
|
-
options[:
|
98
|
+
options[:names] = true
|
126
99
|
end
|
127
100
|
|
128
101
|
options[:output_file] = nil
|
@@ -135,14 +108,14 @@ OptionParser.new do |opts|
|
|
135
108
|
options[:hpo_file] = value
|
136
109
|
end
|
137
110
|
|
138
|
-
options[:
|
111
|
+
options[:ont_col] = nil
|
139
112
|
opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
|
140
|
-
options[:
|
113
|
+
options[:ont_col] = data
|
141
114
|
end
|
142
115
|
|
143
|
-
options[:
|
116
|
+
options[:separator] = '|'
|
144
117
|
opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
|
145
|
-
options[:
|
118
|
+
options[:separator] = data
|
146
119
|
end
|
147
120
|
|
148
121
|
options[:start_col] = nil
|
@@ -165,7 +138,15 @@ OptionParser.new do |opts|
|
|
165
138
|
options[:threads] = data.to_i
|
166
139
|
end
|
167
140
|
|
141
|
+
options[:reference_profiles] = nil
|
142
|
+
opts.on("--reference_profiles PATH", "Path to file tabulated file with first column as id profile and second column with ontology terms separated by separator. ") do |opt|
|
143
|
+
options[:reference_profiles] = opt
|
144
|
+
end
|
168
145
|
|
146
|
+
options[:sim_thr] = nil
|
147
|
+
opts.on("--sim_thr FLOAT", "Keep pairs with similarity value >= FLOAT. ") do |opt|
|
148
|
+
options[:sim_thr] = opt.to_f
|
149
|
+
end
|
169
150
|
|
170
151
|
opts.on_tail("-h", "--help", "Show this message") do
|
171
152
|
puts opts
|
@@ -203,80 +184,68 @@ cluster_ic_data_file = File.join(temp_folder, 'cluster_ic_data.txt')
|
|
203
184
|
cluster_chromosome_data_file = File.join(temp_folder, 'cluster_chromosome_data.txt')
|
204
185
|
coverage_to_plot_file = File.join(temp_folder, 'coverage_data.txt')
|
205
186
|
sor_coverage_to_plot_file = File.join(temp_folder, 'sor_coverage_data.txt')
|
187
|
+
ronto_file = File.join(temp_folder, 'hpo_freq_colour')
|
188
|
+
|
206
189
|
|
207
190
|
Dir.mkdir(temp_folder) if !File.exists?(temp_folder)
|
208
191
|
|
209
192
|
hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
|
210
|
-
hpo
|
193
|
+
Cohort.load_ontology(:hpo, hpo_file, options[:excluded_hpo])
|
194
|
+
Cohort.act_ont = :hpo
|
211
195
|
|
212
|
-
patient_data =
|
196
|
+
patient_data, rejected_hpos_L, rejected_patients_L = Cohort_Parser.load(options)
|
197
|
+
rejected_hpos_C, rejected_patients_C = patient_data.check
|
198
|
+
rejected_hpos = rejected_hpos_L | rejected_hpos_C
|
199
|
+
rejected_patients = rejected_patients_L + rejected_patients_C
|
200
|
+
File.open(rejected_file, 'w'){|f| f.puts (rejected_patients).join("\n")}
|
213
201
|
|
214
|
-
|
215
|
-
File.open(rejected_file, 'w'){|f| f.puts rejected_patients.join("\n")}
|
216
|
-
patient_data.select!{|pat_id, patient_record| !rejected_patients.include?(pat_id)}
|
217
|
-
patient_uniq_profiles, equivalence = get_uniq_hpo_profiles(patient_data)
|
218
|
-
hpo.load_profiles(patient_uniq_profiles)
|
202
|
+
patient_data.link2ont(Cohort.act_ont) # TODO: check if method load should call to this and use the semtools checking methods (take care to only remove invalid terms)
|
219
203
|
|
220
|
-
profile_sizes, parental_hpos_per_profile = get_profile_redundancy
|
221
|
-
|
222
|
-
|
223
|
-
|
204
|
+
profile_sizes, parental_hpos_per_profile = patient_data.get_profile_redundancy
|
205
|
+
patient_data.check(hard=true)
|
206
|
+
hpo_stats = patient_data.get_profiles_terms_frequency() # hpo NAME, freq
|
207
|
+
hpo_stats.each{ |stat| stat[1] = stat[1]*100}
|
208
|
+
File.open(hpo_frequency_file, 'w') do |f|
|
209
|
+
patient_data.get_profiles_terms_frequency(translate: false).each do |hpo_code, freq| # hpo CODE, freq
|
210
|
+
f.puts "#{hpo_code.to_s}\t#{freq}"
|
211
|
+
end
|
212
|
+
end
|
213
|
+
suggested_childs, fraction_terms_specific_childs = patient_data.compute_term_list_and_childs()
|
214
|
+
ontology_levels, distribution_percentage = patient_data.get_profile_ontology_distribution_tables()
|
215
|
+
onto_ic, freq_ic, onto_ic_profile, freq_ic_profile = patient_data.get_ic_analysis()
|
224
216
|
|
225
|
-
onto_ic, freq_ic = hpo.get_observed_ics_by_onto_and_freq # IC for TERMS
|
226
|
-
onto_ic_profile, freq_ic_profile = hpo.get_profiles_resnik_dual_ICs # IC for PROFILES
|
227
217
|
if options[:ic_stats] == 'freq_internal'
|
228
|
-
ic_file = ENV['ic_file']
|
229
|
-
ic_file = IC_FILE if ic_file.nil?
|
218
|
+
ic_file = !ENV['ic_file'].nil? ? ENV['ic_file'] : IC_FILE
|
230
219
|
freq_ic = load_hpo_ci_values(ic_file)
|
231
220
|
phenotype_ic = freq_ic
|
232
221
|
freq_ic_profile = {}
|
233
|
-
|
222
|
+
patient_data.each_profile do |pat_id, phenotypes|
|
234
223
|
freq_ic_profile[pat_id] = get_profile_ic(phenotypes, phenotype_ic)
|
235
224
|
end
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
phenotype_ic = onto_ic
|
241
|
-
end
|
225
|
+
elsif options[:ic_stats] == 'freq'
|
226
|
+
phenotype_ic = freq_ic
|
227
|
+
elsif options[:ic_stats] == 'onto'
|
228
|
+
phenotype_ic = onto_ic
|
242
229
|
end
|
243
|
-
clustered_patients = cluster_patients(patient_uniq_profiles, cohort_hpos, matrix_file, clustered_patients_file)
|
244
|
-
all_ics, profile_lengths, cluster_data_by_chromosomes, top_cluster_phenotypes, multi_chromosome_patients = process_clustered_patients(options, clustered_patients, patient_uniq_profiles, patient_data, equivalence, hpo, phenotype_ic, options[:pat_id_col])
|
245
|
-
get_patient_hpo_frequency(patient_uniq_profiles, hpo_frequency_file)
|
246
230
|
|
247
|
-
|
248
|
-
|
249
|
-
summary_stats << ['DsI for uniq HP terms', hpo.get_dataset_specifity_index('uniq')]
|
250
|
-
summary_stats << ['DsI for frequency weigthed HP terms', hpo.get_dataset_specifity_index('weigthed')]
|
231
|
+
clustered_patients = dummy_cluster_patients(patient_data.profiles, matrix_file, clustered_patients_file)
|
232
|
+
all_ics, prof_lengths, clust_by_chr, top_clust_phen, multi_chr_pats = process_dummy_clustered_patients(options, clustered_patients, patient_data, phenotype_ic)
|
251
233
|
|
252
|
-
|
253
|
-
hpo_stats.each{ |stat| stat[1] = stat[1]*100}
|
254
|
-
summary_stats << ['Number of unknown phenotypes', rejected_hpos.length]
|
234
|
+
summary_stats = get_summary_stats(patient_data, rejected_patients, hpo_stats, fraction_terms_specific_childs, rejected_hpos)
|
255
235
|
|
256
236
|
all_cnvs_length = []
|
257
237
|
if !options[:chromosome_col].nil?
|
258
|
-
summary_stats << ['Number of clusters with mutations accross > 1 chromosomes',
|
238
|
+
summary_stats << ['Number of clusters with mutations accross > 1 chromosomes', multi_chr_pats]
|
259
239
|
|
260
240
|
#----------------------------------
|
261
241
|
# Prepare data to plot coverage
|
262
242
|
#----------------------------------
|
263
243
|
if options[:coverage_analysis]
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
end
|
270
|
-
end
|
271
|
-
cnv_size_average = cnv_sizes.inject{ |sum, el| sum + el }.fdiv(cnv_sizes.length.to_f)
|
272
|
-
patients_by_cluster, sors = generate_cluster_regions(processed_patient_data, 'A', 0)
|
273
|
-
total_patients_sharing_sors = []
|
274
|
-
all_patients = patients_by_cluster.keys
|
275
|
-
all_patients.each do |identifier|
|
276
|
-
total_patients_sharing_sors << identifier.split('_i').first
|
277
|
-
end
|
278
|
-
all_cnvs_length = get_cnvs_length(patient_data)
|
279
|
-
|
244
|
+
patient_data.index_vars
|
245
|
+
all_cnvs_length = patient_data.get_vars_sizes(true)
|
246
|
+
cnv_size_average = get_mean_size(all_cnvs_length)
|
247
|
+
patients_by_cluster, sors = patient_data.generate_cluster_regions(:reg_overlap, 'A', 0)
|
248
|
+
|
280
249
|
###1. Process CNVs
|
281
250
|
raw_coverage, n_cnv, nt, pats_per_region = calculate_coverage(sors)
|
282
251
|
summary_stats << ['Average variant size', cnv_size_average.round(4)]
|
@@ -288,7 +257,7 @@ if !options[:chromosome_col].nil?
|
|
288
257
|
###2. Process SORs
|
289
258
|
raw_sor_coverage, n_sor, nt, pats_per_region = calculate_coverage(sors, options[:patients_filter] - 1)
|
290
259
|
summary_stats << ["Number of genome window shared by >= #{options[:patients_filter]} patients", n_sor]
|
291
|
-
summary_stats << ["Number of patients with at least 1 SOR",
|
260
|
+
summary_stats << ["Number of patients with at least 1 SOR", patients_by_cluster.length]
|
292
261
|
summary_stats << ['Nucleotides affected by mutations', nt]
|
293
262
|
# summary_stats << ['Patient average per region', pats_per_region]
|
294
263
|
sor_coverage_to_plot = get_final_coverage(raw_sor_coverage, options[:bin_size])
|
@@ -304,20 +273,16 @@ write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluat
|
|
304
273
|
write_arrays4scatterplot(onto_ic.values, freq_ic.values, hpo_ic_file, 'OntoIC', 'FreqIC') # hP terms
|
305
274
|
write_arrays4scatterplot(onto_ic_profile.values, freq_ic_profile.values, hpo_profile_ic_file, 'OntoIC', 'FreqIC') #HP profiles
|
306
275
|
write_arrays4scatterplot(profile_sizes, parental_hpos_per_profile, parents_per_term_file, 'ProfileSize', 'ParentTerms')
|
276
|
+
write_cluster_ic_data(all_ics, prof_lengths, cluster_ic_data_file, options[:clusters2graph])
|
307
277
|
|
308
278
|
system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{hpo_ic_file} -o #{File.join(temp_folder, 'hpo_ics.pdf')} -x 'OntoIC' -y 'FreqIC' --x_tag 'HP Ontology IC' --y_tag 'HP Frequency based IC' --x_lim '0,4.5' --y_lim '0,4.5'") if !File.exists?(File.join(temp_folder, 'hpo_ics.pdf'))
|
309
279
|
system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{hpo_profile_ic_file} -o #{File.join(temp_folder, 'hpo_profile_ics.pdf')} -x 'OntoIC' -y 'FreqIC' --x_tag 'HP Ontology Profile IC' --y_tag 'HP Frequency based Profile IC' --x_lim '0,4.5' --y_lim '0,4.5'") if !File.exists?(File.join(temp_folder, 'hpo_profile_ics.pdf'))
|
310
280
|
system_call(EXTERNAL_CODE, 'plot_scatterplot_simple.R', "-i #{parents_per_term_file} -o #{File.join(temp_folder, 'parents_per_term.pdf')} -x 'ProfileSize' -y 'ParentTerms' --x_tag 'Patient HPO profile size' --y_tag 'Parent HPO terms within the profile'")
|
311
|
-
|
312
|
-
###Cohort frequency calculation
|
313
|
-
ronto_file = File.join(temp_folder, 'hpo_freq_colour')
|
314
|
-
system_call(EXTERNAL_CODE, 'ronto_plotter.R', "-i #{hpo_frequency_file} -o #{ronto_file} --root_node #{options[:root_node]} -O #{hpo_file.gsub('.json','.obo')}") if !File.exist?(ronto_file + '.png')
|
315
|
-
|
316
|
-
write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, options[:clusters2graph])
|
281
|
+
system_call(EXTERNAL_CODE, 'ronto_plotter.R', "-i #{hpo_frequency_file} -o #{ronto_file} --root_node #{options[:root_node]} -O #{hpo_file.gsub('.json','.obo')}") if !File.exist?(ronto_file + '.png') ###Cohort frequency calculation
|
317
282
|
system_call(EXTERNAL_CODE, 'plot_boxplot.R', "#{cluster_ic_data_file} #{temp_folder} cluster_id ic 'Cluster size/id' 'Information coefficient' 'Plen' 'Profile size'")
|
318
283
|
|
319
284
|
if !options[:chromosome_col].nil?
|
320
|
-
write_cluster_chromosome_data(
|
285
|
+
write_cluster_chromosome_data(clust_by_chr, cluster_chromosome_data_file, options[:clusters2graph])
|
321
286
|
system_call(EXTERNAL_CODE, 'plot_scatterplot.R', "#{cluster_chromosome_data_file} #{temp_folder} cluster_id chr count 'Cluster size/id' 'Chromosome' 'Patients'")
|
322
287
|
if options[:coverage_analysis]
|
323
288
|
###1. Process CNVs
|
@@ -332,69 +297,16 @@ end
|
|
332
297
|
#----------------------------------
|
333
298
|
# CLUSTER COHORT ANALYZER REPORT
|
334
299
|
#----------------------------------
|
335
|
-
|
336
|
-
matrix_filename = File.join(temp_folder, "similarity_matrix_#{method_name}.npy")
|
337
|
-
axis_file = matrix_filename.gsub('.npy','.lst')
|
338
|
-
profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
|
339
|
-
clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
|
340
|
-
if !File.exists?(matrix_filename)
|
341
|
-
profiles_similarity = hpo.compare_profiles(sim_type: method_name.to_sym)
|
342
|
-
write_profile_pairs(profiles_similarity, profiles_similarity_filename)
|
343
|
-
similarity_matrix, axis_names = format_profiles_similarity_data_numo(profiles_similarity)
|
344
|
-
File.open(axis_file, 'w'){|f| f.print axis_names.join("\n") }
|
345
|
-
Npy.save(matrix_filename, similarity_matrix)
|
346
|
-
end
|
347
|
-
ext_var = ''
|
348
|
-
if method_name == 'resnik'
|
349
|
-
ext_var = '-m max'
|
350
|
-
elsif method_name == 'lin'
|
351
|
-
ext_var = '-m comp1'
|
352
|
-
end
|
353
|
-
out_file = File.join(temp_folder, method_name)
|
354
|
-
system_call(EXTERNAL_CODE, 'plot_heatmap.R', "-y #{axis_file} -d #{matrix_filename} -o #{out_file} -M #{options[:minClusterProportion]} -t dynamic -H #{ext_var}") if !File.exists?(out_file + '_heatmap.png')
|
355
|
-
clusters_codes, clusters_info = parse_clusters_file(File.join(temp_folder, "#{method_name}_clusters.txt"), patient_uniq_profiles)
|
356
|
-
get_cluster_metadata(clusters_info, clusters_distribution_filename)
|
357
|
-
out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
|
358
|
-
system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
|
359
|
-
clusters = translate_codes(clusters_codes, hpo)
|
360
|
-
|
361
|
-
container = {
|
362
|
-
:temp_folder => temp_folder,
|
363
|
-
:cluster_name => method_name,
|
364
|
-
:clusters => clusters,
|
365
|
-
:hpo => hpo
|
366
|
-
}
|
367
|
-
|
368
|
-
template = File.open(File.join(REPORT_FOLDER, 'cluster_report.erb')).read
|
369
|
-
report = Report_html.new(container, 'Patient clusters report')
|
370
|
-
report.build(template)
|
371
|
-
report.write(options[:output_file]+"_#{method_name}_clusters.html")
|
372
|
-
end
|
373
|
-
|
374
|
-
system_call(EXTERNAL_CODE, 'generate_boxpot.R', "-i #{temp_folder} -o #{File.join(temp_folder, 'sim_boxplot')}") if !File.exists?(File.join(temp_folder, 'sim_boxplot.png'))
|
375
|
-
|
300
|
+
get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
376
301
|
|
377
302
|
#----------------------------------
|
378
303
|
# GENERAL COHORT ANALYZER REPORT
|
379
304
|
#----------------------------------
|
380
|
-
|
381
|
-
new_cluster_phenotypes = {}
|
382
|
-
phenotypes_frequency = Hash.new(0)
|
383
|
-
top_cluster_phenotypes.each_with_index do |cluster, clusterID|
|
384
|
-
total_patients = cluster.length
|
385
|
-
cluster.each do |phenotypes|
|
386
|
-
phenotypes.each do |p|
|
387
|
-
phenotypes_frequency[p] += 1
|
388
|
-
end
|
389
|
-
end
|
390
|
-
new_cluster_phenotypes[clusterID] = [total_patients, phenotypes_frequency.keys, phenotypes_frequency.values.map{|v| v.fdiv(total_patients) * 100}]
|
391
|
-
phenotypes_frequency = Hash.new(0)
|
392
|
-
end
|
393
|
-
|
305
|
+
new_cluster_phenotypes = get_top_dummy_clusters_stats(top_clust_phen)
|
394
306
|
|
395
307
|
container = {
|
396
308
|
:temp_folder => temp_folder,
|
397
|
-
# :
|
309
|
+
# :top_clust_phen => top_clust_phen.length,
|
398
310
|
:summary_stats => summary_stats,
|
399
311
|
:clustering_methods => options[:clustering_methods],
|
400
312
|
:hpo_stats => hpo_stats,
|
@@ -413,8 +325,8 @@ new_cluster_phenotypes.each do |clusterID, info|
|
|
413
325
|
container["clust_#{clusterID}"] = clust_info
|
414
326
|
clust_info = []
|
415
327
|
end
|
416
|
-
|
417
328
|
template = File.open(File.join(REPORT_FOLDER, 'cohort_report.erb')).read
|
418
329
|
report = Report_html.new(container, 'Cohort quality report')
|
419
330
|
report.build(template)
|
420
|
-
report.write(options[:output_file]+'.html')
|
331
|
+
report.write(options[:output_file]+'.html')
|
332
|
+
|
data/bin/comPatMondo.rb
CHANGED
@@ -4,15 +4,12 @@
|
|
4
4
|
# @author Fernando Moreno Jabato <jabato(at)uma(dot)es>
|
5
5
|
|
6
6
|
ROOT_PATH = File.dirname(__FILE__)
|
7
|
-
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
|
8
|
-
MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
|
9
|
-
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
|
10
|
-
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
|
11
7
|
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
12
8
|
|
13
9
|
require 'optparse'
|
14
10
|
require 'semtools'
|
15
11
|
require 'csv'
|
12
|
+
require 'constants.rb'
|
16
13
|
require 'coPatReporterMethods.rb'
|
17
14
|
|
18
15
|
##########################
|