pets 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
data/bin/evidence_profiler.rb
CHANGED
@@ -1,166 +1,63 @@
 #! /usr/bin/env ruby
 
 ROOT_PATH = File.dirname(__FILE__)
-REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
-EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
-EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
-HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
 $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
 
 require 'fileutils'
 require 'optparse'
 require 'report_html'
 require 'semtools'
-require '
-
-
-class Report_html
-def circular_genome(user_options = {}, &block)
-default_options = {}.merge!(user_options)
-coordinates = user_options[:genomic_coordinates]
-html_string = canvasXpress_main(default_options, block) do |options, config, samples, vars, values, object_id, x, z|
-config['graphType'] = 'Circular'
-config["arcSegmentsSeparation"] = 3
-config["colorScheme"] = "Tableau"
-config["colors"] = ["#332288","#6699CC","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#AA4466","#882255","#AA4499"]
-config["showIdeogram"] = true
-chr = []
-pos = []
-tags2remove = []
-vars.each_with_index do |var, i|
-coord = coordinates[var]
-if !coord.nil?
-tag = coord.first.gsub(/[^\dXY]/,'')
-if tag == 'X' || tag == 'Y' || (tag.to_i > 0 && tag.to_i <= 22)
-chr << coord.first.gsub(/[^\dXY]/,'')
-pos << coord.last - 1
-else
-tags2remove << i
-end
-else
-tags2remove << i
-end
-end
-tags2remove.reverse_each{|i| ent = vars.delete_at(i); warn("Feature #{ent} has not valid coordinates")} # Remove entities with invalid coordinates
-z['chr'] = chr
-z['pos'] = pos
-end
-return html_string
-end
-end
+require 'pets'
 
 #############################################################################################
 ## METHODS
 ############################################################################################
-def
-
-
-
-id, profile = line.chomp.split("\t")
-hpos = profile.split(',').map{|a| a.to_sym}
-hpos, rejected_hpos = hpo.check_ids(hpos)
-if !hpos.empty?
-hpos = hpo.clean_profile(hpos)
-profiles[id] = hpos if !hpos.empty?
-end
-end
-return profiles
-end
-
-def load_variants(variant_folder)
-variants = {}
-coordinates = {}
-count = 0
-all_vars = {}
-Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
-profile_id = File.basename(path, '.tab')
-vars = {}
-File.open(path).each do |line|
-fields = line.chomp.split("\t")
-chr = fields[0]
-start = fields[1].to_i
-query = coordinates[chr]
-if query.nil?
-coordinates[chr] = [start]
-count += 1
-id = "var_#{count}"
-else
-if !query.include?(start)
-query << start
-count += 1
-id = "var_#{count}"
-else
-id = all_vars.key([chr, start])
-end
-end
-vars[id] = [chr, start]
-end
-all_vars.merge!(vars)
-variants[profile_id] = vars
-end
-return variants
-end
-
-def load_evidences(evidences_path, hpo)
-genomic_coordinates = {}
-coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
-coord_files.each do |cd_f|
-entity = File.basename(cd_f, '.coords')
-coordinates = load_coordinates(cd_f)
-genomic_coordinates[entity] = coordinates
-end
-evidences = {}
-evidence_files = Dir.glob(File.join(evidences_path, '*_HP.txt'))
-evidence_files.each do |e_f|
-pair = File.basename(e_f, '.txt')
-profiles, id2label = load_evidence_profiles(e_f, hpo)
-evidences[pair] = {prof: profiles, id2lab: id2label}
-end
-return evidences, genomic_coordinates
+def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
+all_coordinates = genomic_coordinates[entity]
+coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)}
+return coords
 end
 
-def
-
-
-
-
-
-
-
-
-
-
-
-
+def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs, evidences, prof_vars, template, output)
+var_ids, var_coors = format_variants4report(prof_vars)
+container = {
+profile_id: profile_id,
+candidates: all_candidates.each{|c| c[0] = c.first.to_s},
+genomic_coordinates: all_genomic_coordinates.transform_values{|c| c.first(2) },
+similarity_matrixs: similarity_matrixs,
+evidences: evidences,
+var_ids: var_ids,
+var_coordinates: var_coors
+}
+report = Report_html.new(container, 'Evidence profile report')
+report.build(template)
+report.write(File.join(output, profile_id.to_s + '.html'))
 end
 
-def
-
-
-
-
-
-
-
-
-
-
-
+def format_variants4report(var_data)
+if var_data.nil?
+var_ids, var_coors = nil
+else
+var_ids = []
+var_coors = {}
+count = 0
+var_data.each do |chr, reg|
+var_id = "var_#{count}"
+var_ids << [var_id, 0]
+var_coors[var_id] = [chr.to_s, reg[:start]]
+count += 1
 end
 end
-return
+return var_ids, var_coors
 end
 
-
-
-
-
-coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)}
-return coords
+def get_genome_hotspots(similarity_matrixs, all_genomic_coordinates)
+regions = Genomic_Feature.new(all_genomic_coordinates.values.map{|g| g[0..2]})
+candidates_by_window, genome_windows = regions.generate_cluster_regions(:reg_overlap, 'A', 1)
+# TODO: COMPLETE UNTIL FULL PREDICTOR
 end
 
 
-
 #############################################################################################
 ## OPTPARSE
 ############################################################################################
@@ -245,17 +142,7 @@ profiles.each do |profile_id, reference_prof|
 coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
 all_genomic_coordinates.merge!(coords)
 end
+get_genome_hotspots(similarity_matrixs, all_genomic_coordinates)
 prof_vars = profile_variants[profile_id]
-
-profile_id: profile_id,
-candidates: all_candidates.each{|c| c[0] = c.first.to_s},
-genomic_coordinates: all_genomic_coordinates.transform_values{|c| c.first(2) },
-similarity_matrixs: similarity_matrixs,
-evidences: evidences,
-var_ids: prof_vars.nil? ? nil : prof_vars.keys.map{|i| [i, 0]},
-var_coordinates: prof_vars
-}
-report = Report_html.new(container, 'Evidence profile report')
-report.build(template)
-report.write(File.join(options[:output_folder], profile_id.to_s + '.html'))
+make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs, evidences, prof_vars, template, options[:output_folder])
 end
data/bin/get_network_nodes.rb
CHANGED
@@ -3,112 +3,34 @@
 # Code to prepare data to get the associations between pathological phenotypes (HPO) and genomic regions (SOR)
 
 ROOT_PATH = File.dirname(__FILE__)
-EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
-HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
 $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
 
 ##############################
 #LIBRARIES
 ##############################
-require 'generalMethods.rb'
 require 'optparse'
-require '
+require 'pets'
 
 ###############################
 #METHODS
 ###############################
 
-def
-patient2phenotype = {}
-hpo_count = {}
-not_found = []
-patients_genomic_region_by_chr = {}
-File.open(patient_file).each do |line|
-line.chomp!
-next if line.include?("#")
-patient, chr, start, stop, phenotype_profile = line.split("\t", 5)
-next if phenotype_profile.nil? #For skipping patients without phenotypes
-phenotypes = phenotype_profile.split('|')
-# phenotypes, rejected = hpo.translate_names2codes(phenotypes)
-phenotypes, rejected = hpo.translate_names(phenotypes)
-not_found = not_found | rejected
-phenotypes.each do |hpo_code|
-get_all_hpos(patient, hpo_code, patient2phenotype, hpo, hpo_count, add_parents) if !hpo.is_removable(hpo_code)
-end
-info = [patient, start.to_i, stop.to_i]
-add_record(patients_genomic_region_by_chr, chr, info)
-end
-return patient2phenotype, hpo_count, not_found, patients_genomic_region_by_chr
-end
-
-
-def get_all_hpos(patient, hpo_code, patient2phenotype, hpo, hpo_count, add_parents)
-add_record(hpo_count, hpo_code, patient)
-add_record(patient2phenotype, patient, hpo_code)
-if add_parents == 'root'
-# hpo_parent_codes = hpo.get_parents(hpo_code)
-hpo_parent_codes = hpo.get_ancestors(hpo_code)
-hpo_parent_codes.each do |parent_code|
-add_record(hpo_count, parent_code, patient)
-add_record(patient2phenotype, patient, parent_code)
-end
-end
-end
-
-def build_tripartite_network(patients2hpo, hpo_stats, ic_threshold, patients_by_cluster)
+def build_tripartite_network(patient_data, patients_by_cluster, add_parents, ont)
 tripartite_network = []
 patients_by_cluster.each do |patient, node_ids|
 node_ids.each do |node_id|
 tripartite_network << [node_id, patient]
 end
 end
-
-
-
-
-
-end
-end
+patient_data.each_profile do |id, profile|
+profile = profile.map{|term| ont.get_ancestors(term)}.flatten.uniq if add_parents == 'root'
+profile.each do |term|
+tripartite_network << [term, id]
+end
 end
 return tripartite_network
 end
 
-def compute_hpo_stats(hpo_count, patient_number)
-hpo_stats = {}
-patient_hpo_ic = []
-hpo_count.each do |hpo_code, patient_ids|
-hpo_freq = patient_ids.length.fdiv(patient_number) #hpo frequency in patients
-hpo_ic = -Math.log10(hpo_freq)
-hpo_stats[hpo_code] = [hpo_freq, hpo_ic]
-patient_ids.each do |patient_id|
-patient_hpo_ic << [patient_id, hpo_code, hpo_ic]
-end
-end
-return hpo_stats, patient_hpo_ic.sort{|a,b| a.first <=> b.first}
-end
-
-def write_hash(hash, file_path, header = [])
-File.open(file_path, 'w') do |handler|
-handler.puts header.join("\t") if !header.empty?
-hash.each do |key, array|
-handler.puts "#{key}\t#{array.join("\t")}"
-end
-end
-end
-
-def write_array(array, file_path)
-File.open(file_path, 'w') do |handler|
-array.each do |record|
-if record.class == String
-line = record
-else
-line = record.join("\t")
-end
-handler.puts line
-end
-end
-end
-
 ##############################
 #OPTPARSE
 ##############################
@@ -117,33 +39,75 @@ options = {}
 OptionParser.new do |opts|
 opts.banner = "Usage: #{__FILE__} [options]"
 
-options[:
-opts.on("-c", "--
-options[:
-end
+options[:chromosome_col] = nil
+opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
+options[:chromosome_col] = data
+end
 
-options[:
-opts.on("-
-options[:
+options[:id_col] = nil
+opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
+options[:id_col] = data
 end
 
-options[:
-opts.on("-
-options[:
+options[:end_col] = nil
+opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
+options[:end_col] = data
+end
+
+options[:ont_col] = nil
+opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
+options[:ont_col] = data
 end
 
-options[:
-opts.on("-
-options[:
+options[:start_col] = nil
+opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
+options[:start_col] = data
+end
+
+options[:separator] = '|'
+opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
+options[:separator] = data
+end
+
+options[:names] = false
+opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
+options[:names] = true
+end
+
+options[:header] = true
+opts.on("-H", "--header", "File has a line header. Default true") do
+options[:header] = false
+end
+
+#===================================================================
+
+options[:input_file] = nil
+opts.on("-i", "--input_file PATH", "Input file with patients for parsing phenotypes to HPO codes") do |value|
+options[:input_file] = value
 end
 
 options[:output_file] = 'tripartite_network.txt'
 opts.on("-o", "--output_file PATH", "Output file for the tripartite network") do |value|
 options[:output_file] = value
+end
+
+options[:cluster_file] = 'cluster_coords.txt'
+opts.on("-u", "--cluster_file PATH", "Cluster coords output file that will be used to translate SOR nodes") do |value|
+options[:cluster_file] = File.basename(value)
 end
 
+options[:excluded_hpo] = nil
+opts.on("-x", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
+options[:excluded_hpo] = excluded_hpo
+end
+
+options[:tag] = 'A'
+opts.on("-m", "--mutation_type STRING", "Type of patient mutation, either it is a deletion (d) or duplication (D)") do |type|
+options[:tag] = type
+end
+
 options[:hpo_file] = nil
-opts.on("-
+opts.on("-O", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
 options[:hpo_file] = value
 end
 
@@ -152,16 +116,6 @@ OptionParser.new do |opts|
 options[:add_parents] = value
 end
 
-options[:hpo_stat_file] = 'hpo_stats.txt'
-opts.on("-s", "--hpo_stat_file PATH", "Output file with HPO codes, their frequency and CI") do |value|
-options[:hpo_stat_file] = File.basename(value)
-end
-
-options[:thresold] = 0
-opts.on("-t", "--info_thresold FLOAT", "IC thresold to discard non informative hpo. Default: 0.") do |thresold|
-options[:thresold] = thresold.to_f
-end
-
 opts.on_tail("-h", "--help", "Show this message") do
 puts opts
 exit
@@ -176,27 +130,20 @@ output_folder = File.dirname(File.expand_path(options[:output_file]))
 Dir.mkdir(output_folder) if !File.exists?(output_folder)
 
 hpo_file = options[:hpo_file]
-hpo_file = ENV['hpo_file'] if hpo_file.nil?
-hpo_file
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-tripartite_network = build_tripartite_network(patients2hpo, hpo_stats, options[:thresold], patients_by_cluster)
-
-# write_array(not_found - hpo.excluded_codes, File.join(output_folder, 'missing_hpo_names'))
-write_array(not_found - hpo.removable_terms, File.join(output_folder, 'missing_hpo_names'))
+hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE if hpo_file.nil?
+Cohort.load_ontology(:hpo, hpo_file, options[:excluded_hpo])
+Cohort.act_ont = :hpo
+hpo = Cohort.get_ontology(Cohort.act_ont)
+
+patient_data, rejected_hpos_L, rejected_patients_L = Cohort_Parser.load(options)
+rejected_hpos_C, rejected_patients_C = patient_data.check
+rejected_hpos = rejected_hpos_L | rejected_hpos_C
+rejected_patients = rejected_patients_L + rejected_patients_C
+patient_data.remove_incomplete_records
+patient_data.index_vars
+patients_by_cluster, sors = patient_data.generate_cluster_regions(:reg_overlap, options[:tag], 1)
+tripartite_network = build_tripartite_network(patient_data, patients_by_cluster, options[:add_parents], hpo)
+
+write_array(rejected_hpos, File.join(output_folder, 'missing_hpo_names'))
 write_array(sors, File.join(output_folder, options[:cluster_file]))
-
-write_array(tripartite_network, options[:output_file])
-write_array(patient_hpo_ic, File.join(output_folder, 'filtered_hpo.txt'))
+write_array(tripartite_network, options[:output_file])
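
The rewritten build_tripartite_network shown above takes the cohort object and the ontology directly: patient-to-SOR edges come from the cluster assignment, and patient-to-HPO edges come from each profile, optionally expanded with ancestor terms when add_parents is set to 'root'. The Ruby sketch below uses toy stand-ins for the cohort and ontology (the real objects are the pets Cohort and the semtools ontology loaded from hp.obo/hp.json) just to show the edge list the function emits; whether the library's get_ancestors includes the query term itself is not asserted here, and the SOR node id is hypothetical.

    # Toy stand-ins only; the real objects come from the pets Cohort class and
    # a semtools ontology. This toy get_ancestors returns the term plus its
    # stored parents, which may differ from the library's behaviour.
    class ToyCohort
      def initialize(profiles)
        @profiles = profiles # { patient_id => [HPO term symbols] }
      end

      def each_profile(&block)
        @profiles.each(&block)
      end
    end

    class ToyOntology
      def initialize(ancestors)
        @ancestors = ancestors # { term => [ancestor terms] }
      end

      def get_ancestors(term)
        [term] + (@ancestors[term] || [])
      end
    end

    # Reproduced from the new version shown above.
    def build_tripartite_network(patient_data, patients_by_cluster, add_parents, ont)
      tripartite_network = []
      patients_by_cluster.each do |patient, node_ids|
        node_ids.each do |node_id|
          tripartite_network << [node_id, patient] # SOR node -> patient edge
        end
      end
      patient_data.each_profile do |id, profile|
        profile = profile.map{|term| ont.get_ancestors(term)}.flatten.uniq if add_parents == 'root'
        profile.each do |term|
          tripartite_network << [term, id] # HPO term -> patient edge
        end
      end
      return tripartite_network
    end

    cohort = ToyCohort.new('pat1' => [:'HP:0001250'])
    ont = ToyOntology.new(:'HP:0001250' => [:'HP:0012638'])
    clusters = { 'pat1' => ['1.A.3'] } # hypothetical SOR node id
    edges = build_tripartite_network(cohort, clusters, 'root', ont)
    # edges => [["1.A.3", "pat1"], [:"HP:0001250", "pat1"], [:"HP:0012638", "pat1"]]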
data/bin/get_sorted_profs.rb
CHANGED
@@ -1,33 +1,11 @@
 #! /usr/bin/env ruby
 
-
 ROOT_PATH = File.dirname(__FILE__)
-REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
-EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
-HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
 $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
 
 require 'optparse'
 require 'report_html'
-require '
-require 'generalMethods.rb'
-
-#############################################################################################
-## METHODS
-############################################################################################
-def procces_patient_data(patient_data, hpo)
-clean_profiles = {}
-all_hpo = []
-patient_data.each do |pat_id, data|
-profile = hpo.clean_profile_hard(data.first.map{|c| c.to_sym})
-if !profile.empty?
-clean_profiles[pat_id] = profile
-all_hpo.concat(profile)
-end
-end
-ref_prof = hpo.clean_profile_hard(all_hpo.uniq)
-return ref_prof, clean_profiles
-end
+require 'pets'
 
 #############################################################################################
 ## OPTPARSE
@@ -42,9 +20,9 @@ OptionParser.new do |opts|
 options[:chromosome_col] = data
 end
 
-options[:
+options[:id_col] = nil
 opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
-options[:
+options[:id_col] = data
 end
 
 options[:end_col] = nil
@@ -67,9 +45,9 @@ OptionParser.new do |opts|
 options[:input_file] = value
 end
 
-options[:
+options[:ont_col] = nil
 opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
-options[:
+options[:ont_col] = data
 end
 
 options[:start_col] = nil
@@ -77,9 +55,14 @@ OptionParser.new do |opts|
 options[:start_col] = data
 end
 
-options[:
+options[:separator] = '|'
 opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
-options[:
+options[:separator] = data
+end
+
+options[:term_freq] = 0
+opts.on("-f", "--general_prof_freq INTEGER", "When reference profile is not given, a general ine is computed with all profiles. If a freq is defined (0-1), all terms with freq minor than limit are removed") do |data|
+options[:term_freq] = data.to_i
 end
 
 options[:matrix_limits] = [20, 40]
@@ -101,15 +84,21 @@ end.parse!
 #############################################################################################
 ## MAIN
 ############################################################################################
-patient_data = load_patient_cohort(options)
 
 hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
-hpo
-hpo
-
-
-
-
+Cohort.load_ontology(:hpo, hpo_file)
+Cohort.act_ont = :hpo
+hpo = Cohort.get_ontology(Cohort.act_ont)
+patient_data, _, _ = Cohort_Parser.load(options)
+patient_data.check(hard=true)
+
+clean_profiles = patient_data.profiles
+if !options[:ref_prof].nil?
+ref_profile = hpo.clean_profile_hard(options[:ref_prof])
+else
+ref_profile = patient_data.get_general_profile(options[:term_freq])
+end
+hpo.load_profiles({ref: ref_profile}, reset_stored: true)
 
 similarities = hpo.compare_profiles(external_profiles: clean_profiles, sim_type: :lin, bidirectional: false)
 
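
The new -f/--general_prof_freq option added above feeds patient_data.get_general_profile, whose source is not part of this diff; the option help describes it as pooling every patient profile into a general reference profile and dropping terms whose frequency falls below the limit. The Ruby stand-in below follows that documented reading only, and is not the gem's actual Cohort implementation; the class, method body, and example HPO terms are illustrative.

    # Stand-in for the documented general-profile behaviour: pool all patient
    # profiles and keep terms whose frequency across patients reaches term_freq.
    class MiniCohort
      def initialize(profiles)
        @profiles = profiles # { patient_id => [HPO term symbols] }
      end

      def get_general_profile(term_freq = 0)
        n = @profiles.length.to_f
        counts = Hash.new(0)
        @profiles.each_value { |prof| prof.uniq.each { |term| counts[term] += 1 } }
        counts.select { |_term, c| c / n >= term_freq }.keys
      end
    end

    cohort = MiniCohort.new(
      'pat1' => [:'HP:0001250', :'HP:0000252'],
      'pat2' => [:'HP:0001250'],
      'pat3' => [:'HP:0001250', :'HP:0001263']
    )
    puts cohort.get_general_profile(0.5).inspect # => [:"HP:0001250"]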
data/bin/install_deps.rb
ADDED