pets 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +68 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +102 -150
- data/bin/get_gen_features.rb +146 -0
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +8 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +86 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +16 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +172 -424
- data/lib/pets/cohort.rb +309 -0
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +8 -0
- data/lib/pets/generalMethods.rb +29 -319
- data/lib/pets/genomic_features.rb +240 -0
- data/lib/pets/io.rb +481 -0
- data/lib/pets/parsers/cohort_parser.rb +111 -0
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +9 -0
- data/pets.gemspec +7 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/cohort_report.erb +5 -7
- data/templates/evidence_profile.erb +20 -4
- data/templates/patient_report.erb +1 -1
- metadata +96 -5
data/bin/evidence_profiler.rb
CHANGED
@@ -1,166 +1,102 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
|
3
3
|
ROOT_PATH = File.dirname(__FILE__)
|
4
|
-
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
|
5
|
-
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
|
6
|
-
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
|
7
|
-
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
|
8
4
|
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
9
5
|
|
10
6
|
require 'fileutils'
|
11
7
|
require 'optparse'
|
12
8
|
require 'report_html'
|
13
9
|
require 'semtools'
|
14
|
-
require '
|
15
|
-
|
16
|
-
|
17
|
-
# Extension of Report_html (the class itself is defined elsewhere) that adds a
# circular genome plot.
class Report_html
  # Draw the plotted variables on a circular ideogram.
  #
  # user_options[:genomic_coordinates] is indexed by variable name and each entry
  # is read as [chromosome, position]. Variables without an entry, or whose
  # chromosome tag is not X, Y or a number in 1..22, are removed from the plot
  # and reported with a warning.
  #
  # Returns whatever canvasXpress_main returns (presumably the HTML for the
  # plot - confirm against the report_html library).
  def circular_genome(user_options = {}, &block)
    plot_options = {}.merge!(user_options)
    coordinates = user_options[:genomic_coordinates]
    canvasXpress_main(plot_options, block) do |_options, config, _samples, vars, _values, _object_id, _x, z|
      config['graphType'] = 'Circular'
      config["arcSegmentsSeparation"] = 3
      config["colorScheme"] = "Tableau"
      config["colors"] = ["#332288","#6699CC","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#AA4466","#882255","#AA4499"]
      config["showIdeogram"] = true

      chromosomes = []
      positions = []
      invalid_indexes = []
      vars.each_with_index do |var, index|
        coord = coordinates[var]
        # Keep only the digits and the X/Y letters of the chromosome name
        tag = coord.nil? ? nil : coord.first.gsub(/[^\dXY]/,'')
        if !tag.nil? && (tag == 'X' || tag == 'Y' || tag.to_i.between?(1, 22))
          chromosomes << tag
          positions << coord.last - 1 # presumably 1-based to 0-based - TODO confirm
        else
          invalid_indexes << index
        end
      end

      # Delete from the end so the remaining indexes stay valid
      invalid_indexes.reverse_each do |index|
        removed = vars.delete_at(index)
        warn("Feature #{removed} has not valid coordinates")
      end
      z['chr'] = chromosomes
      z['pos'] = positions
    end
  end
end
|
10
|
+
require 'pets'
|
51
11
|
|
52
12
|
#############################################################################################
|
53
13
|
## METHODS
|
54
14
|
############################################################################################
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
hpos = profile.split(',').map{|a| a.to_sym}
|
61
|
-
hpos, rejected_hpos = hpo.check_ids(hpos)
|
62
|
-
if !hpos.empty?
|
63
|
-
hpos = hpo.clean_profile(hpos)
|
64
|
-
profiles[id] = hpos if !hpos.empty?
|
65
|
-
end
|
15
|
+
# Load pathogenic scores from a tab-separated file.
#
# Each non-blank line is read as "feature<TAB>score"; any further columns are
# ignored. The score is converted with to_f, so a missing or non-numeric score
# becomes 0.0.
#
# path: path of the file to read.
# Returns a Hash mapping feature (String) => score (Float).
def load_pathogenic_scores(path)
  scores = {}
  # File.foreach closes the file once iteration ends; File.open(path).each left the handle open
  File.foreach(path) do |line|
    feature, score = line.chomp.split("\t")
    next if feature.nil? # blank line: nothing to store
    scores[feature] = score.to_f
  end
  return scores
end
|
69
23
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
|
76
|
-
profile_id = File.basename(path, '.tab')
|
77
|
-
vars = {}
|
78
|
-
File.open(path).each do |line|
|
79
|
-
fields = line.chomp.split("\t")
|
80
|
-
chr = fields[0]
|
81
|
-
start = fields[1].to_i
|
82
|
-
query = coordinates[chr]
|
83
|
-
if query.nil?
|
84
|
-
coordinates[chr] = [start]
|
85
|
-
count += 1
|
86
|
-
id = "var_#{count}"
|
87
|
-
else
|
88
|
-
if !query.include?(start)
|
89
|
-
query << start
|
90
|
-
count += 1
|
91
|
-
id = "var_#{count}"
|
92
|
-
else
|
93
|
-
id = all_vars.key([chr, start])
|
94
|
-
end
|
95
|
-
end
|
96
|
-
vars[id] = [chr, start]
|
97
|
-
end
|
98
|
-
all_vars.merge!(vars)
|
99
|
-
variants[profile_id] = vars
|
100
|
-
end
|
101
|
-
return variants
|
24
|
+
# Pick the coordinates of the candidate features of one entity.
#
# entity: key used to look up genomic_coordinates.
# genomic_coordinates: entity => { feature id => coordinates }.
# candidates_ids: collection of feature ids given as symbols.
# Returns the entries whose id (converted to symbol) is among candidates_ids,
# or nil when the entity has no coordinates at all.
def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
  entity_coordinates = genomic_coordinates[entity]
  return nil if entity_coordinates.nil?

  entity_coordinates.select { |feature_id, _position| candidates_ids.include?(feature_id.to_sym) }
end
|
103
30
|
|
104
|
-
def
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
genomic_coordinates
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
31
|
+
# Render the evidence report of one profile and write it to
# <output>/<profile_id>.html.
#
# The data handed to the template is gathered in a single container hash.
# Note that the first element of every entry of all_candidates is converted
# to String in place, so the caller's arrays are modified.
def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs,
                evidences, prof_vars, hotspots_with_pat_vars, template, output)
  var_ids, var_coors = format_variants4report(prof_vars)
  all_candidates.each { |candidate| candidate[0] = candidate.first.to_s }
  # Only the first two coordinate fields of each feature reach the template
  report_coordinates = all_genomic_coordinates.transform_values { |coords| coords.first(2) }
  container = {
    profile_id: profile_id,
    candidates: all_candidates,
    genomic_coordinates: report_coordinates,
    similarity_matrixs: similarity_matrixs,
    evidences: evidences,
    var_ids: var_ids,
    var_coordinates: var_coors,
    hotspot_table: hotspots_with_pat_vars
  }
  report = Report_html.new(container, 'Evidence profile report')
  report.build(template)
  report.write(File.join(output, profile_id.to_s + '.html'))
end
|
121
48
|
|
122
|
-
def
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
49
|
+
# Turn the variants of a profile into the two structures used by the report.
#
# var_data: nil, or an object whose each yields (chromosome, region), where
#           region[:start] is the variant position.
# Returns [var_ids, var_coors]:
#   var_ids   - list of [id, 0] pairs, with ids "var_0", "var_1", ...
#   var_coors - id => [chromosome as String, start]
# Both are nil when var_data is nil.
def format_variants4report(var_data)
  return nil, nil if var_data.nil?

  var_ids = []
  var_coors = {}
  var_data.each do |chr, reg|
    var_id = "var_#{var_ids.length}" # ids are numbered in iteration order
    var_ids << [var_id, 0]
    var_coors[var_id] = [chr.to_s, reg[:start]]
  end
  [var_ids, var_coors]
end
|
136
65
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
66
|
+
|
67
|
+
# Cross the genomic hotspots built from the candidate coordinates with the
# variants of the profile.
#
# similarity_matrixs is currently unused (see the TODO notes below).
# Returns a list of [hotspot_id, [chr, start, stop], candidates of the hotspot,
# matching profile variants]; the list is empty when prof_vars is nil.
def generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
  hotspots_with_pat_vars = []
  return hotspots_with_pat_vars if prof_vars.nil?

  # Each feature becomes its first three coordinate fields followed by its id
  phen_regions = Genomic_Feature.hash2genomic_feature(all_genomic_coordinates) do |feature_id, coords|
    coords[0..2] << feature_id
  end
  phen_candidates_by_hotspot, phen_genome_hotspots = phen_regions.generate_cluster_regions(:reg_overlap, 'A', 0, true)
  genome_matches = phen_genome_hotspots.match(prof_vars)
  hotspot_with_phen_candidates = invert_hash(phen_candidates_by_hotspot)
  genome_matches.each do |hotspot_id, pat_vars|
    region = phen_genome_hotspots.region_by_to(hotspot_id)
    hotspots_with_pat_vars << [
      hotspot_id,
      [region[:chr], region[:start], region[:stop]],
      hotspot_with_phen_candidates[hotspot_id],
      pat_vars
    ]
  end
  # TODO: see to use original similarities without use top candidates in similarity_matrixs
  # TODO: COMPLETE UNTIL FULL PREDICTOR
  hotspots_with_pat_vars
end
|
153
84
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
85
|
+
# Invert a key => list-of-values mapping.
#
# Returns a new hash in which every value found in the lists points to the
# list of keys it appeared under, in iteration order.
def invert_hash(h)
  inverted = {}
  h.each do |key, members|
    members.each { |member| (inverted[member] ||= []) << key }
  end
  inverted
end
|
161
99
|
|
162
|
-
|
163
|
-
|
164
100
|
#############################################################################################
|
165
101
|
## OPTPARSE
|
166
102
|
############################################################################################
|
@@ -199,6 +135,11 @@ OptionParser.new do |opts|
|
|
199
135
|
options[:variant_data] = item
|
200
136
|
end
|
201
137
|
|
138
|
+
options[:pathogenic_scores] = nil # TODO: Generalize to a folder with a table per patient
|
139
|
+
opts.on("-P", "--pathogenic_scores PATH", 'File with genome features an their pathogenic scores') do |item|
|
140
|
+
options[:pathogenic_scores] = item
|
141
|
+
end
|
142
|
+
|
202
143
|
opts.on_tail("-h", "--help", "Show this message") do
|
203
144
|
puts opts
|
204
145
|
exit
|
@@ -211,12 +152,12 @@ end.parse!
|
|
211
152
|
############################################################################################
|
212
153
|
|
213
154
|
hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
|
214
|
-
hpo = Ontology.new
|
215
|
-
hpo.read(hpo_file)
|
155
|
+
hpo = Ontology.new(file: hpo_file, load_file: true)
|
216
156
|
|
217
157
|
profiles = load_profiles(options[:profiles_file], hpo)
|
218
158
|
profile_variants = options[:variant_data].nil? ? {} : load_variants(options[:variant_data])
|
219
159
|
evidences, genomic_coordinates = load_evidences(options[:evidences], hpo)
|
160
|
+
pathogenic_scores = options[:pathogenic_scores].nil? ? {} : load_pathogenic_scores(options[:pathogenic_scores])
|
220
161
|
|
221
162
|
hpo.load_profiles(profiles)
|
222
163
|
evidences_similarity = {}
|
@@ -225,7 +166,8 @@ evidences.each do |pair, data|
|
|
225
166
|
if profile_type == 'HP'
|
226
167
|
evidence_profiles = data[:prof]
|
227
168
|
evidence_profiles.transform_keys!{|prof_id, terms| prof_id.to_sym}
|
228
|
-
|
169
|
+
similarities = hpo.compare_profiles(external_profiles: evidence_profiles, sim_type: :lin, bidirectional: false)
|
170
|
+
evidences_similarity[pair] = similarities if !similarities.empty?
|
229
171
|
end
|
230
172
|
end
|
231
173
|
|
@@ -239,23 +181,33 @@ profiles.each do |profile_id, reference_prof|
|
|
239
181
|
entity = pair.split('_').first
|
240
182
|
similarities = ev_profiles_similarity[profile_id.to_sym]
|
241
183
|
candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(reference_prof, similarities, evidences[pair][:prof], hpo, 40, 40)
|
242
|
-
|
184
|
+
coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
185
|
+
candidate_sim_matrix.unshift(['HP'] + candidates_ids)
|
186
|
+
if !pathogenic_scores.empty? # priorize by pathogenic scores
|
187
|
+
candidate_sim_matrix_patho, candidates_patho, candidates_ids_patho = get_similarity_matrix(
|
188
|
+
reference_prof, similarities,
|
189
|
+
evidences[pair][:prof], hpo, 40, 40,
|
190
|
+
other_scores = pathogenic_scores, id2label = evidences[pair][:id2lab])
|
191
|
+
if !candidate_sim_matrix_patho.empty?
|
192
|
+
candidate_sim_matrix_patho.unshift(['HP'] + candidates_ids_patho)
|
193
|
+
similarity_matrixs[pair + '_path_vars'] = candidate_sim_matrix_patho
|
194
|
+
evidences[pair + '_path_vars'] = evidences[pair]
|
195
|
+
end
|
196
|
+
end
|
197
|
+
next if coords.nil?
|
243
198
|
all_candidates.concat(candidates)
|
244
199
|
similarity_matrixs[pair] = candidate_sim_matrix
|
245
|
-
coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
246
200
|
all_genomic_coordinates.merge!(coords)
|
247
201
|
end
|
248
202
|
prof_vars = profile_variants[profile_id]
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
report.build(template)
|
260
|
-
report.write(File.join(options[:output_folder], profile_id.to_s + '.html'))
|
203
|
+
hotspots_with_pat_vars = generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
|
204
|
+
make_report(
|
205
|
+
profile_id,
|
206
|
+
all_candidates,
|
207
|
+
all_genomic_coordinates,
|
208
|
+
similarity_matrixs,
|
209
|
+
evidences, prof_vars,
|
210
|
+
hotspots_with_pat_vars,
|
211
|
+
template, options[:output_folder]
|
212
|
+
)
|
261
213
|
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
ROOT_PATH = File.dirname(__FILE__)
|
4
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
5
|
+
|
6
|
+
require 'optparse'
|
7
|
+
require 'pets'
|
8
|
+
|
9
|
+
##################################
|
10
|
+
## METHODS
|
11
|
+
##################################
|
12
|
+
|
13
|
+
# Read the input records selected by the column options.
#
# The columns to extract are resolved from options and the records are then
# loaded with read_records. Returns whatever read_records returns.
def get_data(options)
  fields2extract = get_fields2extract(options)
  read_records(options, fields2extract, fields2extract.values)
end
|
18
|
+
|
19
|
+
# Read the records of a tab-separated file.
#
# options:        uses :input_file (path) and :header (true when the first line
#                 holds the column names).
# fields2extract: field => column. When a header is present it is updated in
#                 place by get_field_numbers2extract with the column positions.
# field_numbers:  column positions to extract; replaced by the positions
#                 resolved from the header when there is one.
#
# Every record holds the extracted columns in field_numbers order, with the
# values at positions 1 and 2 converted to Integer, followed by the record id.
# The id is the first extracted column when :id_col is configured, otherwise a
# generated "rec_<line index>". Blank lines are skipped.
# Returns the list of records.
def read_records(options, fields2extract, field_numbers) # Modified from cohort_parser
  records = []
  # File.foreach closes the file when iteration ends (File.open(...).each did not).
  # The index counts every physical line, header and blank lines included, so
  # generated ids keep the original numbering.
  File.foreach(options[:input_file]).with_index do |line, count|
    line = line.chomp
    if options[:header] && count == 0
      field_names = line.gsub(/#\s*/,'').split("\t") # correct comment like headers
      get_field_numbers2extract(field_names, fields2extract)
      field_numbers = fields2extract.values
      next
    end
    next if line.empty? # a blank line would otherwise yield a [nil, 0, 0, id] record

    fields = line.split("\t")
    record = field_numbers.map { |n| fields[n] }
    id = fields2extract[:id_col].nil? ? "rec_#{count}" : record.shift # generate ids when there is no id column
    record[1] = record[1].to_i
    record[2] = record[2].to_i
    record << id
    records << record
  end
  return records
end
|
46
|
+
|
47
|
+
# Collect the column options that were set by the user.
#
# Looks at :id_col, :chromosome_col, :start_col and :end_col, in that order,
# and keeps the ones that are not nil. Values are converted to Integer when
# options[:header] is false or nil (columns are then given by position) and
# are kept untouched otherwise.
# Returns a Hash field => column.
def get_fields2extract(options)
  by_position = !options[:header]
  [:id_col, :chromosome_col, :start_col, :end_col].each_with_object({}) do |field, selected|
    column = options[field]
    next if column.nil?

    selected[field] = by_position ? column.to_i : column
  end
end
|
58
|
+
|
59
|
+
# Replace, in place, the column names stored in fields2extract by their
# position in the header.
#
# field_names:    column names of the header, in file order.
# fields2extract: field => column name; it ends up as field => column index.
# Returns fields2extract.
# Raises ArgumentError when a requested column name is not in the header. A
# missing name used to be stored as nil and only failed later, while indexing
# the fields of a record, with an unrelated-looking TypeError. When the error
# is raised, fields2extract is left unchanged.
def get_field_numbers2extract(field_names, fields2extract)
  positions = fields2extract.transform_values { |name| field_names.index(name) }
  missing = positions.select { |_field, position| position.nil? }.keys
  unless missing.empty?
    unknown = missing.map { |field| "#{field}=#{fields2extract[field].inspect}" }.join(', ')
    raise ArgumentError, "Column name(s) not found in header: #{unknown}"
  end
  fields2extract.update(positions)
end
|
64
|
+
|
65
|
+
##########################
|
66
|
+
#OPT-PARSER
|
67
|
+
##########################
|
68
|
+
|
69
|
+
# Command line options. Every option starts with its default value and is
# overwritten when the matching flag is given. Column options are kept as the
# raw String received from the command line.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:id_col] = nil
  opts.on("-d", "--id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the id") do |data|
    options[:id_col] = data
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  options[:header] = true
  #chr\tstart\tstop
  # The flag switches header handling OFF; the help text used to state the
  # opposite. Behaviour is unchanged, only the description was corrected.
  opts.on("-H", "--header", "Set if the file has no line header. By default the first line is read as header") do
    options[:header] = false
  end

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file path") do |data|
    options[:input_file] = data
  end

  options[:reference_file] = nil
  opts.on("-r", "--reference_file PATH", "Reference file with genome annotation") do |data|
    options[:reference_file] = data
  end

  options[:output_file] = nil
  opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
    options[:output_file] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end

  options[:feature_type] = nil
  opts.on("-t", "--feature_type STRING", "Keep features from reference whose are tagged with this feature type") do |data|
    options[:feature_type] = data
  end

  options[:feature_name] = nil
  opts.on("-n", "--feature_name STRING", "Use this feature id that is present in attributes/annotation field of reference") do |data|
    options[:feature_name] = data
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end

end.parse!
|
130
|
+
|
131
|
+
# Main: load the input regions, register the reference annotation and write one
# "<region id>\t<feature id>" line for every feature reported for a region.
regions = Genomic_Feature.new(get_data(options))
reference = Reference_parser.load(options[:reference_file], feature_type: options[:feature_type])
Genomic_Feature.add_reference(reference)
gene_features = regions.get_features(attr_type: options[:feature_name])

File.open(options[:output_file], 'w') do |f|
  gene_features.each do |id, feat_ids|
    feat_ids.each { |ft_id| f.puts "#{id}\t#{ft_id}" }
  end
end
|