pets 0.2.3 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +68 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +102 -150
- data/bin/get_gen_features.rb +146 -0
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +8 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +86 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +16 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +172 -424
- data/lib/pets/cohort.rb +309 -0
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +8 -0
- data/lib/pets/generalMethods.rb +29 -319
- data/lib/pets/genomic_features.rb +240 -0
- data/lib/pets/io.rb +481 -0
- data/lib/pets/parsers/cohort_parser.rb +111 -0
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +9 -0
- data/pets.gemspec +7 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/cohort_report.erb +5 -7
- data/templates/evidence_profile.erb +20 -4
- data/templates/patient_report.erb +1 -1
- metadata +96 -5
data/bin/evidence_profiler.rb
CHANGED
@@ -1,166 +1,102 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
|
3
3
|
ROOT_PATH = File.dirname(__FILE__)
|
4
|
-
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
|
5
|
-
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
|
6
|
-
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
|
7
|
-
HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
|
8
4
|
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
9
5
|
|
10
6
|
require 'fileutils'
|
11
7
|
require 'optparse'
|
12
8
|
require 'report_html'
|
13
9
|
require 'semtools'
|
14
|
-
require '
|
15
|
-
|
16
|
-
|
17
|
-
class Report_html
|
18
|
-
def circular_genome(user_options = {}, &block)
|
19
|
-
default_options = {}.merge!(user_options)
|
20
|
-
coordinates = user_options[:genomic_coordinates]
|
21
|
-
html_string = canvasXpress_main(default_options, block) do |options, config, samples, vars, values, object_id, x, z|
|
22
|
-
config['graphType'] = 'Circular'
|
23
|
-
config["arcSegmentsSeparation"] = 3
|
24
|
-
config["colorScheme"] = "Tableau"
|
25
|
-
config["colors"] = ["#332288","#6699CC","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#AA4466","#882255","#AA4499"]
|
26
|
-
config["showIdeogram"] = true
|
27
|
-
chr = []
|
28
|
-
pos = []
|
29
|
-
tags2remove = []
|
30
|
-
vars.each_with_index do |var, i|
|
31
|
-
coord = coordinates[var]
|
32
|
-
if !coord.nil?
|
33
|
-
tag = coord.first.gsub(/[^\dXY]/,'')
|
34
|
-
if tag == 'X' || tag == 'Y' || (tag.to_i > 0 && tag.to_i <= 22)
|
35
|
-
chr << coord.first.gsub(/[^\dXY]/,'')
|
36
|
-
pos << coord.last - 1
|
37
|
-
else
|
38
|
-
tags2remove << i
|
39
|
-
end
|
40
|
-
else
|
41
|
-
tags2remove << i
|
42
|
-
end
|
43
|
-
end
|
44
|
-
tags2remove.reverse_each{|i| ent = vars.delete_at(i); warn("Feature #{ent} has not valid coordinates")} # Remove entities with invalid coordinates
|
45
|
-
z['chr'] = chr
|
46
|
-
z['pos'] = pos
|
47
|
-
end
|
48
|
-
return html_string
|
49
|
-
end
|
50
|
-
end
|
10
|
+
require 'pets'
|
51
11
|
|
52
12
|
#############################################################################################
|
53
13
|
## METHODS
|
54
14
|
############################################################################################
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
hpos = profile.split(',').map{|a| a.to_sym}
|
61
|
-
hpos, rejected_hpos = hpo.check_ids(hpos)
|
62
|
-
if !hpos.empty?
|
63
|
-
hpos = hpo.clean_profile(hpos)
|
64
|
-
profiles[id] = hpos if !hpos.empty?
|
65
|
-
end
|
15
|
+
# Load a tab-separated table of genome features and their pathogenic scores.
#
# Only the first two columns of each line are used: the feature id and its score.
#
# @param path [String] path to the table
# @return [Hash{String => Float}] score indexed by feature id
def load_pathogenic_scores(path)
  scores = {}
  # File.foreach closes the file once iteration ends; the previous
  # File.open(path).each form left the handle open until garbage collection.
  File.foreach(path) do |line|
    feature, score = line.chomp.split("\t")
    next if feature.nil? || feature.empty? # blank line: nothing to record
    scores[feature] = score.to_f # a missing or non-numeric score is stored as 0.0
  end
  scores
end
|
69
23
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
|
76
|
-
profile_id = File.basename(path, '.tab')
|
77
|
-
vars = {}
|
78
|
-
File.open(path).each do |line|
|
79
|
-
fields = line.chomp.split("\t")
|
80
|
-
chr = fields[0]
|
81
|
-
start = fields[1].to_i
|
82
|
-
query = coordinates[chr]
|
83
|
-
if query.nil?
|
84
|
-
coordinates[chr] = [start]
|
85
|
-
count += 1
|
86
|
-
id = "var_#{count}"
|
87
|
-
else
|
88
|
-
if !query.include?(start)
|
89
|
-
query << start
|
90
|
-
count += 1
|
91
|
-
id = "var_#{count}"
|
92
|
-
else
|
93
|
-
id = all_vars.key([chr, start])
|
94
|
-
end
|
95
|
-
end
|
96
|
-
vars[id] = [chr, start]
|
97
|
-
end
|
98
|
-
all_vars.merge!(vars)
|
99
|
-
variants[profile_id] = vars
|
100
|
-
end
|
101
|
-
return variants
|
24
|
+
# Pick, among the coordinates stored for one entity, those belonging to the
# given candidates.
#
# @param entity [Object] key used to look up genomic_coordinates
# @param genomic_coordinates [Hash] entity => { id => coordinates }
# @param candidates_ids [Array<Symbol>] ids to keep (each stored id is compared as a symbol)
# @return [Hash, nil] the filtered id => coordinates hash, or nil when the
#   entity has no coordinates at all
def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
  entity_coordinates = genomic_coordinates[entity]
  return nil if entity_coordinates.nil?

  entity_coordinates.select { |id, _coords| candidates_ids.include?(id.to_sym) }
end
|
103
30
|
|
104
|
-
def
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
genomic_coordinates
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
31
|
+
# Render the HTML evidence report of one profile and write it to
# <output>/<profile_id>.html.
#
# Side effect: the first element of every entry in all_candidates is replaced
# in place by its string form.
#
# @param profile_id [#to_s] profile identifier, also used as the file name
# @param all_candidates [Array<Array>] candidate rows; element 0 is stringified
# @param all_genomic_coordinates [Hash] id => coordinates; only the first two
#   elements of each value are handed to the report
# @param similarity_matrixs [Object] passed through to the report untouched
# @param evidences [Object] passed through to the report untouched
# @param prof_vars [Object, nil] variants of the profile (see format_variants4report)
# @param hotspots_with_pat_vars [Array] rows of the hotspot table
# @param template [Object] template handed to Report_html#build
# @param output [String] destination folder
def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs,
                evidences, prof_vars, hotspots_with_pat_vars, template, output)
  var_ids, var_coors = format_variants4report(prof_vars)
  all_candidates.each { |candidate| candidate[0] = candidate.first.to_s }
  trimmed_coordinates = all_genomic_coordinates.transform_values { |coords| coords.first(2) }
  report_data = {
    profile_id: profile_id,
    candidates: all_candidates,
    genomic_coordinates: trimmed_coordinates,
    similarity_matrixs: similarity_matrixs,
    evidences: evidences,
    var_ids: var_ids,
    var_coordinates: var_coors,
    hotspot_table: hotspots_with_pat_vars
  }
  report = Report_html.new(report_data, 'Evidence profile report')
  report.build(template)
  report.write(File.join(output, "#{profile_id}.html"))
end
|
121
48
|
|
122
|
-
def
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
49
|
+
# Give every variant a sequential id ("var_0", "var_1", ...) and split the
# data into the two structures handed to the report.
#
# @param var_data [#each, nil] yields (chromosome, region) pairs; region must
#   answer to [:start]
# @return [Array] two values: a list of [var_id, 0] pairs and a hash
#   var_id => [chromosome as String, start]; both are nil when var_data is nil
def format_variants4report(var_data)
  return nil, nil if var_data.nil?

  var_ids = []
  var_coors = {}
  var_data.each do |chr, reg|
    var_id = "var_#{var_ids.length}" # ids follow insertion order
    var_coors[var_id] = [chr.to_s, reg[:start]]
    var_ids << [var_id, 0]
  end
  [var_ids, var_coors]
end
|
136
65
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
66
|
+
|
67
|
+
# Relate the variants of a profile to the genomic hotspots built from the
# candidate coordinates.
#
# NOTE(review): the clustering and matching semantics live in Genomic_Feature
# (generate_cluster_regions, match, region_by_to) and are not visible here.
#
# @param similarity_matrixs [Object] currently unused (see TODOs below)
# @param all_genomic_coordinates [Hash] id => coordinates; the first three
#   elements of each value plus the id are passed to Genomic_Feature
# @param prof_vars [Object, nil] variants of the profile
# @return [Array<Array>] rows of [hotspot_id, [chr, start, stop], candidates
#   of the hotspot, matched variants]; empty when prof_vars is nil
def generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
  return [] if prof_vars.nil?

  phen_regions = Genomic_Feature.hash2genomic_feature(all_genomic_coordinates) { |id, coords| coords[0..2].concat([id]) }
  candidates_by_hotspot, genome_hotspots = phen_regions.generate_cluster_regions(:reg_overlap, 'A', 0, true)
  genome_matches = genome_hotspots.match(prof_vars)
  candidates_of_hotspot = invert_hash(candidates_by_hotspot)
  hotspots_with_pat_vars = []
  genome_matches.each do |hotspot_id, pat_vars|
    region = genome_hotspots.region_by_to(hotspot_id)
    hotspots_with_pat_vars << [
      hotspot_id,
      [region[:chr], region[:start], region[:stop]],
      candidates_of_hotspot[hotspot_id],
      pat_vars
    ]
  end
  # TODO: see to use original similarities without use top candidates in similarity_matrixs
  # TODO: COMPLETE UNTIL FULL PREDICTOR
  hotspots_with_pat_vars
end
|
153
84
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
85
|
+
# Invert a key => [values] mapping into value => [keys].
#
# @param h [#each] yields (key, values) pairs, values being enumerable
# @return [Hash] every value mapped to the list of keys it appeared under,
#   in iteration order
def invert_hash(h)
  inverted = {}
  h.each do |key, values|
    values.each { |value| (inverted[value] ||= []) << key }
  end
  inverted
end
|
161
99
|
|
162
|
-
|
163
|
-
|
164
100
|
#############################################################################################
|
165
101
|
## OPTPARSE
|
166
102
|
############################################################################################
|
@@ -199,6 +135,11 @@ OptionParser.new do |opts|
|
|
199
135
|
options[:variant_data] = item
|
200
136
|
end
|
201
137
|
|
138
|
+
options[:pathogenic_scores] = nil # TODO: Generalize to a folder with a table per patient
|
139
|
+
opts.on("-P", "--pathogenic_scores PATH", 'File with genome features an their pathogenic scores') do |item|
|
140
|
+
options[:pathogenic_scores] = item
|
141
|
+
end
|
142
|
+
|
202
143
|
opts.on_tail("-h", "--help", "Show this message") do
|
203
144
|
puts opts
|
204
145
|
exit
|
@@ -211,12 +152,12 @@ end.parse!
|
|
211
152
|
############################################################################################
|
212
153
|
|
213
154
|
hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
|
214
|
-
hpo = Ontology.new
|
215
|
-
hpo.read(hpo_file)
|
155
|
+
hpo = Ontology.new(file: hpo_file, load_file: true)
|
216
156
|
|
217
157
|
profiles = load_profiles(options[:profiles_file], hpo)
|
218
158
|
profile_variants = options[:variant_data].nil? ? {} : load_variants(options[:variant_data])
|
219
159
|
evidences, genomic_coordinates = load_evidences(options[:evidences], hpo)
|
160
|
+
pathogenic_scores = options[:pathogenic_scores].nil? ? {} : load_pathogenic_scores(options[:pathogenic_scores])
|
220
161
|
|
221
162
|
hpo.load_profiles(profiles)
|
222
163
|
evidences_similarity = {}
|
@@ -225,7 +166,8 @@ evidences.each do |pair, data|
|
|
225
166
|
if profile_type == 'HP'
|
226
167
|
evidence_profiles = data[:prof]
|
227
168
|
evidence_profiles.transform_keys!{|prof_id, terms| prof_id.to_sym}
|
228
|
-
|
169
|
+
similarities = hpo.compare_profiles(external_profiles: evidence_profiles, sim_type: :lin, bidirectional: false)
|
170
|
+
evidences_similarity[pair] = similarities if !similarities.empty?
|
229
171
|
end
|
230
172
|
end
|
231
173
|
|
@@ -239,23 +181,33 @@ profiles.each do |profile_id, reference_prof|
|
|
239
181
|
entity = pair.split('_').first
|
240
182
|
similarities = ev_profiles_similarity[profile_id.to_sym]
|
241
183
|
candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(reference_prof, similarities, evidences[pair][:prof], hpo, 40, 40)
|
242
|
-
|
184
|
+
coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
185
|
+
candidate_sim_matrix.unshift(['HP'] + candidates_ids)
|
186
|
+
if !pathogenic_scores.empty? # priorize by pathogenic scores
|
187
|
+
candidate_sim_matrix_patho, candidates_patho, candidates_ids_patho = get_similarity_matrix(
|
188
|
+
reference_prof, similarities,
|
189
|
+
evidences[pair][:prof], hpo, 40, 40,
|
190
|
+
other_scores = pathogenic_scores, id2label = evidences[pair][:id2lab])
|
191
|
+
if !candidate_sim_matrix_patho.empty?
|
192
|
+
candidate_sim_matrix_patho.unshift(['HP'] + candidates_ids_patho)
|
193
|
+
similarity_matrixs[pair + '_path_vars'] = candidate_sim_matrix_patho
|
194
|
+
evidences[pair + '_path_vars'] = evidences[pair]
|
195
|
+
end
|
196
|
+
end
|
197
|
+
next if coords.nil?
|
243
198
|
all_candidates.concat(candidates)
|
244
199
|
similarity_matrixs[pair] = candidate_sim_matrix
|
245
|
-
coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
246
200
|
all_genomic_coordinates.merge!(coords)
|
247
201
|
end
|
248
202
|
prof_vars = profile_variants[profile_id]
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
report.build(template)
|
260
|
-
report.write(File.join(options[:output_folder], profile_id.to_s + '.html'))
|
203
|
+
hotspots_with_pat_vars = generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
|
204
|
+
make_report(
|
205
|
+
profile_id,
|
206
|
+
all_candidates,
|
207
|
+
all_genomic_coordinates,
|
208
|
+
similarity_matrixs,
|
209
|
+
evidences, prof_vars,
|
210
|
+
hotspots_with_pat_vars,
|
211
|
+
template, options[:output_folder]
|
212
|
+
)
|
261
213
|
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
ROOT_PATH = File.dirname(__FILE__)
|
4
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
5
|
+
|
6
|
+
require 'optparse'
|
7
|
+
require 'pets'
|
8
|
+
|
9
|
+
##################################
|
10
|
+
## METHODS
|
11
|
+
##################################
|
12
|
+
|
13
|
+
# Read the records of the input table described by the command line options.
#
# @param options [Hash] parsed command line options
# @return [Array<Array>] the records produced by read_records
def get_data(options)
  fields2extract = get_fields2extract(options)
  read_records(options, fields2extract, fields2extract.values)
end
|
18
|
+
|
19
|
+
# Read records from the tab-separated file at options[:input_file].
# (Modified from cohort_parser.)
#
# After the optional id is removed, elements 1 and 2 of every record are cast
# to Integer; presumably these are the start and end coordinates, which
# requires chromosome, start and end columns to be configured — confirm
# against the caller.
#
# @param options [Hash] reads :input_file (path) and :header (truthy when the
#   first line holds the column names)
# @param fields2extract [Hash] field symbol => column; when a header is read
#   the column names are resolved in place by get_field_numbers2extract
# @param field_numbers [Array<Integer>] 0-based positions of the columns to
#   extract; recomputed from fields2extract once the header has been read
# @return [Array<Array>] one array per non-blank data line: the extracted
#   columns followed by the record id ("rec_<line index>" when no :id_col
#   is configured)
def read_records(options, fields2extract, field_numbers)
  records = []
  # File.foreach closes the file when iteration ends; File.open(...).each
  # left the handle open.
  File.foreach(options[:input_file]).with_index do |raw_line, line_index|
    line = raw_line.chomp
    if options[:header] && line_index.zero?
      field_names = line.gsub(/#\s*/, '').split("\t") # correct comment like headers
      get_field_numbers2extract(field_names, fields2extract)
      field_numbers = fields2extract.values
      next
    end
    next if line.empty? # a blank line would otherwise yield a [nil, 0, 0, id] record

    fields = line.split("\t")
    record = field_numbers.map { |n| fields[n] }
    id = fields2extract[:id_col].nil? ? "rec_#{line_index}" : record.shift
    record[1] = record[1].to_i
    record[2] = record[2].to_i
    record << id
    records << record
  end
  records
end
|
46
|
+
|
47
|
+
# Collect the column selectors given on the command line.
#
# @param options [Hash] may hold :id_col, :chromosome_col, :start_col and
#   :end_col; :header tells whether selectors are column names (kept as
#   given) or positions (converted with to_i)
# @return [Hash{Symbol => Object}] only the selectors that were provided, in
#   the order id, chromosome, start, end
def get_fields2extract(options)
  %i[id_col chromosome_col start_col end_col].each_with_object({}) do |field, selected|
    column = options[field]
    next if column.nil?

    selected[field] = options[:header] ? column : column.to_i
  end
end
|
58
|
+
|
59
|
+
# Replace, in place, every column name of fields2extract by its 0-based
# position in the header.
#
# @param field_names [Array<String>] column names read from the header line
# @param fields2extract [Hash{Symbol => String}] field => column name; its
#   values are overwritten with the matching positions
# @return [Hash] fields2extract itself
# @raise [ArgumentError] when a requested column is absent from the header.
#   Previously the position was silently stored as nil and the failure only
#   surfaced later as a TypeError while indexing the data fields.
def get_field_numbers2extract(field_names, fields2extract)
  fields2extract.each do |field, name|
    position = field_names.index(name)
    if position.nil?
      raise ArgumentError, "Column '#{name}' (#{field}) not found in header: #{field_names.join(', ')}"
    end
    fields2extract[field] = position
  end
end
|
64
|
+
|
65
|
+
##########################
|
66
|
+
#OPT-PARSER
|
67
|
+
##########################
|
68
|
+
|
69
|
+
# Command line options. Every key is initialised to its default before the
# corresponding switch is declared, so options always holds the full key set.
options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:id_col] = nil
  opts.on("-d", "--id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the id") do |data|
    options[:id_col] = data
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  options[:header] = true
  #chr\tstart\tstop
  # The switch turns header handling OFF; the help text used to state the
  # opposite ("Set if the file has a line header").
  opts.on("-H", "--header", "Set if the file has NO header line. By default the first line is read as header") do
    options[:header] = false
  end

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file path") do |data|
    options[:input_file] = data
  end

  options[:reference_file] = nil
  opts.on("-r", "--reference_file PATH", "Reference file with genome annotation") do |data|
    options[:reference_file] = data
  end

  options[:output_file] = nil
  opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
    options[:output_file] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end

  options[:feature_type] = nil
  opts.on("-t", "--feature_type STRING", "Keep features from reference whose are tagged with this feature type") do |data|
    options[:feature_type] = data
  end

  options[:feature_name] = nil
  opts.on("-n", "--feature_name STRING", "Use this feature id that is present in attributes/annotation field of reference") do |data|
    options[:feature_name] = data
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end

end
option_parser.parse!
|
130
|
+
|
131
|
+
# Build the regions from the records of the input table.
regions = Genomic_Feature.new(get_data(options))
# Load the reference annotation (restricted to the requested feature type) and
# hand it to Genomic_Feature at class level.
reference = Reference_parser.load(options[:reference_file], feature_type: options[:feature_type])
Genomic_Feature.add_reference(reference)
gene_features = regions.get_features(attr_type: options[:feature_name])

# One "<record id>\t<feature id>" line per pair returned by get_features.
File.open(options[:output_file], 'w') do |out|
  gene_features.each do |id, feat_ids|
    feat_ids.each { |ft_id| out.puts "#{id}\t#{ft_id}" }
  end
end
|