pets 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/coPatReporter.rb +5 -0
- data/bin/evidence_profiler.rb +79 -14
- data/bin/get_gen_features.rb +146 -0
- data/bin/install_deps.rb +3 -2
- data/bin/profiles2phenopacket.rb +1 -25
- data/external_code/install_R_dependencies.R +6 -1
- data/lib/pets/coPatReporterMethods.rb +50 -4
- data/lib/pets/cohort.rb +10 -8
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +2 -1
- data/lib/pets/generalMethods.rb +21 -2
- data/lib/pets/genomic_features.rb +106 -10
- data/lib/pets/io.rb +32 -8
- data/lib/pets/parsers/cohort_parser.rb +8 -3
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +2 -1
- data/pets.gemspec +6 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/evidence_profile.erb +20 -4
- metadata +65 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f1d5c3ad0cb57b26b2c67e02b38a282139965472ded083acf0d1fcae48c0fec
|
4
|
+
data.tar.gz: 8b34f2440afe74f0b9c0e6024c2a05daee4a7be0efd0c6a3d80aef49673c7c7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3e9bc8559bb3f3e0c9a7ce1e0658645f54afc83fc49e7415cc3177576af975e4f7268a1f37e017d83fd88042197f102a4290cc29d7b0a16e12ec4964feea39d
|
7
|
+
data.tar.gz: a2aa8fe161b52d2f3e86e0d04f2a1a762298de95582098e553287ad905ff7b97eae6f96893de8bd78676c01b412a16973b1b53cf6ebbd9cb48e49c929c7f1d74
|
data/bin/coPatReporter.rb
CHANGED
@@ -42,6 +42,11 @@ OptionParser.new do |opts|
|
|
42
42
|
options[:id_col] = data
|
43
43
|
end
|
44
44
|
|
45
|
+
options[:detailed_clusters] = false
|
46
|
+
opts.on("-D", "--detailed_clusters", "Show detiled cluster comparation using heatmaps. Default false") do
|
47
|
+
options[:detailed_clusters] = true
|
48
|
+
end
|
49
|
+
|
45
50
|
options[:excluded_hpo] = nil
|
46
51
|
opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
|
47
52
|
options[:excluded_hpo] = excluded_hpo
|
data/bin/evidence_profiler.rb
CHANGED
@@ -12,13 +12,24 @@ require 'pets'
|
|
12
12
|
#############################################################################################
|
13
13
|
## METHODS
|
14
14
|
############################################################################################
|
15
|
+
def load_pathogenic_scores(path)
|
16
|
+
scores = {}
|
17
|
+
File.open(path).each do |line|
|
18
|
+
feature, score = line.split("\t")
|
19
|
+
scores[feature] = score.to_f
|
20
|
+
end
|
21
|
+
return scores
|
22
|
+
end
|
23
|
+
|
15
24
|
def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
25
|
+
coords = nil
|
16
26
|
all_coordinates = genomic_coordinates[entity]
|
17
|
-
coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)}
|
27
|
+
coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)} if !all_coordinates.nil?
|
18
28
|
return coords
|
19
29
|
end
|
20
30
|
|
21
|
-
def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs,
|
31
|
+
def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs,
|
32
|
+
evidences, prof_vars, hotspots_with_pat_vars, template, output)
|
22
33
|
var_ids, var_coors = format_variants4report(prof_vars)
|
23
34
|
container = {
|
24
35
|
profile_id: profile_id,
|
@@ -27,7 +38,8 @@ def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_
|
|
27
38
|
similarity_matrixs: similarity_matrixs,
|
28
39
|
evidences: evidences,
|
29
40
|
var_ids: var_ids,
|
30
|
-
var_coordinates: var_coors
|
41
|
+
var_coordinates: var_coors,
|
42
|
+
hotspot_table: hotspots_with_pat_vars
|
31
43
|
}
|
32
44
|
report = Report_html.new(container, 'Evidence profile report')
|
33
45
|
report.build(template)
|
@@ -51,12 +63,39 @@ def format_variants4report(var_data)
|
|
51
63
|
return var_ids, var_coors
|
52
64
|
end
|
53
65
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
66
|
+
|
67
|
+
def generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
|
68
|
+
hotspots_with_pat_vars = []
|
69
|
+
if !prof_vars.nil?
|
70
|
+
phen_regions = Genomic_Feature.hash2genomic_feature(all_genomic_coordinates){|k, v| v[0..2].concat([k])}
|
71
|
+
phen_candidates_by_hotspot, phen_genome_hotspots = phen_regions.generate_cluster_regions(:reg_overlap, 'A', 0, true)
|
72
|
+
genome_matches = phen_genome_hotspots.match(prof_vars)
|
73
|
+
hotspot_with_phen_candidates = invert_hash(phen_candidates_by_hotspot)
|
74
|
+
genome_matches.each do |hotspot_id, pat_vars|
|
75
|
+
reg = phen_genome_hotspots.region_by_to(hotspot_id)
|
76
|
+
coords = [reg[:chr], reg[:start], reg[:stop]]
|
77
|
+
hotspots_with_pat_vars << [hotspot_id, coords, hotspot_with_phen_candidates[hotspot_id], pat_vars]
|
78
|
+
end
|
79
|
+
# TODO: see to use original similarities without use top candidates in similarity_matrixs
|
80
|
+
# TODO: COMPLETE UNTIL FULL PREDICTOR
|
81
|
+
end
|
82
|
+
return hotspots_with_pat_vars
|
58
83
|
end
|
59
84
|
|
85
|
+
def invert_hash(h)
|
86
|
+
new_h = {}
|
87
|
+
h.each do |k, vals|
|
88
|
+
vals.each do |v|
|
89
|
+
query = new_h[v]
|
90
|
+
if query.nil?
|
91
|
+
new_h[v] = [k]
|
92
|
+
else
|
93
|
+
query << k
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
return new_h
|
98
|
+
end
|
60
99
|
|
61
100
|
#############################################################################################
|
62
101
|
## OPTPARSE
|
@@ -96,6 +135,11 @@ OptionParser.new do |opts|
|
|
96
135
|
options[:variant_data] = item
|
97
136
|
end
|
98
137
|
|
138
|
+
options[:pathogenic_scores] = nil # TODO: Generalize to a folder with a table per patient
|
139
|
+
opts.on("-P", "--pathogenic_scores PATH", 'File with genome features an their pathogenic scores') do |item|
|
140
|
+
options[:pathogenic_scores] = item
|
141
|
+
end
|
142
|
+
|
99
143
|
opts.on_tail("-h", "--help", "Show this message") do
|
100
144
|
puts opts
|
101
145
|
exit
|
@@ -108,12 +152,12 @@ end.parse!
|
|
108
152
|
############################################################################################
|
109
153
|
|
110
154
|
hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
|
111
|
-
hpo = Ontology.new
|
112
|
-
hpo.read(hpo_file)
|
155
|
+
hpo = Ontology.new(file: hpo_file, load_file: true)
|
113
156
|
|
114
157
|
profiles = load_profiles(options[:profiles_file], hpo)
|
115
158
|
profile_variants = options[:variant_data].nil? ? {} : load_variants(options[:variant_data])
|
116
159
|
evidences, genomic_coordinates = load_evidences(options[:evidences], hpo)
|
160
|
+
pathogenic_scores = options[:pathogenic_scores].nil? ? {} : load_pathogenic_scores(options[:pathogenic_scores])
|
117
161
|
|
118
162
|
hpo.load_profiles(profiles)
|
119
163
|
evidences_similarity = {}
|
@@ -122,7 +166,8 @@ evidences.each do |pair, data|
|
|
122
166
|
if profile_type == 'HP'
|
123
167
|
evidence_profiles = data[:prof]
|
124
168
|
evidence_profiles.transform_keys!{|prof_id, terms| prof_id.to_sym}
|
125
|
-
|
169
|
+
similarities = hpo.compare_profiles(external_profiles: evidence_profiles, sim_type: :lin, bidirectional: false)
|
170
|
+
evidences_similarity[pair] = similarities if !similarities.empty?
|
126
171
|
end
|
127
172
|
end
|
128
173
|
|
@@ -136,13 +181,33 @@ profiles.each do |profile_id, reference_prof|
|
|
136
181
|
entity = pair.split('_').first
|
137
182
|
similarities = ev_profiles_similarity[profile_id.to_sym]
|
138
183
|
candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(reference_prof, similarities, evidences[pair][:prof], hpo, 40, 40)
|
139
|
-
|
184
|
+
coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
185
|
+
candidate_sim_matrix.unshift(['HP'] + candidates_ids)
|
186
|
+
if !pathogenic_scores.empty? # priorize by pathogenic scores
|
187
|
+
candidate_sim_matrix_patho, candidates_patho, candidates_ids_patho = get_similarity_matrix(
|
188
|
+
reference_prof, similarities,
|
189
|
+
evidences[pair][:prof], hpo, 40, 40,
|
190
|
+
other_scores = pathogenic_scores, id2label = evidences[pair][:id2lab])
|
191
|
+
if !candidate_sim_matrix_patho.empty?
|
192
|
+
candidate_sim_matrix_patho.unshift(['HP'] + candidates_ids_patho)
|
193
|
+
similarity_matrixs[pair + '_path_vars'] = candidate_sim_matrix_patho
|
194
|
+
evidences[pair + '_path_vars'] = evidences[pair]
|
195
|
+
end
|
196
|
+
end
|
197
|
+
next if coords.nil?
|
140
198
|
all_candidates.concat(candidates)
|
141
199
|
similarity_matrixs[pair] = candidate_sim_matrix
|
142
|
-
coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
143
200
|
all_genomic_coordinates.merge!(coords)
|
144
201
|
end
|
145
|
-
get_genome_hotspots(similarity_matrixs, all_genomic_coordinates)
|
146
202
|
prof_vars = profile_variants[profile_id]
|
147
|
-
|
203
|
+
hotspots_with_pat_vars = generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
|
204
|
+
make_report(
|
205
|
+
profile_id,
|
206
|
+
all_candidates,
|
207
|
+
all_genomic_coordinates,
|
208
|
+
similarity_matrixs,
|
209
|
+
evidences, prof_vars,
|
210
|
+
hotspots_with_pat_vars,
|
211
|
+
template, options[:output_folder]
|
212
|
+
)
|
148
213
|
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
ROOT_PATH = File.dirname(__FILE__)
|
4
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
5
|
+
|
6
|
+
require 'optparse'
|
7
|
+
require 'pets'
|
8
|
+
|
9
|
+
##################################
|
10
|
+
## METHODS
|
11
|
+
##################################
|
12
|
+
|
13
|
+
def get_data(options)
|
14
|
+
fields2extract = get_fields2extract(options)
|
15
|
+
field_numbers = fields2extract.values
|
16
|
+
records = read_records(options, fields2extract, field_numbers)
|
17
|
+
end
|
18
|
+
|
19
|
+
def read_records(options, fields2extract, field_numbers) # Modified from cohort_parset
|
20
|
+
records = []
|
21
|
+
count = 0
|
22
|
+
File.open(options[:input_file]).each do |line|
|
23
|
+
line.chomp!
|
24
|
+
if options[:header] && count == 0
|
25
|
+
line.gsub!(/#\s*/,'') # correct comment like headers
|
26
|
+
field_names = line.split("\t")
|
27
|
+
get_field_numbers2extract(field_names, fields2extract)
|
28
|
+
field_numbers = fields2extract.values
|
29
|
+
else
|
30
|
+
fields = line.split("\t")
|
31
|
+
record = field_numbers.map{|n| fields[n]}
|
32
|
+
if fields2extract[:id_col].nil?
|
33
|
+
id = "rec_#{count}" #generate ids
|
34
|
+
else
|
35
|
+
id = record.shift
|
36
|
+
end
|
37
|
+
record[1] = record[1].to_i
|
38
|
+
record[2] = record[2].to_i
|
39
|
+
record << id
|
40
|
+
records << record
|
41
|
+
end
|
42
|
+
count +=1
|
43
|
+
end
|
44
|
+
return records
|
45
|
+
end
|
46
|
+
|
47
|
+
def get_fields2extract(options)
|
48
|
+
fields2extract = {}
|
49
|
+
[:id_col, :chromosome_col, :start_col, :end_col].each do |field|
|
50
|
+
col = options[field]
|
51
|
+
if !col.nil?
|
52
|
+
col = col.to_i if !options[:header]
|
53
|
+
fields2extract[field] = col
|
54
|
+
end
|
55
|
+
end
|
56
|
+
return fields2extract
|
57
|
+
end
|
58
|
+
|
59
|
+
def get_field_numbers2extract(field_names, fields2extract)
|
60
|
+
fields2extract.each do |field, name|
|
61
|
+
fields2extract[field] = field_names.index(name)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
##########################
|
66
|
+
#OPT-PARSER
|
67
|
+
##########################
|
68
|
+
|
69
|
+
options = {}
|
70
|
+
OptionParser.new do |opts|
|
71
|
+
opts.banner = "Usage: #{__FILE__} [options]"
|
72
|
+
|
73
|
+
options[:chromosome_col] = nil
|
74
|
+
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
|
75
|
+
options[:chromosome_col] = data
|
76
|
+
end
|
77
|
+
|
78
|
+
options[:id_col] = nil
|
79
|
+
opts.on("-d", "--id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the id") do |data|
|
80
|
+
options[:id_col] = data
|
81
|
+
end
|
82
|
+
|
83
|
+
options[:end_col] = nil
|
84
|
+
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
|
85
|
+
options[:end_col] = data
|
86
|
+
end
|
87
|
+
|
88
|
+
options[:header] = true
|
89
|
+
#chr\tstart\tstop
|
90
|
+
opts.on("-H", "--header", "Set if the file has a line header. Default true") do
|
91
|
+
options[:header] = false
|
92
|
+
end
|
93
|
+
|
94
|
+
options[:input_file] = nil
|
95
|
+
opts.on("-i", "--input_file PATH", "Input file path") do |data|
|
96
|
+
options[:input_file] = data
|
97
|
+
end
|
98
|
+
|
99
|
+
options[:reference_file] = nil
|
100
|
+
opts.on("-r", "--reference_file PATH", "Reference file with genome annotation") do |data|
|
101
|
+
options[:reference_file] = data
|
102
|
+
end
|
103
|
+
|
104
|
+
options[:output_file] = nil
|
105
|
+
opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
|
106
|
+
options[:output_file] = data
|
107
|
+
end
|
108
|
+
|
109
|
+
options[:start_col] = nil
|
110
|
+
opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
|
111
|
+
options[:start_col] = data
|
112
|
+
end
|
113
|
+
|
114
|
+
options[:feature_type] = nil
|
115
|
+
opts.on("-t", "--feature_type STRING", "Keep features from reference whose are tagged with this feature type") do |data|
|
116
|
+
options[:feature_type] = data
|
117
|
+
end
|
118
|
+
|
119
|
+
options[:feature_name] = nil
|
120
|
+
opts.on("-n", "--feature_name STRING", "Use this feature id that is present in attributes/annotation field of reference") do |data|
|
121
|
+
options[:feature_name] = data
|
122
|
+
end
|
123
|
+
|
124
|
+
opts.on_tail("-h", "--help", "Show this message") do
|
125
|
+
puts opts
|
126
|
+
exit
|
127
|
+
end
|
128
|
+
|
129
|
+
end.parse!
|
130
|
+
|
131
|
+
regions = Genomic_Feature.new(get_data(options))
|
132
|
+
Genomic_Feature.add_reference(
|
133
|
+
Reference_parser.load(
|
134
|
+
options[:reference_file],
|
135
|
+
feature_type: options[:feature_type]
|
136
|
+
)
|
137
|
+
)
|
138
|
+
gene_features = regions.get_features(attr_type: options[:feature_name])
|
139
|
+
|
140
|
+
File.open(options[:output_file], 'w') do |f|
|
141
|
+
gene_features.each do |id, feat_ids|
|
142
|
+
feat_ids.each do |ft_id|
|
143
|
+
f.puts "#{id}\t#{ft_id}"
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
data/bin/install_deps.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
|
3
3
|
ROOT_PATH = File.dirname(__FILE__)
|
4
|
-
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code')
|
4
|
+
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
|
5
5
|
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
6
|
+
require 'pets'
|
6
7
|
|
7
|
-
system_call(EXTERNAL_CODE, 'install_R_dependencies.R')
|
8
|
+
system_call(EXTERNAL_CODE, 'install_R_dependencies.R', '')
|
data/bin/profiles2phenopacket.rb
CHANGED
@@ -27,31 +27,7 @@ options = {}
|
|
27
27
|
OptionParser.new do |opts|
|
28
28
|
opts.banner = "Usage: #{__FILE__} [options]"
|
29
29
|
|
30
|
-
|
31
|
-
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
|
32
|
-
options[:chromosome_col] = data
|
33
|
-
end
|
34
|
-
|
35
|
-
options[:id_col] = nil
|
36
|
-
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
|
37
|
-
options[:id_col] = data
|
38
|
-
end
|
39
|
-
|
40
|
-
options[:end_col] = nil
|
41
|
-
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
|
42
|
-
options[:end_col] = data
|
43
|
-
end
|
44
|
-
|
45
|
-
options[:genome_assembly] = 'hg38'
|
46
|
-
opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
|
47
|
-
options[:genome_assembly] = data
|
48
|
-
end
|
49
|
-
|
50
|
-
options[:header] = true
|
51
|
-
#chr\tstart\tstop
|
52
|
-
opts.on("-H", "--header", "Set if the file has a line header. Default true") do
|
53
|
-
options[:header] = false
|
54
|
-
end
|
30
|
+
eval(File.open(COMMON_OPTPARSE).read)
|
55
31
|
|
56
32
|
options[:input_file] = nil
|
57
33
|
opts.on("-i", "--input_file PATH", "Input file with patient data") do |data|
|
@@ -7,5 +7,10 @@ print("Installing libraries from CRAN")
|
|
7
7
|
packages_list <-c("optparse","RcppCNPy","ggplot2","fastcluster","dplyr","gplots","RColorBrewer","tidyr","data.table","gridExtra", "dynamicTreeCut", "ggExtra", "ontologyIndex", "magrittr")
|
8
8
|
installed <- library()$results[,1]
|
9
9
|
packages_list <- setdiff(packages_list, installed)
|
10
|
-
|
10
|
+
if(length(packages_list) == 0){
|
11
|
+
print('All needed packages are installed')
|
12
|
+
}else{
|
13
|
+
install.packages(packages_list, repos='https://cloud.r-project.org')
|
14
|
+
}
|
15
|
+
|
11
16
|
|
@@ -62,7 +62,8 @@ def get_profile_ic(hpo_names, phenotype_ic)
|
|
62
62
|
profile_length = 0
|
63
63
|
hpo_names.each do |hpo_id|
|
64
64
|
hpo_ic = phenotype_ic[hpo_id]
|
65
|
-
ic
|
65
|
+
raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?
|
66
|
+
ic += hpo_ic
|
66
67
|
profile_length += 1
|
67
68
|
end
|
68
69
|
profile_length = 1 if profile_length == 0
|
@@ -187,7 +188,18 @@ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
|
187
188
|
profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
|
188
189
|
clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
|
189
190
|
if !File.exists?(matrix_filename)
|
190
|
-
|
191
|
+
if reference_profiles.nil?
|
192
|
+
profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
|
193
|
+
else # AS reference profiles are constant, the sematic comparation will be A => B (A reference). So, we have to invert the elements to perform the comparation
|
194
|
+
ont = Cohort.get_ontology(:hpo)
|
195
|
+
pat_profiles = ont.profiles
|
196
|
+
ont.load_profiles(reference_profiles, reset_stored: true)
|
197
|
+
profiles_similarity = ont.compare_profiles(sim_type: method_name.to_sym,
|
198
|
+
external_profiles: pat_profiles,
|
199
|
+
bidirectional: false)
|
200
|
+
ont.load_profiles(pat_profiles, reset_stored: true)
|
201
|
+
profiles_similarity = invert_nested_hash(profiles_similarity)
|
202
|
+
end
|
191
203
|
remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
|
192
204
|
write_profile_pairs(profiles_similarity, profiles_similarity_filename)
|
193
205
|
if reference_profiles.nil?
|
@@ -219,13 +231,30 @@ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
|
219
231
|
write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
|
220
232
|
out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
|
221
233
|
system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
|
234
|
+
sim_mat4cluster = {}
|
235
|
+
if options[:detailed_clusters]
|
236
|
+
clusters_codes.each do |cluster|
|
237
|
+
cluster_cohort = Cohort.new
|
238
|
+
clID, patient_number, patient_ids, hpo_codes = cluster
|
239
|
+
patient_ids.each_with_index {|patID, i| cluster_cohort.add_record([patID, hpo_codes[i], []])}
|
240
|
+
cluster_profiles = cluster_cohort.profiles
|
241
|
+
ref_profile = cluster_cohort.get_general_profile
|
242
|
+
hpo.load_profiles({ref: ref_profile}, reset_stored: true)
|
243
|
+
similarities = hpo.compare_profiles(external_profiles: cluster_profiles, sim_type: :lin, bidirectional: false)
|
244
|
+
candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(ref_profile, similarities[:ref], cluster_profiles, hpo, 100, 100)
|
245
|
+
candidate_sim_matrix.unshift(['HP'] + candidates_ids)
|
246
|
+
sim_mat4cluster[clID] = candidate_sim_matrix
|
247
|
+
end
|
248
|
+
end
|
249
|
+
|
250
|
+
|
222
251
|
clusters = translate_codes(clusters_codes, hpo)
|
223
|
-
|
224
252
|
container = {
|
225
253
|
:temp_folder => temp_folder,
|
226
254
|
:cluster_name => method_name,
|
227
255
|
:clusters => clusters,
|
228
|
-
:hpo => hpo
|
256
|
+
:hpo => hpo,
|
257
|
+
:sim_mat4cluster => sim_mat4cluster
|
229
258
|
}
|
230
259
|
|
231
260
|
report = Report_html.new(container, 'Patient clusters report')
|
@@ -235,6 +264,23 @@ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
|
235
264
|
end
|
236
265
|
end
|
237
266
|
|
267
|
+
def invert_nested_hash(h)
|
268
|
+
new_h = {}
|
269
|
+
h.each do |k1, vals1|
|
270
|
+
vals1.each do |v1|
|
271
|
+
vals1.each do |k2, vals2|
|
272
|
+
query = new_h[k2]
|
273
|
+
if query.nil?
|
274
|
+
new_h[k2] = {k1 => vals2}
|
275
|
+
else
|
276
|
+
query[k1] = vals2
|
277
|
+
end
|
278
|
+
end
|
279
|
+
end
|
280
|
+
end
|
281
|
+
return new_h
|
282
|
+
end
|
283
|
+
|
238
284
|
def get_cluster_metadata(clusters_info)
|
239
285
|
average_hp_per_pat_distribution = []
|
240
286
|
clusters_info.each do |cl_id, pat_info|
|
data/lib/pets/cohort.rb
CHANGED
@@ -24,11 +24,6 @@ class Cohort
|
|
24
24
|
else
|
25
25
|
ont = Ontology.new
|
26
26
|
ont.read(ont_file)
|
27
|
-
if !excluded_terms_file.nil?
|
28
|
-
ont.add_removable_terms(read_excluded_ont_file(excluded_terms_file))
|
29
|
-
ont.remove_removable()
|
30
|
-
ont.build_index()
|
31
|
-
end
|
32
27
|
end
|
33
28
|
@@ont[ont_name] = ont
|
34
29
|
end
|
@@ -44,12 +39,14 @@ class Cohort
|
|
44
39
|
def initialize()
|
45
40
|
@profiles = {}
|
46
41
|
@vars = {}
|
42
|
+
@extra_attr = {}
|
47
43
|
@var_idx = Genomic_Feature.new([])
|
48
44
|
end
|
49
45
|
|
50
|
-
def add_record(rec) #[id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
|
46
|
+
def add_record(rec, extra_attr = nil) #[id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
|
51
47
|
id, profile, vars = rec
|
52
48
|
@profiles[id] = profile.map{|t| t.to_sym} if !profile.nil?
|
49
|
+
@extra_attr[id] = extra_attr if !extra_attr.nil?
|
53
50
|
add_gen_feat(id, vars) if !vars.nil?
|
54
51
|
end
|
55
52
|
|
@@ -111,7 +108,7 @@ class Cohort
|
|
111
108
|
term_count = Hash.new(0)
|
112
109
|
each_profile do |id, prof|
|
113
110
|
prof.each do |term|
|
114
|
-
|
111
|
+
term_count[term] += 1
|
115
112
|
end
|
116
113
|
end
|
117
114
|
records = @profiles.length
|
@@ -271,7 +268,12 @@ class Cohort
|
|
271
268
|
|
272
269
|
@profiles.each do |id, terms|
|
273
270
|
phenopacket = {metaData: metaData}
|
274
|
-
|
271
|
+
query_sex = @extra_attr.dig(id, :sex)
|
272
|
+
sex = query_sex.nil? ? 'UNKNOWN_SEX' : query_sex
|
273
|
+
phenopacket[:subject] = {
|
274
|
+
id: id,
|
275
|
+
sex: sex
|
276
|
+
}
|
275
277
|
phenotypicFeatures = []
|
276
278
|
terms.each do |term|
|
277
279
|
term_name = ont.translate_id(term)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
options[:chromosome_col] = nil
|
2
|
+
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
|
3
|
+
options[:chromosome_col] = data
|
4
|
+
end
|
5
|
+
|
6
|
+
options[:id_col] = nil
|
7
|
+
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
|
8
|
+
options[:id_col] = data
|
9
|
+
end
|
10
|
+
|
11
|
+
options[:end_col] = nil
|
12
|
+
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
|
13
|
+
options[:end_col] = data
|
14
|
+
end
|
15
|
+
|
16
|
+
options[:genome_assembly] = 'hg38'
|
17
|
+
opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
|
18
|
+
options[:genome_assembly] = data
|
19
|
+
end
|
20
|
+
|
21
|
+
options[:header] = true
|
22
|
+
#chr\tstart\tstop
|
23
|
+
opts.on("-H", "--header", "Set if the file has a line header. Default true") do
|
24
|
+
options[:header] = false
|
25
|
+
end
|
26
|
+
|
27
|
+
options[:sex_col] = nil
|
28
|
+
opts.on("-x", "--sex_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient sex") do |data|
|
29
|
+
options[:sex_col] = data
|
30
|
+
end
|
data/lib/pets/constants.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
# Needs define ROOT_PATH constant in file requiring this file
|
1
|
+
# Needs define ROOT_PATH constant in file requiring this file
|
2
|
+
COMMON_OPTPARSE = File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets', 'common_optparse.rb'))
|
2
3
|
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
|
3
4
|
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
|
4
5
|
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
|
data/lib/pets/generalMethods.rb
CHANGED
@@ -243,8 +243,27 @@ def get_detailed_similarity(profile, candidates, evidences, hpo)
|
|
243
243
|
return matrix
|
244
244
|
end
|
245
245
|
|
246
|
-
def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit)
|
247
|
-
candidates = similarities.to_a
|
246
|
+
def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {})
|
247
|
+
candidates = similarities.to_a
|
248
|
+
if other_scores.empty?
|
249
|
+
candidates.sort!{|s1, s2| s2.last <=> s1.last}
|
250
|
+
candidates = candidates.first(candidate_limit)
|
251
|
+
else # Prioritize first by the external list of scores, select the candidates and then rioritize by similarities
|
252
|
+
selected_candidates = []
|
253
|
+
candidates.each do |cand|
|
254
|
+
cand_id = cand[0]
|
255
|
+
cand_lab = id2label[cand_id.to_s]
|
256
|
+
next if cand_lab.nil?
|
257
|
+
other_score = other_scores[cand_lab]
|
258
|
+
next if other_score.nil?
|
259
|
+
cand << other_score
|
260
|
+
selected_candidates << cand
|
261
|
+
end
|
262
|
+
selected_candidates.sort!{|e1, e2| e2[2] <=> e1[2]}
|
263
|
+
candidates = selected_candidates.first(candidate_limit)
|
264
|
+
candidates.sort!{|e1, e2| e2[1] <=> e1[1]}
|
265
|
+
candidates.each{|c| c.pop}
|
266
|
+
end
|
248
267
|
candidates_ids = candidates.map{|c| c.first}
|
249
268
|
candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
|
250
269
|
candidate_similarity_matrix.each_with_index do |row, i|
|
@@ -1,23 +1,59 @@
|
|
1
1
|
class Genomic_Feature
|
2
|
+
@@ref = nil
|
3
|
+
|
4
|
+
def self.array2genomic_feature(arr)
|
5
|
+
new(arr.map{|r| yield(r)})
|
6
|
+
end
|
7
|
+
|
8
|
+
def self.hash2genomic_feature(h)
|
9
|
+
vars = []
|
10
|
+
h.each do |h, v|
|
11
|
+
vars << yield(h, v)
|
12
|
+
end
|
13
|
+
new(vars)
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.add_reference(genomic_regions)
|
17
|
+
@@ref = genomic_regions
|
18
|
+
end
|
19
|
+
|
2
20
|
#If any method use gen_fet as name is a Genomic_Feature object
|
3
|
-
def initialize(feat_array) # [[chr1, start1, stop1],[chr1, start1, stop1]]
|
21
|
+
def initialize(feat_array, annotations: nil) # [[chr1, start1, stop1],[chr1, start1, stop1]]
|
4
22
|
@regions = {}
|
23
|
+
@reg_by_to = {}
|
5
24
|
@reg_id = -1
|
6
25
|
load_features(feat_array)
|
26
|
+
load_annotations(annotations) if !annotations.nil?
|
7
27
|
end
|
8
28
|
|
9
29
|
def load_features(feat_array)
|
10
|
-
feat_array.each do |chr, start, stop|
|
30
|
+
feat_array.each do |chr, start, stop, to|
|
11
31
|
chr = chr.to_sym
|
12
|
-
|
32
|
+
@reg_id +=1
|
33
|
+
id = to.nil? ? @reg_id : to
|
34
|
+
region = {chr: chr, start: start, stop: stop, to: id }
|
35
|
+
@reg_by_to[id] = region
|
13
36
|
add_record(@regions, chr, region)
|
14
37
|
end
|
15
38
|
end
|
16
39
|
|
40
|
+
def load_annotations(annotations)
|
41
|
+
each do |chr, reg|
|
42
|
+
annot = annotations[reg[:to]]
|
43
|
+
reg[:attrs] = annot if !annot.nil?
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
17
47
|
def length
|
18
48
|
return @regions.length
|
19
49
|
end
|
20
50
|
|
51
|
+
def each_chr()
|
52
|
+
@regions.each do |chr, regs|
|
53
|
+
yield(chr, regs)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
21
57
|
def each()
|
22
58
|
@regions.each do |chr, regs|
|
23
59
|
regs.each do |region|
|
@@ -30,6 +66,14 @@ class Genomic_Feature
|
|
30
66
|
return @regions.keys
|
31
67
|
end
|
32
68
|
|
69
|
+
def get_chr_regs(chr)
|
70
|
+
return @regions[chr]
|
71
|
+
end
|
72
|
+
|
73
|
+
def region_by_to(to)
|
74
|
+
return @reg_by_to[to]
|
75
|
+
end
|
76
|
+
|
33
77
|
def get_sizes
|
34
78
|
sizes = []
|
35
79
|
each do |chr, region|
|
@@ -39,6 +83,35 @@ class Genomic_Feature
|
|
39
83
|
return sizes
|
40
84
|
end
|
41
85
|
|
86
|
+
def get_features(attr_type: nil)
|
87
|
+
features = match(@@ref)
|
88
|
+
if !attr_type.nil?
|
89
|
+
features.each do |reg_id, feat_ids|
|
90
|
+
new_feat_ids = feat_ids.map{|fi| @@ref.region_by_to(fi).dig(:attrs, attr_type)}
|
91
|
+
features[reg_id] = new_feat_ids.compact.uniq
|
92
|
+
end
|
93
|
+
end
|
94
|
+
return features
|
95
|
+
end
|
96
|
+
|
97
|
+
def match(other_gen_feat)
|
98
|
+
all_matches = {}
|
99
|
+
each_chr do |chr, regs|
|
100
|
+
other_regs = other_gen_feat.get_chr_regs(chr)
|
101
|
+
next if other_regs.nil?
|
102
|
+
regs.each do |reg|
|
103
|
+
local_matches = []
|
104
|
+
start = reg[:start]
|
105
|
+
stop = reg[:stop]
|
106
|
+
other_regs.each do |other_reg|
|
107
|
+
local_matches << other_reg[:to] if coor_overlap?(start, stop, other_reg)
|
108
|
+
end
|
109
|
+
all_matches[reg[:to]] = local_matches if !local_matches.empty?
|
110
|
+
end
|
111
|
+
end
|
112
|
+
return all_matches
|
113
|
+
end
|
114
|
+
|
42
115
|
def get_summary_sizes
|
43
116
|
sizes = Hash.new(0)
|
44
117
|
each do |chr, region|
|
@@ -60,21 +133,21 @@ class Genomic_Feature
|
|
60
133
|
reference.each do |start, stop|
|
61
134
|
reg_ids = []
|
62
135
|
genomic_ranges.each do |reg|
|
63
|
-
|
136
|
+
overlap = coor_overlap?(start, stop, reg)
|
137
|
+
reg_ids << reg[:to] if overlap
|
64
138
|
end
|
65
139
|
overlaps << reg_ids.uniq
|
66
140
|
end
|
67
141
|
return overlaps
|
68
142
|
end
|
69
143
|
|
70
|
-
def generate_cluster_regions(meth, tag, ids_per_reg = 1)
|
144
|
+
def generate_cluster_regions(meth, tag, ids_per_reg = 1, obj = false)
|
71
145
|
compute_windows(meth) # Get putative genome windows
|
72
|
-
patients_out_of_cluster = 0
|
73
146
|
ids_by_cluster = {}
|
74
147
|
annotated_full_ref = [] # All reference windows wit uniq id and chr tagged
|
75
148
|
@regions.each do |chr, regs|
|
76
149
|
reference = @windows[chr]
|
77
|
-
overlaps = get_reference_overlaps(regs, reference)
|
150
|
+
overlaps = get_reference_overlaps(regs, reference)
|
78
151
|
clust_numb = 0
|
79
152
|
reference.each_with_index do |ref, i|
|
80
153
|
current_ids = overlaps[i]
|
@@ -87,6 +160,7 @@ class Genomic_Feature
|
|
87
160
|
end
|
88
161
|
end
|
89
162
|
end
|
163
|
+
annotated_full_ref = Genomic_Feature.array2genomic_feature(annotated_full_ref){|r| [r[2], r[0], r[1], r[3]]} if obj
|
90
164
|
return ids_by_cluster, annotated_full_ref
|
91
165
|
end
|
92
166
|
|
@@ -116,15 +190,37 @@ class Genomic_Feature
|
|
116
190
|
|
117
191
|
def compute_region_overlap_windows(genomic_ranges)
|
118
192
|
reference = []
|
119
|
-
|
120
|
-
|
193
|
+
single_nt = []
|
194
|
+
genomic_ranges.each do |gr|
|
195
|
+
start = gr[:start]
|
196
|
+
stop = gr[:stop]
|
197
|
+
if stop - start > 0
|
198
|
+
reference << start # get start
|
199
|
+
reference << stop # get stop
|
200
|
+
else # Build a window of at least one nt for snv
|
201
|
+
single_nt << start
|
202
|
+
end
|
203
|
+
end
|
121
204
|
reference.uniq!
|
205
|
+
single_nt.each do |snt| # add start stop for snv
|
206
|
+
reference << snt
|
207
|
+
reference << snt
|
208
|
+
end
|
122
209
|
reference.sort!
|
123
210
|
#Define overlap ranges
|
124
211
|
final_reference = []
|
212
|
+
last_len = 1
|
125
213
|
reference.each_with_index do |coord,i|
|
126
214
|
next_coord = reference[i + 1]
|
127
|
-
|
215
|
+
if !next_coord.nil?
|
216
|
+
current_len = next_coord - coord
|
217
|
+
coord = coord + 1 if last_len == 0 # Separate SNV window from others
|
218
|
+
if current_len == 0 && last_len > 0 && !final_reference.empty?
|
219
|
+
final_reference.last[1] -= 1 # Separate SNV window from others
|
220
|
+
end
|
221
|
+
final_reference << [coord, next_coord]
|
222
|
+
last_len = current_len
|
223
|
+
end
|
128
224
|
end
|
129
225
|
return final_reference
|
130
226
|
end
|
data/lib/pets/io.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'csv'
|
2
|
+
require 'bio-vcf'
|
2
3
|
|
3
4
|
def load_hpo_ontology(hpo_file, excluded_hpo_file)
|
4
5
|
hpo = nil
|
@@ -201,20 +202,39 @@ end
|
|
201
202
|
|
202
203
|
def load_variants(variant_folder)
|
203
204
|
variants = {}
|
204
|
-
Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
|
205
|
-
profile_id = File.basename(path,
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
start = fields[1].to_i
|
211
|
-
vars << [chr, start, start]
|
205
|
+
Dir.glob(File.join(variant_folder, '*.{tab,vcf,vcf.gz}')).each do |path|
|
206
|
+
profile_id, ext = File.basename(path).split(".", 2)
|
207
|
+
if ext == 'tab' || ext == 'txt'
|
208
|
+
vars = load_tabular_vars(path)
|
209
|
+
elsif ext == 'vcf' || ext == 'vcf.gz'
|
210
|
+
vars = load_vcf(path, ext)
|
212
211
|
end
|
213
212
|
variants[profile_id] = Genomic_Feature.new(vars)
|
214
213
|
end
|
215
214
|
return variants
|
216
215
|
end
|
217
216
|
|
217
|
+
def load_tabular_vars(path)
|
218
|
+
vars = []
|
219
|
+
File.open(path).each do |line|
|
220
|
+
fields = line.chomp.split("\t")
|
221
|
+
chr = fields[0].gsub('chr','')
|
222
|
+
start = fields[1].to_i
|
223
|
+
vars << [chr, start, start]
|
224
|
+
end
|
225
|
+
return vars
|
226
|
+
end
|
227
|
+
|
228
|
+
def load_vcf(path, ext) # Some compressed files are fragmented internally. If so, VCFfile only reads first fragment
|
229
|
+
vars = [] # Use zcat original.vcf.gz | gzip > new.vcf.gz to obtain a contigous file
|
230
|
+
vcf = BioVcf::VCFfile.new(file: path, is_gz: ext == 'vcf.gz' ? true : false )
|
231
|
+
vcf.each do |var|
|
232
|
+
vars << [var.chrom.gsub('chr',''), var.pos, var.pos]
|
233
|
+
end
|
234
|
+
puts vars.length
|
235
|
+
return vars
|
236
|
+
end
|
237
|
+
|
218
238
|
def load_evidences(evidences_path, hpo)
|
219
239
|
genomic_coordinates = {}
|
220
240
|
coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
|
@@ -242,6 +262,10 @@ def load_coordinates(file_path)
|
|
242
262
|
header = false
|
243
263
|
else
|
244
264
|
entity, chr, strand, start, stop = fields
|
265
|
+
if chr == 'NA'
|
266
|
+
STDERR.puts "Warning: Record #{fields.inspect} is undefined"
|
267
|
+
next
|
268
|
+
end
|
245
269
|
coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
|
246
270
|
end
|
247
271
|
end
|
@@ -3,6 +3,7 @@ class Cohort_Parser
|
|
3
3
|
fields2extract = get_fields2extract(options)
|
4
4
|
field_numbers = fields2extract.values
|
5
5
|
records = read_records(options, fields2extract, field_numbers)
|
6
|
+
options[:extracted_fields] = fields2extract.keys
|
6
7
|
cohort, rejected_terms, rejected_recs = create_cohort(records, options)
|
7
8
|
return cohort, rejected_terms, rejected_recs
|
8
9
|
end
|
@@ -46,7 +47,7 @@ class Cohort_Parser
|
|
46
47
|
|
47
48
|
def self.get_fields2extract(options)
|
48
49
|
fields2extract = {}
|
49
|
-
[:id_col, :ont_col, :chromosome_col, :start_col, :end_col].each do |field|
|
50
|
+
[:id_col, :ont_col, :chromosome_col, :start_col, :end_col, :sex_col].each do |field|
|
50
51
|
col = options[field]
|
51
52
|
if !col.nil?
|
52
53
|
col = col.to_i if !options[:header]
|
@@ -70,7 +71,7 @@ class Cohort_Parser
|
|
70
71
|
records.each do |id, record|
|
71
72
|
rec = record.first
|
72
73
|
terms = rec.first
|
73
|
-
if options[:names]
|
74
|
+
if options[:names] # Translate hpo names 2 codes
|
74
75
|
init_term_number = terms.length
|
75
76
|
terms, rec_rejected_terms = ont.translate_names(terms)
|
76
77
|
if !rec_rejected_terms.empty?
|
@@ -87,7 +88,11 @@ class Cohort_Parser
|
|
87
88
|
else
|
88
89
|
variants = [] # Not exists genomic region attributes so we create a empty array
|
89
90
|
end
|
90
|
-
|
91
|
+
other_attr = {}
|
92
|
+
if options[:extracted_fields].include?(:sex_col) # Check for additional attributes. -1 is applied to ignore :id in extracted fields
|
93
|
+
other_attr[:sex] = record.first[options[:extracted_fields].index(:sex_col) -1]
|
94
|
+
end
|
95
|
+
cohort.add_record([id, terms, check_variants(variants)], other_attr)
|
91
96
|
end
|
92
97
|
return cohort, rejected_terms.uniq, rejected_recs
|
93
98
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'genomic_features'
|
2
|
+
class Reference_parser
|
3
|
+
|
4
|
+
def self.load(file_path, file_format: nil, feature_type: nil)
|
5
|
+
file_format = file_path.split('.', 2).last if file_format.nil?
|
6
|
+
if file_format == 'gtf'
|
7
|
+
regions, all_attrs = parse_gtf(file_path, feature_type: feature_type)
|
8
|
+
end
|
9
|
+
|
10
|
+
return Genomic_Feature.new(regions, annotations: all_attrs)
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.parse_gtf(file_path, feature_type: nil) # https://www.ensembl.org/info/website/upload/gff.html
|
14
|
+
features = []
|
15
|
+
all_attrs = {}
|
16
|
+
File.open(file_path).each do |line|
|
17
|
+
next if /^#/ =~ line
|
18
|
+
seqname, source, feature, start, stop, score, strand, frame, attribute = line.chomp.split("\t")
|
19
|
+
if feature_type.nil? || feature_type == feature
|
20
|
+
attrs = process_attrs(attribute, ';', ' ')
|
21
|
+
attrs['source'] = source
|
22
|
+
attrs['feature'] = feature
|
23
|
+
id = attrs['gene_id']
|
24
|
+
features << [seqname.gsub('chr',''), start.to_i, stop.to_i, id]
|
25
|
+
all_attrs[id] = attrs
|
26
|
+
end
|
27
|
+
end
|
28
|
+
return features, all_attrs
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
def self.process_attrs(attributes, tuple_sep, field_sep)
|
33
|
+
return attributes.split(tuple_sep).map{|attr_pair|
|
34
|
+
tuple = attr_pair.strip.split(field_sep, 2)
|
35
|
+
tuple.last.gsub!('"','')
|
36
|
+
tuple
|
37
|
+
}.to_h
|
38
|
+
end
|
39
|
+
end
|
data/lib/pets/version.rb
CHANGED
data/lib/pets.rb
CHANGED
data/pets.gemspec
CHANGED
@@ -38,15 +38,18 @@ Gem::Specification.new do |spec|
|
|
38
38
|
|
39
39
|
spec.add_development_dependency "bundler", "~> 2.0"
|
40
40
|
spec.add_development_dependency "rake", "~> 13.0.3"
|
41
|
-
spec.add_development_dependency "rspec", "~> 3.
|
41
|
+
spec.add_development_dependency "rspec", "~> 3.11.0"
|
42
42
|
spec.add_dependency "statistics2"
|
43
43
|
spec.add_dependency "terminal-table"
|
44
44
|
spec.add_dependency "semtools", "~> 0.1.0"
|
45
|
+
spec.add_dependency "NetAnalyzer"
|
45
46
|
spec.add_dependency "report_html"
|
46
47
|
spec.add_dependency "numo-narray"
|
47
48
|
spec.add_dependency "npy"
|
48
49
|
spec.add_dependency "expcalc"
|
49
|
-
spec.add_dependency "
|
50
|
-
|
50
|
+
spec.add_dependency "bio-vcf"
|
51
|
+
spec.add_dependency "parallel", "~> 1.20.1"
|
52
|
+
spec.add_runtime_dependency 'net-ftp'
|
53
|
+
spec.add_runtime_dependency 'net-http'
|
51
54
|
end
|
52
55
|
|
@@ -25,14 +25,34 @@
|
|
25
25
|
<div style="width: 90%; background-color:#ecf0f1; margin: 0 auto;">
|
26
26
|
<h1 style="text-align: center; background-color:#d6eaf8">Patient HPO profiles by cluster.</h1>
|
27
27
|
<%= table(id: :clusters, header: true, border: 2, row_names: false, text: true,
|
28
|
-
cell_align: %w( center )) do |data|
|
29
|
-
|
28
|
+
cell_align: %w( center ), styled: 'dt', attrib: {'class' => 'table'}) do |data|
|
29
|
+
patient_list = []
|
30
|
+
data.each do |element| # Cluster
|
31
|
+
clID, patient_number, patient_ids, hpo_codes, hpo_names = element
|
32
|
+
# TODO: mostrar registro por paciente
|
30
33
|
#STDERR.puts element.inspect
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
+
patient_ids.each_with_index do |patID, i|
|
35
|
+
patient_record = [clID, patient_number]
|
36
|
+
patient_record << patID
|
37
|
+
patient_record << hpo_codes[i].map{|hpo_code| get_hpo_link(hpo_code)}.join(', ')
|
38
|
+
patient_record << hpo_names[i].join(', ')
|
39
|
+
patient_list << patient_record
|
40
|
+
end
|
34
41
|
end
|
42
|
+
data.clear
|
43
|
+
data.concat(patient_list)
|
35
44
|
data.unshift(["Cluster ID","Patients in Cluster","Patient IDs", "HPO codes", "Phenotypes"])
|
36
45
|
end
|
37
46
|
%>
|
38
47
|
</div>
|
48
|
+
|
49
|
+
<div style="width: 90%; background-color:#ecf0f1; margin: 0 auto;">
|
50
|
+
<h1 style="text-align: center; background-color:#d6eaf8"> Cluster detailed view.</h1>
|
51
|
+
<%
|
52
|
+
@hash_vars[:sim_mat4cluster].each do |clID, sim_matrix|
|
53
|
+
@hash_vars[:sim_matrix] = sim_matrix %>
|
54
|
+
<%= heatmap(id: :sim_matrix, header: true, row_names: true, title: "Cluster #{clID}" )%>
|
55
|
+
<% end
|
56
|
+
%>
|
57
|
+
|
58
|
+
</div>
|
@@ -4,7 +4,7 @@
|
|
4
4
|
<% @hash_vars[:similarity_matrixs].each do |pair, similarity_matrix| %>
|
5
5
|
<%
|
6
6
|
matrix_name = pair + '_sim_matrix'
|
7
|
-
if pair
|
7
|
+
if pair.include?('gene_HP')
|
8
8
|
dict = @hash_vars[:evidences][pair][:id2lab]
|
9
9
|
header = similarity_matrix.first
|
10
10
|
header.map! do |item|
|
@@ -27,8 +27,24 @@
|
|
27
27
|
<% end %>
|
28
28
|
<%=circular_genome(id: :candidates, header: false, row_names: true, transpose: false,
|
29
29
|
genomic_coordinates: @hash_vars[:genomic_coordinates] )%>
|
30
|
-
<% if !@hash_vars[:var_ids].nil?
|
31
|
-
|
32
|
-
|
30
|
+
<% if !@hash_vars[:var_ids].nil?
|
31
|
+
if @hash_vars[:var_ids].length > 200 %>
|
32
|
+
<p> Too much variant records</p>
|
33
|
+
<% else %>
|
34
|
+
<%=circular_genome(id: :var_ids, header: false,
|
35
|
+
row_names: true, transpose: false,
|
36
|
+
genomic_coordinates: @hash_vars[:var_coordinates] )%>
|
37
|
+
<% end %>
|
33
38
|
<% end %>
|
39
|
+
|
40
|
+
<h1 style="text-align: center; background-color:#d6eaf8">Candidate regions</h1>
|
41
|
+
|
42
|
+
<%= table(id: :hotspot_table) do |data|
|
43
|
+
data.each do |row|
|
44
|
+
row[1] = row[1].join(" ")
|
45
|
+
row[2] = row[2].join(",")
|
46
|
+
row[3] = row[3].join(",")
|
47
|
+
end
|
48
|
+
end
|
49
|
+
%>
|
34
50
|
</div>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elena Rojano, Pedro Seoane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-08-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 3.
|
47
|
+
version: 3.11.0
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 3.
|
54
|
+
version: 3.11.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: statistics2
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: 0.1.0
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: NetAnalyzer
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: report_html
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -150,6 +164,20 @@ dependencies:
|
|
150
164
|
- - ">="
|
151
165
|
- !ruby/object:Gem::Version
|
152
166
|
version: '0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: bio-vcf
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '0'
|
153
181
|
- !ruby/object:Gem::Dependency
|
154
182
|
name: parallel
|
155
183
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,6 +192,34 @@ dependencies:
|
|
164
192
|
- - "~>"
|
165
193
|
- !ruby/object:Gem::Version
|
166
194
|
version: 1.20.1
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: net-ftp
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :runtime
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: net-http
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - ">="
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0'
|
216
|
+
type: :runtime
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - ">="
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '0'
|
167
223
|
description: PETS suite includes three different tools. CohortAnalyzer performs the
|
168
224
|
calculation of several statistics that gives an overview of a cohort of patients
|
169
225
|
to analyse. Reg2Phen uses associations between pathological phenotypes and regions
|
@@ -185,6 +241,7 @@ executables:
|
|
185
241
|
- fmeasure_index.rb
|
186
242
|
- generate_HPO_IC_table.rb
|
187
243
|
- get_PR_values.rb
|
244
|
+
- get_gen_features.rb
|
188
245
|
- get_network_nodes.rb
|
189
246
|
- get_sorted_profs.rb
|
190
247
|
- install_deps.rb
|
@@ -218,6 +275,7 @@ files:
|
|
218
275
|
- bin/fmeasure_index.rb
|
219
276
|
- bin/generate_HPO_IC_table.rb
|
220
277
|
- bin/get_PR_values.rb
|
278
|
+
- bin/get_gen_features.rb
|
221
279
|
- bin/get_network_nodes.rb
|
222
280
|
- bin/get_sorted_profs.rb
|
223
281
|
- bin/install_deps.rb
|
@@ -269,11 +327,13 @@ files:
|
|
269
327
|
- lib/pets.rb
|
270
328
|
- lib/pets/coPatReporterMethods.rb
|
271
329
|
- lib/pets/cohort.rb
|
330
|
+
- lib/pets/common_optparse.rb
|
272
331
|
- lib/pets/constants.rb
|
273
332
|
- lib/pets/generalMethods.rb
|
274
333
|
- lib/pets/genomic_features.rb
|
275
334
|
- lib/pets/io.rb
|
276
335
|
- lib/pets/parsers/cohort_parser.rb
|
336
|
+
- lib/pets/parsers/reference_parser.rb
|
277
337
|
- lib/pets/phen2reg_methods.rb
|
278
338
|
- lib/pets/reg2phen_methods.rb
|
279
339
|
- lib/pets/version.rb
|
@@ -303,7 +363,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
303
363
|
- !ruby/object:Gem::Version
|
304
364
|
version: '0'
|
305
365
|
requirements: []
|
306
|
-
rubygems_version: 3.
|
366
|
+
rubygems_version: 3.3.7
|
307
367
|
signing_key:
|
308
368
|
specification_version: 4
|
309
369
|
summary: Suite with predictive tools.
|