pets 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/coPatReporter.rb +5 -0
- data/bin/evidence_profiler.rb +79 -14
- data/bin/get_gen_features.rb +146 -0
- data/bin/install_deps.rb +3 -2
- data/bin/profiles2phenopacket.rb +1 -25
- data/external_code/install_R_dependencies.R +6 -1
- data/lib/pets/coPatReporterMethods.rb +50 -4
- data/lib/pets/cohort.rb +10 -8
- data/lib/pets/common_optparse.rb +30 -0
- data/lib/pets/constants.rb +2 -1
- data/lib/pets/generalMethods.rb +21 -2
- data/lib/pets/genomic_features.rb +106 -10
- data/lib/pets/io.rb +32 -8
- data/lib/pets/parsers/cohort_parser.rb +8 -3
- data/lib/pets/parsers/reference_parser.rb +39 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +2 -1
- data/pets.gemspec +6 -3
- data/templates/cluster_report.erb +25 -5
- data/templates/evidence_profile.erb +20 -4
- metadata +65 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f1d5c3ad0cb57b26b2c67e02b38a282139965472ded083acf0d1fcae48c0fec
|
4
|
+
data.tar.gz: 8b34f2440afe74f0b9c0e6024c2a05daee4a7be0efd0c6a3d80aef49673c7c7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3e9bc8559bb3f3e0c9a7ce1e0658645f54afc83fc49e7415cc3177576af975e4f7268a1f37e017d83fd88042197f102a4290cc29d7b0a16e12ec4964feea39d
|
7
|
+
data.tar.gz: a2aa8fe161b52d2f3e86e0d04f2a1a762298de95582098e553287ad905ff7b97eae6f96893de8bd78676c01b412a16973b1b53cf6ebbd9cb48e49c929c7f1d74
|
data/bin/coPatReporter.rb
CHANGED
@@ -42,6 +42,11 @@ OptionParser.new do |opts|
|
|
42
42
|
options[:id_col] = data
|
43
43
|
end
|
44
44
|
|
45
|
+
options[:detailed_clusters] = false
|
46
|
+
opts.on("-D", "--detailed_clusters", "Show detiled cluster comparation using heatmaps. Default false") do
|
47
|
+
options[:detailed_clusters] = true
|
48
|
+
end
|
49
|
+
|
45
50
|
options[:excluded_hpo] = nil
|
46
51
|
opts.on("-E", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
|
47
52
|
options[:excluded_hpo] = excluded_hpo
|
data/bin/evidence_profiler.rb
CHANGED
@@ -12,13 +12,24 @@ require 'pets'
|
|
12
12
|
#############################################################################################
|
13
13
|
## METHODS
|
14
14
|
############################################################################################
|
15
|
+
def load_pathogenic_scores(path)
|
16
|
+
scores = {}
|
17
|
+
File.open(path).each do |line|
|
18
|
+
feature, score = line.split("\t")
|
19
|
+
scores[feature] = score.to_f
|
20
|
+
end
|
21
|
+
return scores
|
22
|
+
end
|
23
|
+
|
15
24
|
def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
25
|
+
coords = nil
|
16
26
|
all_coordinates = genomic_coordinates[entity]
|
17
|
-
coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)}
|
27
|
+
coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)} if !all_coordinates.nil?
|
18
28
|
return coords
|
19
29
|
end
|
20
30
|
|
21
|
-
def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs,
|
31
|
+
def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs,
|
32
|
+
evidences, prof_vars, hotspots_with_pat_vars, template, output)
|
22
33
|
var_ids, var_coors = format_variants4report(prof_vars)
|
23
34
|
container = {
|
24
35
|
profile_id: profile_id,
|
@@ -27,7 +38,8 @@ def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_
|
|
27
38
|
similarity_matrixs: similarity_matrixs,
|
28
39
|
evidences: evidences,
|
29
40
|
var_ids: var_ids,
|
30
|
-
var_coordinates: var_coors
|
41
|
+
var_coordinates: var_coors,
|
42
|
+
hotspot_table: hotspots_with_pat_vars
|
31
43
|
}
|
32
44
|
report = Report_html.new(container, 'Evidence profile report')
|
33
45
|
report.build(template)
|
@@ -51,12 +63,39 @@ def format_variants4report(var_data)
|
|
51
63
|
return var_ids, var_coors
|
52
64
|
end
|
53
65
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
66
|
+
|
67
|
+
def generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
|
68
|
+
hotspots_with_pat_vars = []
|
69
|
+
if !prof_vars.nil?
|
70
|
+
phen_regions = Genomic_Feature.hash2genomic_feature(all_genomic_coordinates){|k, v| v[0..2].concat([k])}
|
71
|
+
phen_candidates_by_hotspot, phen_genome_hotspots = phen_regions.generate_cluster_regions(:reg_overlap, 'A', 0, true)
|
72
|
+
genome_matches = phen_genome_hotspots.match(prof_vars)
|
73
|
+
hotspot_with_phen_candidates = invert_hash(phen_candidates_by_hotspot)
|
74
|
+
genome_matches.each do |hotspot_id, pat_vars|
|
75
|
+
reg = phen_genome_hotspots.region_by_to(hotspot_id)
|
76
|
+
coords = [reg[:chr], reg[:start], reg[:stop]]
|
77
|
+
hotspots_with_pat_vars << [hotspot_id, coords, hotspot_with_phen_candidates[hotspot_id], pat_vars]
|
78
|
+
end
|
79
|
+
# TODO: try to use the original similarities instead of only the top candidates in similarity_matrixs
|
80
|
+
# TODO: COMPLETE UNTIL FULL PREDICTOR
|
81
|
+
end
|
82
|
+
return hotspots_with_pat_vars
|
58
83
|
end
|
59
84
|
|
85
|
+
def invert_hash(h)
|
86
|
+
new_h = {}
|
87
|
+
h.each do |k, vals|
|
88
|
+
vals.each do |v|
|
89
|
+
query = new_h[v]
|
90
|
+
if query.nil?
|
91
|
+
new_h[v] = [k]
|
92
|
+
else
|
93
|
+
query << k
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
return new_h
|
98
|
+
end
|
60
99
|
|
61
100
|
#############################################################################################
|
62
101
|
## OPTPARSE
|
@@ -96,6 +135,11 @@ OptionParser.new do |opts|
|
|
96
135
|
options[:variant_data] = item
|
97
136
|
end
|
98
137
|
|
138
|
+
options[:pathogenic_scores] = nil # TODO: Generalize to a folder with a table per patient
|
139
|
+
opts.on("-P", "--pathogenic_scores PATH", 'File with genome features an their pathogenic scores') do |item|
|
140
|
+
options[:pathogenic_scores] = item
|
141
|
+
end
|
142
|
+
|
99
143
|
opts.on_tail("-h", "--help", "Show this message") do
|
100
144
|
puts opts
|
101
145
|
exit
|
@@ -108,12 +152,12 @@ end.parse!
|
|
108
152
|
############################################################################################
|
109
153
|
|
110
154
|
hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
|
111
|
-
hpo = Ontology.new
|
112
|
-
hpo.read(hpo_file)
|
155
|
+
hpo = Ontology.new(file: hpo_file, load_file: true)
|
113
156
|
|
114
157
|
profiles = load_profiles(options[:profiles_file], hpo)
|
115
158
|
profile_variants = options[:variant_data].nil? ? {} : load_variants(options[:variant_data])
|
116
159
|
evidences, genomic_coordinates = load_evidences(options[:evidences], hpo)
|
160
|
+
pathogenic_scores = options[:pathogenic_scores].nil? ? {} : load_pathogenic_scores(options[:pathogenic_scores])
|
117
161
|
|
118
162
|
hpo.load_profiles(profiles)
|
119
163
|
evidences_similarity = {}
|
@@ -122,7 +166,8 @@ evidences.each do |pair, data|
|
|
122
166
|
if profile_type == 'HP'
|
123
167
|
evidence_profiles = data[:prof]
|
124
168
|
evidence_profiles.transform_keys!{|prof_id, terms| prof_id.to_sym}
|
125
|
-
|
169
|
+
similarities = hpo.compare_profiles(external_profiles: evidence_profiles, sim_type: :lin, bidirectional: false)
|
170
|
+
evidences_similarity[pair] = similarities if !similarities.empty?
|
126
171
|
end
|
127
172
|
end
|
128
173
|
|
@@ -136,13 +181,33 @@ profiles.each do |profile_id, reference_prof|
|
|
136
181
|
entity = pair.split('_').first
|
137
182
|
similarities = ev_profiles_similarity[profile_id.to_sym]
|
138
183
|
candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(reference_prof, similarities, evidences[pair][:prof], hpo, 40, 40)
|
139
|
-
|
184
|
+
coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
185
|
+
candidate_sim_matrix.unshift(['HP'] + candidates_ids)
|
186
|
+
if !pathogenic_scores.empty? # prioritize by pathogenic scores
|
187
|
+
candidate_sim_matrix_patho, candidates_patho, candidates_ids_patho = get_similarity_matrix(
|
188
|
+
reference_prof, similarities,
|
189
|
+
evidences[pair][:prof], hpo, 40, 40,
|
190
|
+
other_scores = pathogenic_scores, id2label = evidences[pair][:id2lab])
|
191
|
+
if !candidate_sim_matrix_patho.empty?
|
192
|
+
candidate_sim_matrix_patho.unshift(['HP'] + candidates_ids_patho)
|
193
|
+
similarity_matrixs[pair + '_path_vars'] = candidate_sim_matrix_patho
|
194
|
+
evidences[pair + '_path_vars'] = evidences[pair]
|
195
|
+
end
|
196
|
+
end
|
197
|
+
next if coords.nil?
|
140
198
|
all_candidates.concat(candidates)
|
141
199
|
similarity_matrixs[pair] = candidate_sim_matrix
|
142
|
-
coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
|
143
200
|
all_genomic_coordinates.merge!(coords)
|
144
201
|
end
|
145
|
-
get_genome_hotspots(similarity_matrixs, all_genomic_coordinates)
|
146
202
|
prof_vars = profile_variants[profile_id]
|
147
|
-
|
203
|
+
hotspots_with_pat_vars = generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
|
204
|
+
make_report(
|
205
|
+
profile_id,
|
206
|
+
all_candidates,
|
207
|
+
all_genomic_coordinates,
|
208
|
+
similarity_matrixs,
|
209
|
+
evidences, prof_vars,
|
210
|
+
hotspots_with_pat_vars,
|
211
|
+
template, options[:output_folder]
|
212
|
+
)
|
148
213
|
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
ROOT_PATH = File.dirname(__FILE__)
|
4
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
5
|
+
|
6
|
+
require 'optparse'
|
7
|
+
require 'pets'
|
8
|
+
|
9
|
+
##################################
|
10
|
+
## METHODS
|
11
|
+
##################################
|
12
|
+
|
13
|
+
def get_data(options)
|
14
|
+
fields2extract = get_fields2extract(options)
|
15
|
+
field_numbers = fields2extract.values
|
16
|
+
records = read_records(options, fields2extract, field_numbers)
|
17
|
+
end
|
18
|
+
|
19
|
+
def read_records(options, fields2extract, field_numbers) # Modified from cohort_parser
|
20
|
+
records = []
|
21
|
+
count = 0
|
22
|
+
File.open(options[:input_file]).each do |line|
|
23
|
+
line.chomp!
|
24
|
+
if options[:header] && count == 0
|
25
|
+
line.gsub!(/#\s*/,'') # correct comment like headers
|
26
|
+
field_names = line.split("\t")
|
27
|
+
get_field_numbers2extract(field_names, fields2extract)
|
28
|
+
field_numbers = fields2extract.values
|
29
|
+
else
|
30
|
+
fields = line.split("\t")
|
31
|
+
record = field_numbers.map{|n| fields[n]}
|
32
|
+
if fields2extract[:id_col].nil?
|
33
|
+
id = "rec_#{count}" #generate ids
|
34
|
+
else
|
35
|
+
id = record.shift
|
36
|
+
end
|
37
|
+
record[1] = record[1].to_i
|
38
|
+
record[2] = record[2].to_i
|
39
|
+
record << id
|
40
|
+
records << record
|
41
|
+
end
|
42
|
+
count +=1
|
43
|
+
end
|
44
|
+
return records
|
45
|
+
end
|
46
|
+
|
47
|
+
def get_fields2extract(options)
|
48
|
+
fields2extract = {}
|
49
|
+
[:id_col, :chromosome_col, :start_col, :end_col].each do |field|
|
50
|
+
col = options[field]
|
51
|
+
if !col.nil?
|
52
|
+
col = col.to_i if !options[:header]
|
53
|
+
fields2extract[field] = col
|
54
|
+
end
|
55
|
+
end
|
56
|
+
return fields2extract
|
57
|
+
end
|
58
|
+
|
59
|
+
def get_field_numbers2extract(field_names, fields2extract)
|
60
|
+
fields2extract.each do |field, name|
|
61
|
+
fields2extract[field] = field_names.index(name)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
##########################
|
66
|
+
#OPT-PARSER
|
67
|
+
##########################
|
68
|
+
|
69
|
+
options = {}
|
70
|
+
OptionParser.new do |opts|
|
71
|
+
opts.banner = "Usage: #{__FILE__} [options]"
|
72
|
+
|
73
|
+
options[:chromosome_col] = nil
|
74
|
+
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
|
75
|
+
options[:chromosome_col] = data
|
76
|
+
end
|
77
|
+
|
78
|
+
options[:id_col] = nil
|
79
|
+
opts.on("-d", "--id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the id") do |data|
|
80
|
+
options[:id_col] = data
|
81
|
+
end
|
82
|
+
|
83
|
+
options[:end_col] = nil
|
84
|
+
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
|
85
|
+
options[:end_col] = data
|
86
|
+
end
|
87
|
+
|
88
|
+
options[:header] = true
|
89
|
+
#chr\tstart\tstop
|
90
|
+
opts.on("-H", "--header", "Set if the file has a line header. Default true") do
|
91
|
+
options[:header] = false
|
92
|
+
end
|
93
|
+
|
94
|
+
options[:input_file] = nil
|
95
|
+
opts.on("-i", "--input_file PATH", "Input file path") do |data|
|
96
|
+
options[:input_file] = data
|
97
|
+
end
|
98
|
+
|
99
|
+
options[:reference_file] = nil
|
100
|
+
opts.on("-r", "--reference_file PATH", "Reference file with genome annotation") do |data|
|
101
|
+
options[:reference_file] = data
|
102
|
+
end
|
103
|
+
|
104
|
+
options[:output_file] = nil
|
105
|
+
opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
|
106
|
+
options[:output_file] = data
|
107
|
+
end
|
108
|
+
|
109
|
+
options[:start_col] = nil
|
110
|
+
opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
|
111
|
+
options[:start_col] = data
|
112
|
+
end
|
113
|
+
|
114
|
+
options[:feature_type] = nil
|
115
|
+
opts.on("-t", "--feature_type STRING", "Keep features from reference whose are tagged with this feature type") do |data|
|
116
|
+
options[:feature_type] = data
|
117
|
+
end
|
118
|
+
|
119
|
+
options[:feature_name] = nil
|
120
|
+
opts.on("-n", "--feature_name STRING", "Use this feature id that is present in attributes/annotation field of reference") do |data|
|
121
|
+
options[:feature_name] = data
|
122
|
+
end
|
123
|
+
|
124
|
+
opts.on_tail("-h", "--help", "Show this message") do
|
125
|
+
puts opts
|
126
|
+
exit
|
127
|
+
end
|
128
|
+
|
129
|
+
end.parse!
|
130
|
+
|
131
|
+
regions = Genomic_Feature.new(get_data(options))
|
132
|
+
Genomic_Feature.add_reference(
|
133
|
+
Reference_parser.load(
|
134
|
+
options[:reference_file],
|
135
|
+
feature_type: options[:feature_type]
|
136
|
+
)
|
137
|
+
)
|
138
|
+
gene_features = regions.get_features(attr_type: options[:feature_name])
|
139
|
+
|
140
|
+
File.open(options[:output_file], 'w') do |f|
|
141
|
+
gene_features.each do |id, feat_ids|
|
142
|
+
feat_ids.each do |ft_id|
|
143
|
+
f.puts "#{id}\t#{ft_id}"
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
data/bin/install_deps.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#! /usr/bin/env ruby
|
2
2
|
|
3
3
|
ROOT_PATH = File.dirname(__FILE__)
|
4
|
-
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code')
|
4
|
+
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
|
5
5
|
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
|
6
|
+
require 'pets'
|
6
7
|
|
7
|
-
system_call(EXTERNAL_CODE, 'install_R_dependencies.R')
|
8
|
+
system_call(EXTERNAL_CODE, 'install_R_dependencies.R', '')
|
data/bin/profiles2phenopacket.rb
CHANGED
@@ -27,31 +27,7 @@ options = {}
|
|
27
27
|
OptionParser.new do |opts|
|
28
28
|
opts.banner = "Usage: #{__FILE__} [options]"
|
29
29
|
|
30
|
-
|
31
|
-
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
|
32
|
-
options[:chromosome_col] = data
|
33
|
-
end
|
34
|
-
|
35
|
-
options[:id_col] = nil
|
36
|
-
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
|
37
|
-
options[:id_col] = data
|
38
|
-
end
|
39
|
-
|
40
|
-
options[:end_col] = nil
|
41
|
-
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
|
42
|
-
options[:end_col] = data
|
43
|
-
end
|
44
|
-
|
45
|
-
options[:genome_assembly] = 'hg38'
|
46
|
-
opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
|
47
|
-
options[:genome_assembly] = data
|
48
|
-
end
|
49
|
-
|
50
|
-
options[:header] = true
|
51
|
-
#chr\tstart\tstop
|
52
|
-
opts.on("-H", "--header", "Set if the file has a line header. Default true") do
|
53
|
-
options[:header] = false
|
54
|
-
end
|
30
|
+
eval(File.open(COMMON_OPTPARSE).read)
|
55
31
|
|
56
32
|
options[:input_file] = nil
|
57
33
|
opts.on("-i", "--input_file PATH", "Input file with patient data") do |data|
|
@@ -7,5 +7,10 @@ print("Installing libraries from CRAN")
|
|
7
7
|
packages_list <-c("optparse","RcppCNPy","ggplot2","fastcluster","dplyr","gplots","RColorBrewer","tidyr","data.table","gridExtra", "dynamicTreeCut", "ggExtra", "ontologyIndex", "magrittr")
|
8
8
|
installed <- library()$results[,1]
|
9
9
|
packages_list <- setdiff(packages_list, installed)
|
10
|
-
|
10
|
+
if(length(packages_list) == 0){
|
11
|
+
print('All needed packages are installed')
|
12
|
+
}else{
|
13
|
+
install.packages(packages_list, repos='https://cloud.r-project.org')
|
14
|
+
}
|
15
|
+
|
11
16
|
|
@@ -62,7 +62,8 @@ def get_profile_ic(hpo_names, phenotype_ic)
|
|
62
62
|
profile_length = 0
|
63
63
|
hpo_names.each do |hpo_id|
|
64
64
|
hpo_ic = phenotype_ic[hpo_id]
|
65
|
-
ic
|
65
|
+
raise("The term #{hpo_id} not exists in the given ic table") if hpo_ic.nil?
|
66
|
+
ic += hpo_ic
|
66
67
|
profile_length += 1
|
67
68
|
end
|
68
69
|
profile_length = 1 if profile_length == 0
|
@@ -187,7 +188,18 @@ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
|
187
188
|
profiles_similarity_filename = File.join(temp_folder, ['profiles_similarity', method_name].join('_').concat('.txt'))
|
188
189
|
clusters_distribution_filename = File.join(temp_folder, ['clusters_distribution', method_name].join('_').concat('.txt'))
|
189
190
|
if !File.exists?(matrix_filename)
|
190
|
-
|
191
|
+
if reference_profiles.nil?
|
192
|
+
profiles_similarity = patient_data.compare_profiles(sim_type: method_name.to_sym, external_profiles: reference_profiles)
|
193
|
+
else # As reference profiles are constant, the semantic comparison will be A => B (A reference). So, we have to invert the elements to perform the comparison
|
194
|
+
ont = Cohort.get_ontology(:hpo)
|
195
|
+
pat_profiles = ont.profiles
|
196
|
+
ont.load_profiles(reference_profiles, reset_stored: true)
|
197
|
+
profiles_similarity = ont.compare_profiles(sim_type: method_name.to_sym,
|
198
|
+
external_profiles: pat_profiles,
|
199
|
+
bidirectional: false)
|
200
|
+
ont.load_profiles(pat_profiles, reset_stored: true)
|
201
|
+
profiles_similarity = invert_nested_hash(profiles_similarity)
|
202
|
+
end
|
191
203
|
remove_nested_entries(profiles_similarity){|id, sim| sim >= options[:sim_thr] } if !options[:sim_thr].nil?
|
192
204
|
write_profile_pairs(profiles_similarity, profiles_similarity_filename)
|
193
205
|
if reference_profiles.nil?
|
@@ -219,13 +231,30 @@ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
|
219
231
|
write_patient_hpo_stat(get_cluster_metadata(clusters_info), clusters_distribution_filename)
|
220
232
|
out_file = File.join(temp_folder, ['clusters_distribution', method_name].join('_'))
|
221
233
|
system_call(EXTERNAL_CODE, 'xyplot_graph.R', "-d #{clusters_distribution_filename} -o #{out_file} -x PatientsNumber -y HPOAverage") if !File.exists?(out_file)
|
234
|
+
sim_mat4cluster = {}
|
235
|
+
if options[:detailed_clusters]
|
236
|
+
clusters_codes.each do |cluster|
|
237
|
+
cluster_cohort = Cohort.new
|
238
|
+
clID, patient_number, patient_ids, hpo_codes = cluster
|
239
|
+
patient_ids.each_with_index {|patID, i| cluster_cohort.add_record([patID, hpo_codes[i], []])}
|
240
|
+
cluster_profiles = cluster_cohort.profiles
|
241
|
+
ref_profile = cluster_cohort.get_general_profile
|
242
|
+
hpo.load_profiles({ref: ref_profile}, reset_stored: true)
|
243
|
+
similarities = hpo.compare_profiles(external_profiles: cluster_profiles, sim_type: :lin, bidirectional: false)
|
244
|
+
candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(ref_profile, similarities[:ref], cluster_profiles, hpo, 100, 100)
|
245
|
+
candidate_sim_matrix.unshift(['HP'] + candidates_ids)
|
246
|
+
sim_mat4cluster[clID] = candidate_sim_matrix
|
247
|
+
end
|
248
|
+
end
|
249
|
+
|
250
|
+
|
222
251
|
clusters = translate_codes(clusters_codes, hpo)
|
223
|
-
|
224
252
|
container = {
|
225
253
|
:temp_folder => temp_folder,
|
226
254
|
:cluster_name => method_name,
|
227
255
|
:clusters => clusters,
|
228
|
-
:hpo => hpo
|
256
|
+
:hpo => hpo,
|
257
|
+
:sim_mat4cluster => sim_mat4cluster
|
229
258
|
}
|
230
259
|
|
231
260
|
report = Report_html.new(container, 'Patient clusters report')
|
@@ -235,6 +264,23 @@ def get_semantic_similarity_clustering(options, patient_data, temp_folder)
|
|
235
264
|
end
|
236
265
|
end
|
237
266
|
|
267
|
+
def invert_nested_hash(h)
|
268
|
+
new_h = {}
|
269
|
+
h.each do |k1, vals1|
|
270
|
+
vals1.each do |v1|
|
271
|
+
vals1.each do |k2, vals2|
|
272
|
+
query = new_h[k2]
|
273
|
+
if query.nil?
|
274
|
+
new_h[k2] = {k1 => vals2}
|
275
|
+
else
|
276
|
+
query[k1] = vals2
|
277
|
+
end
|
278
|
+
end
|
279
|
+
end
|
280
|
+
end
|
281
|
+
return new_h
|
282
|
+
end
|
283
|
+
|
238
284
|
def get_cluster_metadata(clusters_info)
|
239
285
|
average_hp_per_pat_distribution = []
|
240
286
|
clusters_info.each do |cl_id, pat_info|
|
data/lib/pets/cohort.rb
CHANGED
@@ -24,11 +24,6 @@ class Cohort
|
|
24
24
|
else
|
25
25
|
ont = Ontology.new
|
26
26
|
ont.read(ont_file)
|
27
|
-
if !excluded_terms_file.nil?
|
28
|
-
ont.add_removable_terms(read_excluded_ont_file(excluded_terms_file))
|
29
|
-
ont.remove_removable()
|
30
|
-
ont.build_index()
|
31
|
-
end
|
32
27
|
end
|
33
28
|
@@ont[ont_name] = ont
|
34
29
|
end
|
@@ -44,12 +39,14 @@ class Cohort
|
|
44
39
|
def initialize()
|
45
40
|
@profiles = {}
|
46
41
|
@vars = {}
|
42
|
+
@extra_attr = {}
|
47
43
|
@var_idx = Genomic_Feature.new([])
|
48
44
|
end
|
49
45
|
|
50
|
-
def add_record(rec) #[id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
|
46
|
+
def add_record(rec, extra_attr = nil) #[id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
|
51
47
|
id, profile, vars = rec
|
52
48
|
@profiles[id] = profile.map{|t| t.to_sym} if !profile.nil?
|
49
|
+
@extra_attr[id] = extra_attr if !extra_attr.nil?
|
53
50
|
add_gen_feat(id, vars) if !vars.nil?
|
54
51
|
end
|
55
52
|
|
@@ -111,7 +108,7 @@ class Cohort
|
|
111
108
|
term_count = Hash.new(0)
|
112
109
|
each_profile do |id, prof|
|
113
110
|
prof.each do |term|
|
114
|
-
|
111
|
+
term_count[term] += 1
|
115
112
|
end
|
116
113
|
end
|
117
114
|
records = @profiles.length
|
@@ -271,7 +268,12 @@ class Cohort
|
|
271
268
|
|
272
269
|
@profiles.each do |id, terms|
|
273
270
|
phenopacket = {metaData: metaData}
|
274
|
-
|
271
|
+
query_sex = @extra_attr.dig(id, :sex)
|
272
|
+
sex = query_sex.nil? ? 'UNKNOWN_SEX' : query_sex
|
273
|
+
phenopacket[:subject] = {
|
274
|
+
id: id,
|
275
|
+
sex: sex
|
276
|
+
}
|
275
277
|
phenotypicFeatures = []
|
276
278
|
terms.each do |term|
|
277
279
|
term_name = ont.translate_id(term)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
options[:chromosome_col] = nil
|
2
|
+
opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
|
3
|
+
options[:chromosome_col] = data
|
4
|
+
end
|
5
|
+
|
6
|
+
options[:id_col] = nil
|
7
|
+
opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
|
8
|
+
options[:id_col] = data
|
9
|
+
end
|
10
|
+
|
11
|
+
options[:end_col] = nil
|
12
|
+
opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
|
13
|
+
options[:end_col] = data
|
14
|
+
end
|
15
|
+
|
16
|
+
options[:genome_assembly] = 'hg38'
|
17
|
+
opts.on("-G", "--genome_assembly STRING", "Genome assembly version. Please choose between hg18, hg19 and hg38. Default hg38") do |data|
|
18
|
+
options[:genome_assembly] = data
|
19
|
+
end
|
20
|
+
|
21
|
+
options[:header] = true
|
22
|
+
#chr\tstart\tstop
|
23
|
+
opts.on("-H", "--header", "Set if the file has a line header. Default true") do
|
24
|
+
options[:header] = false
|
25
|
+
end
|
26
|
+
|
27
|
+
options[:sex_col] = nil
|
28
|
+
opts.on("-x", "--sex_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient sex") do |data|
|
29
|
+
options[:sex_col] = data
|
30
|
+
end
|
data/lib/pets/constants.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
# Needs define ROOT_PATH constant in file requiring this file
|
1
|
+
# Needs define ROOT_PATH constant in file requiring this file
|
2
|
+
COMMON_OPTPARSE = File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets', 'common_optparse.rb'))
|
2
3
|
REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
|
3
4
|
EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
|
4
5
|
EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
|
data/lib/pets/generalMethods.rb
CHANGED
@@ -243,8 +243,27 @@ def get_detailed_similarity(profile, candidates, evidences, hpo)
|
|
243
243
|
return matrix
|
244
244
|
end
|
245
245
|
|
246
|
-
def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit)
|
247
|
-
candidates = similarities.to_a
|
246
|
+
def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit, other_scores = {}, id2label = {})
|
247
|
+
candidates = similarities.to_a
|
248
|
+
if other_scores.empty?
|
249
|
+
candidates.sort!{|s1, s2| s2.last <=> s1.last}
|
250
|
+
candidates = candidates.first(candidate_limit)
|
251
|
+
else # Prioritize first by the external list of scores, select the candidates and then prioritize by similarities
|
252
|
+
selected_candidates = []
|
253
|
+
candidates.each do |cand|
|
254
|
+
cand_id = cand[0]
|
255
|
+
cand_lab = id2label[cand_id.to_s]
|
256
|
+
next if cand_lab.nil?
|
257
|
+
other_score = other_scores[cand_lab]
|
258
|
+
next if other_score.nil?
|
259
|
+
cand << other_score
|
260
|
+
selected_candidates << cand
|
261
|
+
end
|
262
|
+
selected_candidates.sort!{|e1, e2| e2[2] <=> e1[2]}
|
263
|
+
candidates = selected_candidates.first(candidate_limit)
|
264
|
+
candidates.sort!{|e1, e2| e2[1] <=> e1[1]}
|
265
|
+
candidates.each{|c| c.pop}
|
266
|
+
end
|
248
267
|
candidates_ids = candidates.map{|c| c.first}
|
249
268
|
candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
|
250
269
|
candidate_similarity_matrix.each_with_index do |row, i|
|
@@ -1,23 +1,59 @@
|
|
1
1
|
class Genomic_Feature
|
2
|
+
@@ref = nil
|
3
|
+
|
4
|
+
def self.array2genomic_feature(arr)
|
5
|
+
new(arr.map{|r| yield(r)})
|
6
|
+
end
|
7
|
+
|
8
|
+
def self.hash2genomic_feature(h)
|
9
|
+
vars = []
|
10
|
+
h.each do |h, v|
|
11
|
+
vars << yield(h, v)
|
12
|
+
end
|
13
|
+
new(vars)
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.add_reference(genomic_regions)
|
17
|
+
@@ref = genomic_regions
|
18
|
+
end
|
19
|
+
|
2
20
|
# If any method uses gen_fet as a name, it is a Genomic_Feature object
|
3
|
-
def initialize(feat_array) # [[chr1, start1, stop1],[chr1, start1, stop1]]
|
21
|
+
def initialize(feat_array, annotations: nil) # [[chr1, start1, stop1],[chr1, start1, stop1]]
|
4
22
|
@regions = {}
|
23
|
+
@reg_by_to = {}
|
5
24
|
@reg_id = -1
|
6
25
|
load_features(feat_array)
|
26
|
+
load_annotations(annotations) if !annotations.nil?
|
7
27
|
end
|
8
28
|
|
9
29
|
def load_features(feat_array)
|
10
|
-
feat_array.each do |chr, start, stop|
|
30
|
+
feat_array.each do |chr, start, stop, to|
|
11
31
|
chr = chr.to_sym
|
12
|
-
|
32
|
+
@reg_id +=1
|
33
|
+
id = to.nil? ? @reg_id : to
|
34
|
+
region = {chr: chr, start: start, stop: stop, to: id }
|
35
|
+
@reg_by_to[id] = region
|
13
36
|
add_record(@regions, chr, region)
|
14
37
|
end
|
15
38
|
end
|
16
39
|
|
40
|
+
def load_annotations(annotations)
|
41
|
+
each do |chr, reg|
|
42
|
+
annot = annotations[reg[:to]]
|
43
|
+
reg[:attrs] = annot if !annot.nil?
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
17
47
|
def length
|
18
48
|
return @regions.length
|
19
49
|
end
|
20
50
|
|
51
|
+
def each_chr()
|
52
|
+
@regions.each do |chr, regs|
|
53
|
+
yield(chr, regs)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
21
57
|
def each()
|
22
58
|
@regions.each do |chr, regs|
|
23
59
|
regs.each do |region|
|
@@ -30,6 +66,14 @@ class Genomic_Feature
|
|
30
66
|
return @regions.keys
|
31
67
|
end
|
32
68
|
|
69
|
+
def get_chr_regs(chr)
|
70
|
+
return @regions[chr]
|
71
|
+
end
|
72
|
+
|
73
|
+
def region_by_to(to)
|
74
|
+
return @reg_by_to[to]
|
75
|
+
end
|
76
|
+
|
33
77
|
def get_sizes
|
34
78
|
sizes = []
|
35
79
|
each do |chr, region|
|
@@ -39,6 +83,35 @@ class Genomic_Feature
|
|
39
83
|
return sizes
|
40
84
|
end
|
41
85
|
|
86
|
+
def get_features(attr_type: nil)
|
87
|
+
features = match(@@ref)
|
88
|
+
if !attr_type.nil?
|
89
|
+
features.each do |reg_id, feat_ids|
|
90
|
+
new_feat_ids = feat_ids.map{|fi| @@ref.region_by_to(fi).dig(:attrs, attr_type)}
|
91
|
+
features[reg_id] = new_feat_ids.compact.uniq
|
92
|
+
end
|
93
|
+
end
|
94
|
+
return features
|
95
|
+
end
|
96
|
+
|
97
|
+
def match(other_gen_feat)
|
98
|
+
all_matches = {}
|
99
|
+
each_chr do |chr, regs|
|
100
|
+
other_regs = other_gen_feat.get_chr_regs(chr)
|
101
|
+
next if other_regs.nil?
|
102
|
+
regs.each do |reg|
|
103
|
+
local_matches = []
|
104
|
+
start = reg[:start]
|
105
|
+
stop = reg[:stop]
|
106
|
+
other_regs.each do |other_reg|
|
107
|
+
local_matches << other_reg[:to] if coor_overlap?(start, stop, other_reg)
|
108
|
+
end
|
109
|
+
all_matches[reg[:to]] = local_matches if !local_matches.empty?
|
110
|
+
end
|
111
|
+
end
|
112
|
+
return all_matches
|
113
|
+
end
|
114
|
+
|
42
115
|
def get_summary_sizes
|
43
116
|
sizes = Hash.new(0)
|
44
117
|
each do |chr, region|
|
@@ -60,21 +133,21 @@ class Genomic_Feature
|
|
60
133
|
reference.each do |start, stop|
|
61
134
|
reg_ids = []
|
62
135
|
genomic_ranges.each do |reg|
|
63
|
-
|
136
|
+
overlap = coor_overlap?(start, stop, reg)
|
137
|
+
reg_ids << reg[:to] if overlap
|
64
138
|
end
|
65
139
|
overlaps << reg_ids.uniq
|
66
140
|
end
|
67
141
|
return overlaps
|
68
142
|
end
|
69
143
|
|
70
|
-
def generate_cluster_regions(meth, tag, ids_per_reg = 1)
|
144
|
+
def generate_cluster_regions(meth, tag, ids_per_reg = 1, obj = false)
|
71
145
|
compute_windows(meth) # Get putative genome windows
|
72
|
-
patients_out_of_cluster = 0
|
73
146
|
ids_by_cluster = {}
|
74
147
|
annotated_full_ref = [] # All reference windows with unique id and chr tagged
|
75
148
|
@regions.each do |chr, regs|
|
76
149
|
reference = @windows[chr]
|
77
|
-
overlaps = get_reference_overlaps(regs, reference)
|
150
|
+
overlaps = get_reference_overlaps(regs, reference)
|
78
151
|
clust_numb = 0
|
79
152
|
reference.each_with_index do |ref, i|
|
80
153
|
current_ids = overlaps[i]
|
@@ -87,6 +160,7 @@ class Genomic_Feature
|
|
87
160
|
end
|
88
161
|
end
|
89
162
|
end
|
163
|
+
annotated_full_ref = Genomic_Feature.array2genomic_feature(annotated_full_ref){|r| [r[2], r[0], r[1], r[3]]} if obj
|
90
164
|
return ids_by_cluster, annotated_full_ref
|
91
165
|
end
|
92
166
|
|
@@ -116,15 +190,37 @@ class Genomic_Feature
|
|
116
190
|
|
117
191
|
# Split the coordinate space covered by +genomic_ranges+ into consecutive windows.
#
# genomic_ranges: array of hashes exposing :start and :stop coordinates.
# Returns an array of [start, stop] windows built from every pair of
# consecutive boundary coordinates. Records whose stop is not greater than
# their start are handled as single nucleotide variants (SNV): each one gets
# its own one-nucleotide window [pos, pos], and the neighbouring windows are
# trimmed so they do not include that position.
def compute_region_overlap_windows(genomic_ranges)
  boundaries = []
  snv_positions = []
  genomic_ranges.each do |gr|
    start = gr[:start]
    stop = gr[:stop]
    if stop - start > 0
      boundaries << start << stop
    else # Build a window of at least one nt for snv
      snv_positions << start
    end
  end
  # Every SNV coordinate must appear exactly twice in the sorted reference
  # (once as window start, once as window stop), even when several records
  # share the position or it coincides with a range boundary. Extra copies
  # would generate inverted windows such as [pos + 1, pos].
  snv_positions = snv_positions.uniq
  reference = (boundaries | snv_positions) + snv_positions
  reference.sort!

  # Define overlap ranges
  final_reference = []
  last_len = 1
  reference.each_cons(2) do |coord, next_coord|
    current_len = next_coord - coord
    coord += 1 if last_len == 0 # Previous window was an SNV: start right after it
    if current_len == 0 && last_len > 0 && !final_reference.empty?
      final_reference.last[1] -= 1 # Next window is an SNV: stop right before it
    end
    final_reference << [coord, next_coord]
    last_len = current_len
  end
  # Two adjacent SNVs leave an empty gap between them that shows up as an
  # inverted window; it covers no nucleotide, so it is discarded.
  return final_reference.reject { |first, last| first > last }
end
|
data/lib/pets/io.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'csv'
|
2
|
+
require 'bio-vcf'
|
2
3
|
|
3
4
|
def load_hpo_ontology(hpo_file, excluded_hpo_file)
|
4
5
|
hpo = nil
|
@@ -201,20 +202,39 @@ end
|
|
201
202
|
|
202
203
|
# Load every variant file found in +variant_folder+.
#
# Files ending in .tab are read with load_tabular_vars; files ending in .vcf
# or .vcf.gz are read with load_vcf. The profile id is the file name without
# its extension, so ids containing dots (e.g. "sample.v2.tab") are kept whole.
# Returns a hash mapping profile id => Genomic_Feature built from the variants.
def load_variants(variant_folder)
  variants = {}
  Dir.glob(File.join(variant_folder, '*.{tab,vcf,vcf.gz}')).each do |path|
    file_name = File.basename(path)
    # Match on the known suffix instead of splitting on the first dot;
    # 'vcf.gz' is listed first so it is found before any shorter suffix.
    # 'txt' is kept for parity with the tabular branch although the glob
    # above does not currently select such files.
    ext = %w[vcf.gz vcf tab txt].find { |suffix| file_name.end_with?(".#{suffix}") }
    next if ext.nil?
    profile_id = file_name[0...-(ext.length + 1)]
    if ext == 'tab' || ext == 'txt'
      vars = load_tabular_vars(path)
    else
      vars = load_vcf(path, ext)
    end
    variants[profile_id] = Genomic_Feature.new(vars)
  end
  return variants
end
|
217
216
|
|
217
|
+
# Read variants from a tab-separated file.
#
# The first column is the chromosome (any 'chr' text is removed) and the
# second column is the position. Each variant is returned as
# [chromosome, position, position]; blank lines are ignored.
def load_tabular_vars(path)
  vars = []
  # File.foreach closes the file when done, unlike File.open(path).each
  File.foreach(path) do |line|
    fields = line.chomp.split("\t")
    next if fields.empty? # blank line: nothing to parse
    chr = fields[0].gsub('chr','')
    start = fields[1].to_i
    vars << [chr, start, start]
  end
  return vars
end
|
227
|
+
|
228
|
+
# Read variants from a VCF file through BioVcf::VCFfile.
#
# path: VCF file to read.
# ext: file extension; 'vcf.gz' makes the reader treat the file as gzipped.
# Returns one [chromosome, position, position] entry per record, with any
# 'chr' text removed from the chromosome name.
#
# Some compressed files are fragmented internally. If so, VCFfile only reads
# the first fragment. Use zcat original.vcf.gz | gzip > new.vcf.gz to obtain
# a contiguous file.
def load_vcf(path, ext)
  vars = []
  vcf = BioVcf::VCFfile.new(file: path, is_gz: ext == 'vcf.gz')
  vcf.each do |var|
    vars << [var.chrom.gsub('chr',''), var.pos, var.pos]
  end
  # The variant count is no longer printed: this is a loader and must not
  # write debugging output to stdout.
  return vars
end
|
237
|
+
|
218
238
|
def load_evidences(evidences_path, hpo)
|
219
239
|
genomic_coordinates = {}
|
220
240
|
coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
|
@@ -242,6 +262,10 @@ def load_coordinates(file_path)
|
|
242
262
|
header = false
|
243
263
|
else
|
244
264
|
entity, chr, strand, start, stop = fields
|
265
|
+
if chr == 'NA'
|
266
|
+
STDERR.puts "Warning: Record #{fields.inspect} is undefined"
|
267
|
+
next
|
268
|
+
end
|
245
269
|
coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
|
246
270
|
end
|
247
271
|
end
|
@@ -3,6 +3,7 @@ class Cohort_Parser
|
|
3
3
|
fields2extract = get_fields2extract(options)
|
4
4
|
field_numbers = fields2extract.values
|
5
5
|
records = read_records(options, fields2extract, field_numbers)
|
6
|
+
options[:extracted_fields] = fields2extract.keys
|
6
7
|
cohort, rejected_terms, rejected_recs = create_cohort(records, options)
|
7
8
|
return cohort, rejected_terms, rejected_recs
|
8
9
|
end
|
@@ -46,7 +47,7 @@ class Cohort_Parser
|
|
46
47
|
|
47
48
|
def self.get_fields2extract(options)
|
48
49
|
fields2extract = {}
|
49
|
-
[:id_col, :ont_col, :chromosome_col, :start_col, :end_col].each do |field|
|
50
|
+
[:id_col, :ont_col, :chromosome_col, :start_col, :end_col, :sex_col].each do |field|
|
50
51
|
col = options[field]
|
51
52
|
if !col.nil?
|
52
53
|
col = col.to_i if !options[:header]
|
@@ -70,7 +71,7 @@ class Cohort_Parser
|
|
70
71
|
records.each do |id, record|
|
71
72
|
rec = record.first
|
72
73
|
terms = rec.first
|
73
|
-
if options[:names]
|
74
|
+
if options[:names] # Translate hpo names 2 codes
|
74
75
|
init_term_number = terms.length
|
75
76
|
terms, rec_rejected_terms = ont.translate_names(terms)
|
76
77
|
if !rec_rejected_terms.empty?
|
@@ -87,7 +88,11 @@ class Cohort_Parser
|
|
87
88
|
else
|
88
89
|
variants = [] # Not exists genomic region attributes so we create a empty array
|
89
90
|
end
|
90
|
-
|
91
|
+
other_attr = {}
|
92
|
+
if options[:extracted_fields].include?(:sex_col) # Check for additional attributes. -1 is applied to ignore :id in extracted fields
|
93
|
+
other_attr[:sex] = record.first[options[:extracted_fields].index(:sex_col) -1]
|
94
|
+
end
|
95
|
+
cohort.add_record([id, terms, check_variants(variants)], other_attr)
|
91
96
|
end
|
92
97
|
return cohort, rejected_terms.uniq, rejected_recs
|
93
98
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'genomic_features'
|
2
|
+
# Loads genomic reference annotations into Genomic_Feature objects.
# Only the GTF format is currently supported.
class Reference_parser

  # Build a Genomic_Feature from a reference file.
  #
  # file_path: reference file to load.
  # file_format: format name; when nil it is taken from the file extension.
  # feature_type: when given, only records with this feature are kept.
  # Raises ArgumentError when the format is not supported, instead of
  # handing undefined regions to Genomic_Feature.
  def self.load(file_path, file_format: nil, feature_type: nil)
    # File.extname looks only at the last extension, so dots in directory
    # names or in the file name ("Homo_sapiens.GRCh38.gtf") are harmless.
    file_format = File.extname(file_path).delete_prefix('.') if file_format.nil?
    unless file_format == 'gtf'
      raise ArgumentError, "Unsupported reference format '#{file_format}' for file #{file_path}"
    end
    regions, all_attrs = parse_gtf(file_path, feature_type: feature_type)

    return Genomic_Feature.new(regions, annotations: all_attrs)
  end

  # Parse a GTF file (https://www.ensembl.org/info/website/upload/gff.html).
  #
  # Returns two values: the list of [chromosome, start, stop, gene_id]
  # features (any 'chr' text removed from the chromosome) and a hash
  # gene_id => attributes, where the attributes also carry the 'source' and
  # 'feature' columns. When several records share a gene_id, the attributes
  # of the last one are kept.
  def self.parse_gtf(file_path, feature_type: nil)
    features = []
    all_attrs = {}
    # File.foreach closes the file when done
    File.foreach(file_path) do |line|
      next if line.start_with?('#')
      seqname, source, feature, start, stop, _score, _strand, _frame, attribute = line.chomp.split("\t")
      next if attribute.nil? # blank or truncated record without attribute column
      next unless feature_type.nil? || feature_type == feature
      attrs = process_attrs(attribute, ';', ' ')
      attrs['source'] = source
      attrs['feature'] = feature
      id = attrs['gene_id']
      features << [seqname.gsub('chr',''), start.to_i, stop.to_i, id]
      all_attrs[id] = attrs
    end
    return features, all_attrs
  end

  # Internal helper: turn an attribute string such as
  #   gene_id "G1"; gene_name "ABC";
  # into a hash {'gene_id' => 'G1', 'gene_name' => 'ABC'}.
  # Empty tuples are ignored and a key without value maps to an empty string.
  # NOTE: a bare `private` keyword does not affect `def self.` methods, so
  # this method remains callable from outside as it always was.
  def self.process_attrs(attributes, tuple_sep, field_sep)
    return attributes.split(tuple_sep).each_with_object({}) do |attr_pair, parsed|
      key, value = attr_pair.strip.split(field_sep, 2)
      next if key.nil? # empty tuple between separators
      parsed[key] = value.to_s.delete('"')
    end
  end
end
|
data/lib/pets/version.rb
CHANGED
data/lib/pets.rb
CHANGED
data/pets.gemspec
CHANGED
@@ -38,15 +38,18 @@ Gem::Specification.new do |spec|
|
|
38
38
|
|
39
39
|
spec.add_development_dependency "bundler", "~> 2.0"
|
40
40
|
spec.add_development_dependency "rake", "~> 13.0.3"
|
41
|
-
spec.add_development_dependency "rspec", "~> 3.
|
41
|
+
spec.add_development_dependency "rspec", "~> 3.11.0"
|
42
42
|
spec.add_dependency "statistics2"
|
43
43
|
spec.add_dependency "terminal-table"
|
44
44
|
spec.add_dependency "semtools", "~> 0.1.0"
|
45
|
+
spec.add_dependency "NetAnalyzer"
|
45
46
|
spec.add_dependency "report_html"
|
46
47
|
spec.add_dependency "numo-narray"
|
47
48
|
spec.add_dependency "npy"
|
48
49
|
spec.add_dependency "expcalc"
|
49
|
-
spec.add_dependency "
|
50
|
-
|
50
|
+
spec.add_dependency "bio-vcf"
|
51
|
+
spec.add_dependency "parallel", "~> 1.20.1"
|
52
|
+
spec.add_runtime_dependency 'net-ftp'
|
53
|
+
spec.add_runtime_dependency 'net-http'
|
51
54
|
end
|
52
55
|
|
@@ -25,14 +25,34 @@
|
|
25
25
|
<div style="width: 90%; background-color:#ecf0f1; margin: 0 auto;">
|
26
26
|
<h1 style="text-align: center; background-color:#d6eaf8">Patient HPO profiles by cluster.</h1>
|
27
27
|
<%= table(id: :clusters, header: true, border: 2, row_names: false, text: true,
|
28
|
-
cell_align: %w( center )) do |data|
|
29
|
-
|
28
|
+
cell_align: %w( center ), styled: 'dt', attrib: {'class' => 'table'}) do |data|
|
29
|
+
patient_list = []
|
30
|
+
data.each do |element| # Cluster
|
31
|
+
clID, patient_number, patient_ids, hpo_codes, hpo_names = element
|
32
|
+
# TODO: mostrar registro por paciente
|
30
33
|
#STDERR.puts element.inspect
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
+
patient_ids.each_with_index do |patID, i|
|
35
|
+
patient_record = [clID, patient_number]
|
36
|
+
patient_record << patID
|
37
|
+
patient_record << hpo_codes[i].map{|hpo_code| get_hpo_link(hpo_code)}.join(', ')
|
38
|
+
patient_record << hpo_names[i].join(', ')
|
39
|
+
patient_list << patient_record
|
40
|
+
end
|
34
41
|
end
|
42
|
+
data.clear
|
43
|
+
data.concat(patient_list)
|
35
44
|
data.unshift(["Cluster ID","Patients in Cluster","Patient IDs", "HPO codes", "Phenotypes"])
|
36
45
|
end
|
37
46
|
%>
|
38
47
|
</div>
|
48
|
+
|
49
|
+
<div style="width: 90%; background-color:#ecf0f1; margin: 0 auto;">
|
50
|
+
<h1 style="text-align: center; background-color:#d6eaf8"> Cluster detailed view.</h1>
|
51
|
+
<%
|
52
|
+
@hash_vars[:sim_mat4cluster].each do |clID, sim_matrix|
|
53
|
+
@hash_vars[:sim_matrix] = sim_matrix %>
|
54
|
+
<%= heatmap(id: :sim_matrix, header: true, row_names: true, title: "Cluster #{clID}" )%>
|
55
|
+
<% end
|
56
|
+
%>
|
57
|
+
|
58
|
+
</div>
|
@@ -4,7 +4,7 @@
|
|
4
4
|
<% @hash_vars[:similarity_matrixs].each do |pair, similarity_matrix| %>
|
5
5
|
<%
|
6
6
|
matrix_name = pair + '_sim_matrix'
|
7
|
-
if pair
|
7
|
+
if pair.include?('gene_HP')
|
8
8
|
dict = @hash_vars[:evidences][pair][:id2lab]
|
9
9
|
header = similarity_matrix.first
|
10
10
|
header.map! do |item|
|
@@ -27,8 +27,24 @@
|
|
27
27
|
<% end %>
|
28
28
|
<%=circular_genome(id: :candidates, header: false, row_names: true, transpose: false,
|
29
29
|
genomic_coordinates: @hash_vars[:genomic_coordinates] )%>
|
30
|
-
<% if !@hash_vars[:var_ids].nil?
|
31
|
-
|
32
|
-
|
30
|
+
<% if !@hash_vars[:var_ids].nil?
|
31
|
+
if @hash_vars[:var_ids].length > 200 %>
|
32
|
+
<p> Too much variant records</p>
|
33
|
+
<% else %>
|
34
|
+
<%=circular_genome(id: :var_ids, header: false,
|
35
|
+
row_names: true, transpose: false,
|
36
|
+
genomic_coordinates: @hash_vars[:var_coordinates] )%>
|
37
|
+
<% end %>
|
33
38
|
<% end %>
|
39
|
+
|
40
|
+
<h1 style="text-align: center; background-color:#d6eaf8">Candidate regions</h1>
|
41
|
+
|
42
|
+
<%= table(id: :hotspot_table) do |data|
|
43
|
+
data.each do |row|
|
44
|
+
row[1] = row[1].join(" ")
|
45
|
+
row[2] = row[2].join(",")
|
46
|
+
row[3] = row[3].join(",")
|
47
|
+
end
|
48
|
+
end
|
49
|
+
%>
|
34
50
|
</div>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elena Rojano, Pedro Seoane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-08-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 3.
|
47
|
+
version: 3.11.0
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 3.
|
54
|
+
version: 3.11.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: statistics2
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: 0.1.0
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: NetAnalyzer
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: report_html
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -150,6 +164,20 @@ dependencies:
|
|
150
164
|
- - ">="
|
151
165
|
- !ruby/object:Gem::Version
|
152
166
|
version: '0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: bio-vcf
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '0'
|
153
181
|
- !ruby/object:Gem::Dependency
|
154
182
|
name: parallel
|
155
183
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,6 +192,34 @@ dependencies:
|
|
164
192
|
- - "~>"
|
165
193
|
- !ruby/object:Gem::Version
|
166
194
|
version: 1.20.1
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: net-ftp
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :runtime
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: net-http
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - ">="
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0'
|
216
|
+
type: :runtime
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - ">="
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '0'
|
167
223
|
description: PETS suite includes three different tools. CohortAnalyzer performs the
|
168
224
|
calculation of several statistics that gives an overview of a cohort of patients
|
169
225
|
to analyse. Reg2Phen uses associations between pathological phenotypes and regions
|
@@ -185,6 +241,7 @@ executables:
|
|
185
241
|
- fmeasure_index.rb
|
186
242
|
- generate_HPO_IC_table.rb
|
187
243
|
- get_PR_values.rb
|
244
|
+
- get_gen_features.rb
|
188
245
|
- get_network_nodes.rb
|
189
246
|
- get_sorted_profs.rb
|
190
247
|
- install_deps.rb
|
@@ -218,6 +275,7 @@ files:
|
|
218
275
|
- bin/fmeasure_index.rb
|
219
276
|
- bin/generate_HPO_IC_table.rb
|
220
277
|
- bin/get_PR_values.rb
|
278
|
+
- bin/get_gen_features.rb
|
221
279
|
- bin/get_network_nodes.rb
|
222
280
|
- bin/get_sorted_profs.rb
|
223
281
|
- bin/install_deps.rb
|
@@ -269,11 +327,13 @@ files:
|
|
269
327
|
- lib/pets.rb
|
270
328
|
- lib/pets/coPatReporterMethods.rb
|
271
329
|
- lib/pets/cohort.rb
|
330
|
+
- lib/pets/common_optparse.rb
|
272
331
|
- lib/pets/constants.rb
|
273
332
|
- lib/pets/generalMethods.rb
|
274
333
|
- lib/pets/genomic_features.rb
|
275
334
|
- lib/pets/io.rb
|
276
335
|
- lib/pets/parsers/cohort_parser.rb
|
336
|
+
- lib/pets/parsers/reference_parser.rb
|
277
337
|
- lib/pets/phen2reg_methods.rb
|
278
338
|
- lib/pets/reg2phen_methods.rb
|
279
339
|
- lib/pets/version.rb
|
@@ -303,7 +363,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
303
363
|
- !ruby/object:Gem::Version
|
304
364
|
version: '0'
|
305
365
|
requirements: []
|
306
|
-
rubygems_version: 3.
|
366
|
+
rubygems_version: 3.3.7
|
307
367
|
signing_key:
|
308
368
|
specification_version: 4
|
309
369
|
summary: Suite with predictive tools.
|