pets 0.2.3 → 0.2.4
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
data/lib/pets/cohort.rb
ADDED
@@ -0,0 +1,307 @@
+require 'json'
+require 'semtools'
+
+class Cohort
+  @@ont = {}
+  class << self # https://www.ruby-forum.com/t/attr-accessor-for-class-variable/136693
+    attr_accessor :act_ont # Which ontology to use for ontology-related operations
+  end
+
+  attr_accessor :profiles
+
+  def self.get_ontology(ont_id)
+    return @@ont[ont_id]
+  end
+
+  def self.load_ontology(ont_name, ont_file, excluded_terms_file = nil)
+    ont = nil
+    if !ont_file.include?('.json')
+      if !excluded_terms_file.nil?
+        ont = Ontology.new(file: ont_file, load_file: true, removable_terms: read_excluded_ont_file(excluded_terms_file))
+      else
+        ont = Ontology.new(file: ont_file, load_file: true)
+      end
+    else
+      ont = Ontology.new
+      ont.read(ont_file)
+      if !excluded_terms_file.nil?
+        ont.add_removable_terms(read_excluded_ont_file(excluded_terms_file))
+        ont.remove_removable()
+        ont.build_index()
+      end
+    end
+    @@ont[ont_name] = ont
+  end
+
+  def self.read_excluded_ont_file(file)
+    excluded_hpo = []
+    File.open(file).each do |line|
+      excluded_hpo << line.chomp
+    end
+    return excluded_hpo
+  end
+
+  def initialize()
+    @profiles = {}
+    @vars = {}
+    @var_idx = Genomic_Feature.new([])
+  end
+
+  def add_record(rec) # [id, [profile], [[chr1, start1, stop1],[chr1, start1, stop1]]]
+    id, profile, vars = rec
+    @profiles[id] = profile.map{|t| t.to_sym} if !profile.nil?
+    add_gen_feat(id, vars) if !vars.nil?
+  end
+
+  def delete(id)
+    @profiles.delete(id)
+    @vars.delete(id)
+  end
+
+  def select_by_profile!
+    @profiles.select!{|id, profile| yield(id, profile)}
+    current_ids = @profiles.keys
+    @vars.select!{|id, var| current_ids.include?(id)}
+  end
+
+  def select_by_var!
+    @vars.select!{|id, var| yield(id, var)}
+    current_ids = @vars.keys
+    @profiles.select!{|id, profile| current_ids.include?(id)}
+  end
+
+  def filter_by_term_number(n_terms)
+    select_by_profile!{|id, profile| profile.length >= n_terms}
+  end
+
+  def remove_incomplete_records # remove records that lack vars or phenotypes
+    ids_with_terms = @profiles.keys
+    ids_with_vars = []
+    @vars.each{|id, regs| ids_with_vars << id if regs.length > 0}
+    full_ids = ids_with_vars & ids_with_terms
+    @profiles.select!{|id, prof| full_ids.include?(id)}
+    @vars.select!{|id, var| full_ids.include?(id)}
+  end
+
+  def add_gen_feat(id, feat_array) # [[chr1, start1, stop1],[chr1, start1, stop1]]
+    @vars[id] = Genomic_Feature.new(feat_array)
+  end
+
+  def get_profile(id)
+    return @profiles[id]
+  end
+
+  def get_vars(id)
+    return @vars[id]
+  end
+
+  def each_profile()
+    @profiles.each do |id, profile|
+      yield(id, profile)
+    end
+  end
+
+  def each_var()
+    @vars.each do |id, var_info|
+      yield(id, var_info)
+    end
+  end
+
+  def get_general_profile(thr=0) # TODO: move functionality to semtools
+    term_count = Hash.new(0)
+    each_profile do |id, prof|
+      prof.each do |term|
+        term_count[term] += 1
+      end
+    end
+    records = @profiles.length
+    general_profile = []
+    term_count.each do |term, count|
+      general_profile << term if count.fdiv(records) >= thr
+    end
+    ont = @@ont[Cohort.act_ont]
+    return ont.clean_profile_hard(general_profile)
+  end
+
+  def check(hard=false) # OLD format_patient_data
+    ont = @@ont[Cohort.act_ont]
+    rejected_terms = []
+    rejected_recs = []
+    @profiles.each do |id, terms|
+      if hard
+        terms = ont.clean_profile_hard(terms)
+        rejec_terms = []
+      else
+        terms, rejec_terms = ont.check_ids(terms)
+      end
+      if !rejec_terms.empty?
+        STDERR.puts "WARNING: record #{id} has unknown codes '#{rejec_terms.join(',')}'. Codes removed."
+        rejected_terms.concat(rejec_terms)
+      end
+      if terms.empty?
+        rejected_recs << id
+      else
+        @profiles[id] = terms
+      end
+    end
+    @profiles.select!{|id, record| !rejected_recs.include?(id)}
+    @vars.select!{|id, record| !rejected_recs.include?(id)}
+    return rejected_terms.uniq, rejected_recs
+  end
+
+  def link2ont(ont_id)
+    @@ont[ont_id].load_profiles(@profiles)
+  end
+
+  def get_profile_redundancy
+    ont = @@ont[Cohort.act_ont]
+    profile_sizes, parental_terms_per_profile = ont.get_profile_redundancy
+    return profile_sizes, parental_terms_per_profile
+  end
+
+  def get_profiles_terms_frequency(options={})
+    ont = @@ont[Cohort.act_ont]
+    term_stats = ont.get_profiles_terms_frequency(**options) # https://www.ruby-lang.org/en/news/2019/12/12/separation-of-positional-and-keyword-arguments-in-ruby-3-0/
+    return term_stats
+  end
+
+  def compute_term_list_and_childs()
+    ont = @@ont[Cohort.act_ont]
+    suggested_childs, term_with_childs_ratio = ont.compute_term_list_and_childs()
+  end
+
+  def get_profile_ontology_distribution_tables()
+    ont = @@ont[Cohort.act_ont]
+    ontology_levels, distribution_percentage = ont.get_profile_ontology_distribution_tables
+    ontology_levels.unshift(["level", "ontology", "cohort"])
+    distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
+    return ontology_levels, distribution_percentage
+  end
+
+  def get_ic_analysis()
+    ont = @@ont[Cohort.act_ont]
+    onto_ic, freq_ic = ont.get_observed_ics_by_onto_and_freq # IC for TERMS
+    onto_ic_profile, freq_ic_profile = ont.get_profiles_resnik_dual_ICs # IC for PROFILES
+    return onto_ic, freq_ic, onto_ic_profile, freq_ic_profile
+  end
+
+  def get_profiles_mean_size
+    ont = @@ont[Cohort.act_ont]
+    profile_mean_size = ont.get_profiles_mean_size
+    return profile_mean_size
+  end
+
+  def get_profile_length_at_percentile(perc=50, increasing_sort: false)
+    ont = @@ont[Cohort.act_ont]
+    length_percent = ont.get_profile_length_at_percentile(perc, increasing_sort: increasing_sort)
+    return length_percent
+  end
+
+  def get_dataset_specifity_index(type)
+    ont = @@ont[Cohort.act_ont]
+    dsi = ont.get_dataset_specifity_index(type)
+    return dsi
+  end
+
+  def compare_profiles(options={})
+    ont = @@ont[Cohort.act_ont]
+    similarities = ont.compare_profiles(**options)
+    return similarities
+  end
+
+  def index_vars # equivalent to process_patient_data
+    each_var do |id, var|
+      @var_idx.merge(var, id)
+    end
+  end
+
+  def get_vars_sizes(summary=false)
+    if summary
+      return @var_idx.get_summary_sizes
+    else
+      return @var_idx.get_sizes
+    end
+  end
+
+  def generate_cluster_regions(meth, tag, lim)
+    @var_idx.generate_cluster_regions(meth, tag, lim)
+  end
+
+  def save(output_file, mode = :default, translate = false)
+    File.open(output_file, 'w') do |f|
+      f.puts "id\tchr\tstart\tstop\tterms" if mode == :paco
+      ont = @@ont[Cohort.act_ont]
+      @profiles.each do |id, terms|
+        terms, rejected = ont.translate_ids(terms) if translate
+        id_variants = @vars[id]
+        variants = []
+        if id_variants.nil? || id_variants.length == 0
+          variants << ['-', '-', '-']
+        else
+          id_variants.each do |chr, reg|
+            variants << [chr, reg[:start], reg[:stop]]
+          end
+        end
+        variants.each do |var|
+          if mode == :default
+            f.puts "#{id}\t#{terms.join('|')}\t#{var.join("\t")}"
+          elsif mode == :paco
+            f.puts "#{id}\t#{var.join("\t")}\t#{terms.join('|')}"
+          else
+            abort('Wrong save mode option, please try default or paco')
+          end
+        end
+      end
+    end
+  end
+
+  def export_phenopackets(output_folder, genome_assembly, vcf_index: nil)
+    ont = @@ont[Cohort.act_ont]
+    metaData = {
+      "createdBy" => "PETS",
+      "resources" => [{
+        "id" => "hp",
+        "name" => "human phenotype ontology",
+        "namespacePrefix" => "HP",
+        "url" => "http://purl.obolibrary.org/obo/hp.owl",
+        # "version" => "2018-03-08",
+        "iriPrefix" => "http://purl.obolibrary.org/obo/HP_"
+      }]
+    }
+
+    @profiles.each do |id, terms|
+      phenopacket = {metaData: metaData}
+      phenopacket[:subject] = {id: id}
+      phenotypicFeatures = []
+      terms.each do |term|
+        term_name = ont.translate_id(term)
+        phenotypicFeatures << {
+          type: { id: term, label: term_name},
+          classOfOnset: {"id" => "HP:0003577", "label" => "Congenital onset"}
+        }
+      end
+      phenopacket[:phenotypicFeatures] = phenotypicFeatures
+      if !vcf_index.nil? && vcf_index.include?(id)
+        htsFiles = []
+        htsFiles << {
+          "uri" => "file:/" + vcf_index[id],
+          "description" => id,
+          "htsFormat" => "VCF",
+          "genomeAssembly" => genome_assembly,
+          "individualToSampleIdentifiers" => { "patient1" => id }
+        }
+        phenopacket[:htsFiles] = htsFiles
+      end
+      File.open(File.join(output_folder, id.to_s + ".json"), "w") { |f| f.write JSON.pretty_generate(phenopacket) }
+      id_variants = @vars[id]
+      variants = []
+      if id_variants.nil? || id_variants.length == 0
+        variants << ['-', '-', '-']
+      else
+        id_variants.each do |chr, reg|
+          variants << [chr, reg[:start], reg[:stop]]
+        end
+      end
+    end
+  end
+end
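The new Cohort class bundles what the bin/ scripts previously assembled by hand: a shared ontology registry (@@ont), one term profile per record, and one Genomic_Feature per record. A minimal usage sketch of the API above follows; the require, the hp.json path, and the HP codes are illustrative, not taken from this diff:

  require 'pets' # assumes the gem entry point loads Cohort and semtools

  Cohort.load_ontology(:hpo, 'external_data/hp.json') # illustrative path
  Cohort.act_ont = :hpo # ontology used by check, save, get_general_profile, etc.

  cohort = Cohort.new
  cohort.add_record(['pat_1', ['HP:0001250', 'HP:0004322'], [['chr1', 100, 200]]])
  cohort.check # drop unknown codes and now-empty records
  cohort.filter_by_term_number(2) # keep records with at least two terms
  cohort.index_vars # merge per-record variants into the shared Genomic_Feature index
  cohort.save('cohort.paco', :paco, true) # translate term ids, write PACO layout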
data/lib/pets/constants.rb
ADDED
@@ -0,0 +1,7 @@
+# The file requiring this one must define the ROOT_PATH constant
+REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
+EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
+EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
+HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
+MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
+IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
data/lib/pets/generalMethods.rb
CHANGED
@@ -13,80 +13,15 @@ def system_call(code_folder, script, args_string)
   end
 end
 
-def
-
-
-
-
-
-
-
-#Common methods for predictors
-#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
-#1. Indexing by chr (region)
-def coor_overlap?(ref_start, ref_stop, start, stop)
-  overlap = false
-  if (stop > ref_start && stop <= ref_stop) ||
-    (start >= ref_start && start < ref_stop) ||
-    (start <= ref_start && stop >= ref_stop) ||
-    (start > ref_start && stop < ref_stop)
-    overlap = true
-  end
-  return overlap
-end
-
-def load_training_file4regions(training_file)
-  training_set = {}
-  posInfo = loadFile(training_file)
-  posInfo.each do |info|
-    chr = info.shift
-    query = training_set[chr]
-    if query.nil?
-      training_set[chr] = [info]
-    else
-      query << info
-    end
-  end
-  return training_set
-end
-
-#2. Indexing by hpo (code)
-#prepare training file for analysis using phenotype2region prediction
-def load_training_file4HPO(training_file, thresold=0)
-  training_set = {}
-  information = loadFile(training_file, thresold)
-  information.each do |info|
-    hpoCode = info.delete_at(4)
-    query = training_set[hpoCode]
-    if query.nil?
-      training_set[hpoCode] = [info]
-    else
-      query << info
-    end
-  end
-  # STDERR.puts training_set.keys.inspect
-  return training_set
-end
-
-
-#3. Load training info file:
-#Chr;Start;Stop;HPO;Association;node
-def loadFile(file, thresold=0)
-  information = []
-  File.open(file).each do |line|
-    line.chomp!
-    allInfo = line.split("\t")
-    associationValue = allInfo[4].to_f
-    if associationValue >= thresold
-      chr = allInfo[0]
-      startPos = allInfo[1].to_i
-      stopPos = allInfo[2].to_i
-      hpoCode = allInfo[3]
-      nodeID = allInfo[5]
-      information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
-    end
+def add_record(hash, key, record, uniq=false)
+  query = hash[key]
+  if query.nil?
+    hash[key] = [record]
+  elsif !uniq # duplicate entries are allowed
+    query << record
+  elsif !query.include?(record) # only unique entries are kept
+    query << record
   end
-  return information
 end
 
 
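The rewritten add_record generalizes the removed hash-of-arrays helpers (and replaces the old add_record removed further down this diff): by default it appends every record, while uniq=true reproduces the old skip-duplicates behavior. A small illustrative example, with made-up keys and values:

  index = {}
  add_record(index, 'chr1', 'pat_1') # creates the bucket
  add_record(index, 'chr1', 'pat_1') # appended again: duplicates allowed by default
  add_record(index, 'chr1', 'pat_1', true) # skipped: record already present
  index # => {"chr1"=>["pat_1", "pat_1"]}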
@@ -111,105 +46,6 @@ def compute_IC_values(patient_data, total_patients)
   return patients_per_hpo
 end
 
-def load_hpo_ci_values(information_coefficient_file)
-  hpos_ci_values = {}
-  File.open(information_coefficient_file).each do |line|
-    line.chomp!
-    hpo_code, ci = line.split("\t")
-    hpos_ci_values[hpo_code.to_sym] = ci.to_f
-  end
-  return hpos_ci_values
-end
-
-def load_clustered_patients(file)
-  clusters = {}
-  File.open(file).each do |line|
-    line.chomp!
-    pat_id, cluster_id = line.split("\t")
-    query = clusters[cluster_id]
-    if query.nil?
-      clusters[cluster_id] = [pat_id]
-    else
-      query << pat_id
-    end
-  end
-  return clusters
-end
-
-def load_gene_data(gene_data_path)
-  gene_list = {} #geneID => attr
-  gene_location = {} # chr => gene
-  infile = open(gene_data_path)
-  gz = Zlib::GzipReader.new(infile)
-  current_chr = nil
-  genes = []
-  gz.each_line do |line|
-    line.chomp!
-    next if line =~ /^#/
-    fields = line.split("\t")
-    if fields[8].include?('genome=chromosome')
-      chr = fields[8].split(';')[1].split('=').last
-      gene_location[current_chr] = genes
-      genes = []
-      current_chr = chr
-    elsif fields[2] == 'gene'
-      attributes = {}
-      fields[8].split(';').each do |pair|
-        key, value = pair.split('=')
-        attributes[key] = value
-      end
-      geneName = nil
-      geneName = attributes['gene'] if !attributes['gene'].nil?
-      geneSyns = []
-      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
-      description = attributes['description']
-      description = URI.unescape(description) if !description.nil?
-      attributes['Dbxref'] =~ /GeneID:(\d+)/
-      gene_list[$1] = [geneName, geneSyns, description]
-      genes << [$1, fields[3].to_i, fields[4].to_i]
-    end
-  end
-  gene_location[current_chr] = genes
-  return gene_list, gene_location
-end
-
-def parse_kegg_data(query_genes)
-  kegg_data = {} #gene => attb
-  while !query_genes.empty?
-    gene_set = query_genes.shift(10)
-    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
-    uri = URI(url)
-    response = Net::HTTP.get(uri)
-    geneID = nil
-    gene_names = []
-    definition = nil
-    pathways = []
-    parsing_pathway_field = false
-    response.squeeze(' ').each_line do |line|
-      line.chomp!
-      if line =~ /^ENTRY/
-        geneID = line.split(' ')[1]
-      elsif line =~ /^NAME/
-        gene_names = line.split(' ', 2).last.split(', ')
-      elsif line =~ /^DEFINITION/
-        definition = line.split(' ', 2)[1]
-      elsif line =~ /^PATHWAY/
-        pathways << line.split(' ', 3)[1..2]
-        parsing_pathway_field = true
-      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
-        parsing_pathway_field = false
-      elsif parsing_pathway_field
-        pathways << line.strip.split(' ', 2)
-      elsif line == '///'
-        parsing_pathway_field = false
-        kegg_data[geneID] = [gene_names, definition, pathways]
-        pathways = []
-        gene_names = []
-      end
-    end
-  end
-  return kegg_data
-end
 
 def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
   kegg_data = {}
@@ -270,21 +106,6 @@ def merge_genes_with_kegg_data(gene_list, kegg_data)
   return merged_data
 end
 
-def write_compressed_plain_file(data, path)
-  File.open(path, 'w') do |f|
-    gz = Zlib::GzipWriter.new(f)
-    gz.write data.to_json
-    gz.close
-  end
-end
-
-def read_compressed_json(path)
-  infile = open(path)
-  gz = Zlib::GzipReader.new(infile)
-  object = JSON.parse(gz.read)
-  return object
-end
-
 def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
   pathways_genes_in_predictions = {}
   genes_in_predictions = []
@@ -358,138 +179,8 @@ def binom(n,k)
   end
 end
 
-def get_reference(genomic_ranges)
-  #genomic_ranges = [patientID, mut_start, mut_stop]
-  reference = []
-  reference.concat(genomic_ranges.map{|gr| gr[1]})# get start
-  reference.concat(genomic_ranges.map{|gr| gr[2]})# get stop
-  reference.uniq!
-  reference.sort!
-  #Define overlap range
-  final_reference = []
-  reference.each_with_index do |coord,i|
-    next_coord = reference[i + 1]
-    final_reference << [coord, next_coord] if !next_coord.nil?
-  end
-  return final_reference
-end
-
-def overlap_patients(genomic_ranges, reference)
-  overlaps = []
-  reference.each do |start, stop|
-    patients = []
-    genomic_ranges.each do |pt_id, pt_start, pt_stop|
-      if (start <= pt_start && stop >= pt_stop) ||
-        (start > pt_start && stop < pt_stop) ||
-        (stop > pt_start && stop <= pt_stop) ||
-        (start >= pt_start && start < pt_stop)
-        patients << pt_id
-      end
-    end
-    overlaps << patients.uniq
-  end
-  return overlaps
-end
-
-def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
-  patients_out_of_cluster = 0
-  patients_by_cluster = {}
-  sors = []
-  patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
-    reference = get_reference(genomic_ranges) # Get putative overlap regions
-    overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region
-    clust_number = 1
-    reference.each_with_index do |ref, i|
-      current_patients = overlapping_patients[i]
-      if current_patients.length > pat_per_reg
-        ref << chrm
-        node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
-        ref << node_identifier
-        save_sor(current_patients, node_identifier, patients_by_cluster)
-        sors << ref
-        clust_number += 1
-      end
-    end
-  end
-  return patients_by_cluster, sors
-end
 
-def save_sor(current_patients, node_identifier, patients_by_cluster)
-  current_patients.each do |patient|
-    add_record(patients_by_cluster, patient, node_identifier)
-  end
-end
 
-def add_record(hash, key, record)
-  query = hash[key]
-  if query.nil?
-    hash[key] = [record]
-  elsif !query.include?(record)
-    query << record
-  end
-end
-
-def load_patient_cohort(options)
-  patient_data = {}
-  count = 0
-  fields2extract = get_fields2extract(options)
-  field_numbers = fields2extract.values
-  File.open(options[:input_file]).each do |line|
-    line.chomp!
-    if options[:header] && count == 0
-      line.gsub!(/#\s*/,'') # correct comment like headers
-      field_names = line.split("\t")
-      get_field_numbers2extract(field_names, fields2extract)
-      field_numbers = fields2extract.values
-    else
-      fields = line.split("\t")
-      pat_record = field_numbers.map{|n| fields[n]}
-      if fields2extract[:pat_id_col].nil?
-        pat_id = "pat_#{count}" #generate ids
-      else
-        original_id = pat_record.shift
-        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
-      end
-      if !pat_record[0].nil?
-        pat_record[0] = pat_record[0].split(options[:hpo_separator])
-      else
-        pat_record[0] = []
-      end
-      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
-      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
-      patient_data[pat_id] = pat_record
-    end
-    count +=1
-  end
-  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
-  return patient_data
-end
-
-def get_fields2extract(options)
-  fields2extract = {}
-  [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
-    col = options[field]
-    if !col.nil?
-      col = col.to_i if !options[:header]
-      fields2extract[field] = col
-    end
-  end
-  return fields2extract
-end
-
-def get_field_numbers2extract(field_names, fields2extract)
-  fields2extract.each do |field, name|
-    fields2extract[field] = field_names.index(name)
-  end
-end
-
-def download(ftp_server, path, name)
-  ftp = Net::FTP.new()
-  ftp.connect(ftp_server)
-  ftp.login
-  ftp.getbinaryfile(path, name)
-  ftp.close
-end
 
 def get_and_parse_external_data(all_paths)
   sources = [