pets 0.2.3 → 0.2.4
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
data/lib/pets/cohort.rb
ADDED
@@ -0,0 +1,307 @@
+require 'json'
+require 'semtools'
+
+class Cohort
+  @@ont = {}
+  class << self # https://www.ruby-forum.com/t/attr-accessor-for-class-variable/136693
+    attr_accessor :act_ont # Which ontology to use for ontology-related operations
+  end
+
+  attr_accessor :profiles
+
+  def self.get_ontology(ont_id)
+    return @@ont[ont_id]
+  end
+
+  def self.load_ontology(ont_name, ont_file, excluded_terms_file = nil)
+    ont = nil
+    if !ont_file.include?('.json')
+      if !excluded_terms_file.nil?
+        ont = Ontology.new(file: ont_file, load_file: true, removable_terms: read_excluded_ont_file(excluded_terms_file))
+      else
+        ont = Ontology.new(file: ont_file, load_file: true)
+      end
+    else
+      ont = Ontology.new
+      ont.read(ont_file)
+      if !excluded_terms_file.nil?
+        ont.add_removable_terms(read_excluded_ont_file(excluded_terms_file))
+        ont.remove_removable()
+        ont.build_index()
+      end
+    end
+    @@ont[ont_name] = ont
+  end
+
+  def self.read_excluded_ont_file(file)
+    excluded_hpo = []
+    File.open(file).each do |line|
+      excluded_hpo << line.chomp
+    end
+    return excluded_hpo
+  end
+
+  def initialize()
+    @profiles = {}
+    @vars = {}
+    @var_idx = Genomic_Feature.new([])
+  end
+
+  def add_record(rec) # [id, [profile], [[chr1, start1, stop1], [chr1, start1, stop1]]]
+    id, profile, vars = rec
+    @profiles[id] = profile.map{|t| t.to_sym} if !profile.nil?
+    add_gen_feat(id, vars) if !vars.nil?
+  end
+
+  def delete(id)
+    @profiles.delete(id)
+    @vars.delete(id)
+  end
+
+  def select_by_profile!
+    @profiles.select!{|id, profile| yield(id, profile)}
+    current_ids = @profiles.keys
+    @vars.select!{|id, var| current_ids.include?(id)}
+  end
+
+  def select_by_var!
+    @vars.select!{|id, profile| yield(id, profile)}
+    current_ids = @vars.keys
+    @profiles.select!{|id, var| current_ids.include?(id)}
+  end
+
+  def filter_by_term_number(n_terms)
+    select_by_profile!{|id, profile| profile.length >= n_terms}
+  end
+
+  def remove_incomplete_records # remove records that lack vars or phenotypes
+    ids_with_terms = @profiles.keys
+    ids_with_vars = []
+    @vars.each{|id, regs| ids_with_vars << id if regs.length > 0}
+    full_ids = ids_with_vars & ids_with_terms
+    @profiles.select!{|id, prof| full_ids.include?(id)}
+    @vars.select!{|id, var| full_ids.include?(id)}
+  end
+
+  def add_gen_feat(id, feat_array) # [[chr1, start1, stop1], [chr1, start1, stop1]]
+    @vars[id] = Genomic_Feature.new(feat_array)
+  end
+
+  def get_profile(id)
+    return @profiles[id]
+  end
+
+  def get_vars(id)
+    return @vars[id]
+  end
+
+  def each_profile()
+    @profiles.each do |id, profile|
+      yield(id, profile)
+    end
+  end
+
+  def each_var()
+    @vars.each do |id, var_info|
+      yield(id, var_info)
+    end
+  end
+
+  def get_general_profile(thr=0) # TODO: move functionality to semtools
+    term_count = Hash.new(0)
+    each_profile do |id, prof|
+      prof.each do |term|
+        term_count[term] += 1
+      end
+    end
+    records = @profiles.length
+    general_profile = []
+    term_count.each do |term, count|
+      general_profile << term if count.fdiv(records) >= thr
+    end
+    ont = @@ont[Cohort.act_ont]
+    return ont.clean_profile_hard(general_profile)
+  end
+
+  def check(hard=false) # OLD format_patient_data
+    ont = @@ont[Cohort.act_ont]
+    rejected_terms = []
+    rejected_recs = []
+    @profiles.each do |id, terms|
+      if hard
+        terms = ont.clean_profile_hard(terms)
+        rejec_terms = []
+      else
+        terms, rejec_terms = ont.check_ids(terms)
+      end
+      if !rejec_terms.empty?
+        STDERR.puts "WARNING: record #{id} has the unknown CODES '#{rejec_terms.join(',')}'. Codes removed."
+        rejected_terms.concat(rejec_terms)
+      end
+      if terms.empty?
+        rejected_recs << id
+      else
+        @profiles[id] = terms
+      end
+    end
+    @profiles.select!{|id, record| !rejected_recs.include?(id)}
+    @vars.select!{|id, record| !rejected_recs.include?(id)}
+    return rejected_terms.uniq, rejected_recs
+  end
+
+  def link2ont(ont_id)
+    @@ont[ont_id].load_profiles(@profiles)
+  end
+
+  def get_profile_redundancy
+    ont = @@ont[Cohort.act_ont]
+    profile_sizes, parental_terms_per_profile = ont.get_profile_redundancy
+    return profile_sizes, parental_terms_per_profile
+  end
+
+  def get_profiles_terms_frequency(options={})
+    ont = @@ont[Cohort.act_ont]
+    term_stats = ont.get_profiles_terms_frequency(**options) # https://www.ruby-lang.org/en/news/2019/12/12/separation-of-positional-and-keyword-arguments-in-ruby-3-0/
+    return term_stats
+  end
+
+  def compute_term_list_and_childs()
+    ont = @@ont[Cohort.act_ont]
+    suggested_childs, term_with_childs_ratio = ont.compute_term_list_and_childs()
+  end
+
+  def get_profile_ontology_distribution_tables()
+    ont = @@ont[Cohort.act_ont]
+    ontology_levels, distribution_percentage = ont.get_profile_ontology_distribution_tables
+    ontology_levels.unshift(["level", "ontology", "cohort"])
+    distribution_percentage.unshift(["level", "ontology", "weighted cohort", "uniq terms cohort"])
+    return ontology_levels, distribution_percentage
+  end
+
+  def get_ic_analysis()
+    ont = @@ont[Cohort.act_ont]
+    onto_ic, freq_ic = ont.get_observed_ics_by_onto_and_freq # IC for TERMS
+    onto_ic_profile, freq_ic_profile = ont.get_profiles_resnik_dual_ICs # IC for PROFILES
+    return onto_ic, freq_ic, onto_ic_profile, freq_ic_profile
+  end
+
+  def get_profiles_mean_size
+    ont = @@ont[Cohort.act_ont]
+    profile_mean_size = ont.get_profiles_mean_size
+    return profile_mean_size
+  end
+
+  def get_profile_length_at_percentile(perc=50, increasing_sort: false)
+    ont = @@ont[Cohort.act_ont]
+    length_percent = ont.get_profile_length_at_percentile(perc=perc, increasing_sort: increasing_sort)
+    return length_percent
+  end
+
+  def get_dataset_specifity_index(type)
+    ont = @@ont[Cohort.act_ont]
+    dsi = ont.get_dataset_specifity_index(type)
+    return dsi
+  end
+
+  def compare_profiles(options={})
+    ont = @@ont[Cohort.act_ont]
+    similarities = ont.compare_profiles(**options)
+    return similarities
+  end
+
+  def index_vars # equivalent to process_patient_data
+    each_var do |id, var|
+      @var_idx.merge(var, id)
+    end
+  end
+
+  def get_vars_sizes(summary=false)
+    if summary
+      return @var_idx.get_summary_sizes
+    else
+      return @var_idx.get_sizes
+    end
+  end
+
+  def generate_cluster_regions(meth, tag, lim)
+    @var_idx.generate_cluster_regions(meth, tag, lim)
+  end
+
+  def save(output_file, mode = :default, translate = false)
+    File.open(output_file, 'w') do |f|
+      f.puts "id\tchr\tstart\tstop\tterms" if mode == :paco
+      ont = @@ont[Cohort.act_ont]
+      @profiles.each do |id, terms|
+        terms, rejected = ont.translate_ids(terms) if translate
+        id_variants = @vars[id]
+        variants = []
+        if id_variants.nil? || id_variants.length == 0
+          variants << ['-', '-', '-']
+        else
+          id_variants.each do |chr, reg|
+            variants << [chr, reg[:start], reg[:stop]]
+          end
+        end
+        variants.each do |var|
+          if mode == :default
+            f.puts "#{id}\t#{terms.join('|')}\t#{var.join("\t")}"
+          elsif mode == :paco
+            f.puts "#{id}\t#{var.join("\t")}\t#{terms.join('|')}"
+          else
+            abort('Wrong save mode option, please try default or paco')
+          end
+        end
+      end
+    end
+  end
+
+  def export_phenopackets(output_folder, genome_assembly, vcf_index: nil)
+    ont = @@ont[Cohort.act_ont]
+    metaData = {
+      "createdBy" => "PETS",
+      "resources" => [{
+        "id" => "hp",
+        "name" => "human phenotype ontology",
+        "namespacePrefix" => "HP",
+        "url" => "http://purl.obolibrary.org/obo/hp.owl",
+        # "version" => "2018-03-08",
+        "iriPrefix" => "http://purl.obolibrary.org/obo/HP_"
+      }]
+    }
+
+    @profiles.each do |id, terms|
+      phenopacket = {metaData: metaData}
+      phenopacket[:subject] = {id: id}
+      phenotypicFeatures = []
+      terms.each do |term|
+        term_name = ont.translate_id(term)
+        phenotypicFeatures << {
+          type: { id: term, label: term_name},
+          classOfOnset: {"id" => "HP:0003577", "label" => "Congenital onset"}
+        }
+      end
+      phenopacket[:phenotypicFeatures] = phenotypicFeatures
+      if !vcf_index.nil? && vcf_index.include?(id)
+        htsFiles = []
+        htsFiles << {
+          "uri" => "file:/" + vcf_index[id],
+          "description" => id,
+          "htsFormat" => "VCF",
+          "genomeAssembly" => genome_assembly,
+          "individualToSampleIdentifiers" => { "patient1" => id }
+        }
+        phenopacket[:htsFiles] = htsFiles
+      end
+      File.open(File.join(output_folder, id.to_s + ".json"), "w") { |f| f.write JSON.pretty_generate(phenopacket) }
+      id_variants = @vars[id]
+      variants = []
+      if id_variants.nil? || id_variants.length == 0
+        variants << ['-', '-', '-']
+      else
+        id_variants.each do |chr, reg|
+          variants << [chr, reg[:start], reg[:stop]]
+        end
+      end
+    end
+  end
+end
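For orientation, a minimal usage sketch of the new Cohort class. The ontology path and record values below are hypothetical, and it assumes that `require 'pets'` loads the new classes and that the Ontology behaviour comes from the semtools gem used above:

  require 'pets'

  Cohort.load_ontology(:hpo, 'external_data/hp.json') # hypothetical path
  Cohort.act_ont = :hpo

  cohort = Cohort.new
  cohort.add_record(['patient_1', ['HP:0001250', 'HP:0004322'], [['chr1', 100, 200]]])
  rejected_terms, rejected_recs = cohort.check # drop unknown term codes
  cohort.save('cohort_data.txt', :default, true) # translate term ids on save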
data/lib/pets/constants.rb
ADDED
@@ -0,0 +1,7 @@
+# The file that requires this one must define the ROOT_PATH constant
+REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
+EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
+EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
+HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
+MONDO_FILE = File.join(EXTERNAL_DATA, 'mondo.obo')
+IC_FILE = File.join(EXTERNAL_DATA, 'uniq_hpo_with_CI.txt')
data/lib/pets/generalMethods.rb
CHANGED
@@ -13,80 +13,15 @@ def system_call(code_folder, script, args_string)
   end
 end
 
-def
-
-
-
-
-
-
-
-#Common methods for predictors
-#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
-#1. Indexing by chr (region)
-def coor_overlap?(ref_start, ref_stop, start, stop)
-  overlap = false
-  if (stop > ref_start && stop <= ref_stop) ||
-    (start >= ref_start && start < ref_stop) ||
-    (start <= ref_start && stop >= ref_stop) ||
-    (start > ref_start && stop < ref_stop)
-    overlap = true
-  end
-  return overlap
-end
-
-def load_training_file4regions(training_file)
-  training_set = {}
-  posInfo = loadFile(training_file)
-  posInfo.each do |info|
-    chr = info.shift
-    query = training_set[chr]
-    if query.nil?
-      training_set[chr] = [info]
-    else
-      query << info
-    end
-  end
-  return training_set
-end
-
-#2. Indexing by hpo (code)
-#prepare training file for analysis using phenotype2region prediction
-def load_training_file4HPO(training_file, thresold=0)
-  training_set = {}
-  information = loadFile(training_file, thresold)
-  information.each do |info|
-    hpoCode = info.delete_at(4)
-    query = training_set[hpoCode]
-    if query.nil?
-      training_set[hpoCode] = [info]
-    else
-      query << info
-    end
-  end
-  # STDERR.puts training_set.keys.inspect
-  return training_set
-end
-
-
-#3. Load training info file:
-#Chr;Start;Stop;HPO;Association;node
-def loadFile(file, thresold=0)
-  information = []
-  File.open(file).each do |line|
-    line.chomp!
-    allInfo = line.split("\t")
-    associationValue = allInfo[4].to_f
-    if associationValue >= thresold
-      chr = allInfo[0]
-      startPos = allInfo[1].to_i
-      stopPos = allInfo[2].to_i
-      hpoCode = allInfo[3]
-      nodeID = allInfo[5]
-      information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
-    end
+def add_record(hash, key, record, uniq=false)
+  query = hash[key]
+  if query.nil?
+    hash[key] = [record]
+  elsif !uniq # We do not care about repeated entries
+    query << record
+  elsif !query.include?(record) # We want uniq entries
+    query << record
   end
-  return information
 end
 
 
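The refactored add_record helper now takes a uniq flag that controls whether duplicates are appended. A quick usage sketch with hypothetical values:

  clusters = {}
  add_record(clusters, 'chr1.1.A.3', 'pat_1')
  add_record(clusters, 'chr1.1.A.3', 'pat_1', true) # skipped, already present
  # clusters => {"chr1.1.A.3"=>["pat_1"]}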
@@ -111,105 +46,6 @@ def compute_IC_values(patient_data, total_patients)
   return patients_per_hpo
 end
 
-def load_hpo_ci_values(information_coefficient_file)
-  hpos_ci_values = {}
-  File.open(information_coefficient_file).each do |line|
-    line.chomp!
-    hpo_code, ci = line.split("\t")
-    hpos_ci_values[hpo_code.to_sym] = ci.to_f
-  end
-  return hpos_ci_values
-end
-
-def load_clustered_patients(file)
-  clusters = {}
-  File.open(file).each do |line|
-    line.chomp!
-    pat_id, cluster_id = line.split("\t")
-    query = clusters[cluster_id]
-    if query.nil?
-      clusters[cluster_id] = [pat_id]
-    else
-      query << pat_id
-    end
-  end
-  return clusters
-end
-
-def load_gene_data(gene_data_path)
-  gene_list = {} #geneID => attr
-  gene_location = {} # chr => gene
-  infile = open(gene_data_path)
-  gz = Zlib::GzipReader.new(infile)
-  current_chr = nil
-  genes = []
-  gz.each_line do |line|
-    line.chomp!
-    next if line =~ /^#/
-    fields = line.split("\t")
-    if fields[8].include?('genome=chromosome')
-      chr = fields[8].split(';')[1].split('=').last
-      gene_location[current_chr] = genes
-      genes = []
-      current_chr = chr
-    elsif fields[2] == 'gene'
-      attributes = {}
-      fields[8].split(';').each do |pair|
-        key, value = pair.split('=')
-        attributes[key] = value
-      end
-      geneName = nil
-      geneName = attributes['gene'] if !attributes['gene'].nil?
-      geneSyns = []
-      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
-      description = attributes['description']
-      description = URI.unescape(description) if !description.nil?
-      attributes['Dbxref'] =~ /GeneID:(\d+)/
-      gene_list[$1] = [geneName, geneSyns, description]
-      genes << [$1, fields[3].to_i, fields[4].to_i]
-    end
-  end
-  gene_location[current_chr] = genes
-  return gene_list, gene_location
-end
-
-def parse_kegg_data(query_genes)
-  kegg_data = {} #gene => attb
-  while !query_genes.empty?
-    gene_set = query_genes.shift(10)
-    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
-    uri = URI(url)
-    response = Net::HTTP.get(uri)
-    geneID = nil
-    gene_names = []
-    definition = nil
-    pathways = []
-    parsing_pathway_field = false
-    response.squeeze(' ').each_line do |line|
-      line.chomp!
-      if line =~ /^ENTRY/
-        geneID = line.split(' ')[1]
-      elsif line =~ /^NAME/
-        gene_names = line.split(' ', 2).last.split(', ')
-      elsif line =~ /^DEFINITION/
-        definition = line.split(' ', 2)[1]
-      elsif line =~ /^PATHWAY/
-        pathways << line.split(' ', 3)[1..2]
-        parsing_pathway_field = true
-      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
-        parsing_pathway_field = false
-      elsif parsing_pathway_field
-        pathways << line.strip.split(' ', 2)
-      elsif line == '///'
-        parsing_pathway_field = false
-        kegg_data[geneID] = [gene_names, definition, pathways]
-        pathways = []
-        gene_names = []
-      end
-    end
-  end
-  return kegg_data
-end
 
 def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
   kegg_data = {}
@@ -270,21 +106,6 @@ def merge_genes_with_kegg_data(gene_list, kegg_data)
   return merged_data
 end
 
-def write_compressed_plain_file(data, path)
-  File.open(path, 'w') do |f|
-    gz = Zlib::GzipWriter.new(f)
-    gz.write data.to_json
-    gz.close
-  end
-end
-
-def read_compressed_json(path)
-  infile = open(path)
-  gz = Zlib::GzipReader.new(infile)
-  object = JSON.parse(gz.read)
-  return object
-end
-
 def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
   pathways_genes_in_predictions = {}
   genes_in_predictions = []
@@ -358,138 +179,8 @@ def binom(n,k)
   end
 end
 
-def get_reference(genomic_ranges)
-  #genomic_ranges = [patientID, mut_start, mut_stop]
-  reference = []
-  reference.concat(genomic_ranges.map{|gr| gr[1]})# get start
-  reference.concat(genomic_ranges.map{|gr| gr[2]})# get stop
-  reference.uniq!
-  reference.sort!
-  #Define overlap range
-  final_reference = []
-  reference.each_with_index do |coord,i|
-    next_coord = reference[i + 1]
-    final_reference << [coord, next_coord] if !next_coord.nil?
-  end
-  return final_reference
-end
-
-def overlap_patients(genomic_ranges, reference)
-  overlaps = []
-  reference.each do |start, stop|
-    patients = []
-    genomic_ranges.each do |pt_id, pt_start, pt_stop|
-      if (start <= pt_start && stop >= pt_stop) ||
-        (start > pt_start && stop < pt_stop) ||
-        (stop > pt_start && stop <= pt_stop) ||
-        (start >= pt_start && start < pt_stop)
-        patients << pt_id
-      end
-    end
-    overlaps << patients.uniq
-  end
-  return overlaps
-end
-
-def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
-  patients_out_of_cluster = 0
-  patients_by_cluster = {}
-  sors = []
-  patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
-    reference = get_reference(genomic_ranges) # Get putative overlap regions
-    overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region
-    clust_number = 1
-    reference.each_with_index do |ref, i|
-      current_patients = overlapping_patients[i]
-      if current_patients.length > pat_per_reg
-        ref << chrm
-        node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
-        ref << node_identifier
-        save_sor(current_patients, node_identifier, patients_by_cluster)
-        sors << ref
-        clust_number += 1
-      end
-    end
-  end
-  return patients_by_cluster, sors
-end
 
-def save_sor(current_patients, node_identifier, patients_by_cluster)
-  current_patients.each do |patient|
-    add_record(patients_by_cluster, patient, node_identifier)
-  end
-end
 
-def add_record(hash, key, record)
-  query = hash[key]
-  if query.nil?
-    hash[key] = [record]
-  elsif !query.include?(record)
-    query << record
-  end
-end
-
-def load_patient_cohort(options)
-  patient_data = {}
-  count = 0
-  fields2extract = get_fields2extract(options)
-  field_numbers = fields2extract.values
-  File.open(options[:input_file]).each do |line|
-    line.chomp!
-    if options[:header] && count == 0
-      line.gsub!(/#\s*/,'') # correct comment like headers
-      field_names = line.split("\t")
-      get_field_numbers2extract(field_names, fields2extract)
-      field_numbers = fields2extract.values
-    else
-      fields = line.split("\t")
-      pat_record = field_numbers.map{|n| fields[n]}
-      if fields2extract[:pat_id_col].nil?
-        pat_id = "pat_#{count}" #generate ids
-      else
-        original_id = pat_record.shift
-        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
-      end
-      if !pat_record[0].nil?
-        pat_record[0] = pat_record[0].split(options[:hpo_separator])
-      else
-        pat_record[0] = []
-      end
-      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
-      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
-      patient_data[pat_id] = pat_record
-    end
-    count +=1
-  end
-  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
-  return patient_data
-end
-
-def get_fields2extract(options)
-  fields2extract = {}
-  [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
-    col = options[field]
-    if !col.nil?
-      col = col.to_i if !options[:header]
-      fields2extract[field] = col
-    end
-  end
-  return fields2extract
-end
-
-def get_field_numbers2extract(field_names, fields2extract)
-  fields2extract.each do |field, name|
-    fields2extract[field] = field_names.index(name)
-  end
-end
-
-def download(ftp_server, path, name)
-  ftp = Net::FTP.new()
-  ftp.connect(ftp_server)
-  ftp.login
-  ftp.getbinaryfile(path, name)
-  ftp.close
-end
 
 def get_and_parse_external_data(all_paths)
   sources = [