pets 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +6 -0
- data/bin/area_under_curve_pr.rb +118 -0
- data/bin/association_metrics_average.rb +94 -0
- data/bin/coPatReporter.rb +531 -0
- data/bin/console +14 -0
- data/bin/fmeasure_index.rb +72 -0
- data/bin/get_PR_values.rb +90 -0
- data/bin/get_clusters.R +18 -0
- data/bin/get_network_nodes.rb +197 -0
- data/bin/lines.R +77 -0
- data/bin/merge_by_cluster.rb +62 -0
- data/bin/merge_pairs.rb +138 -0
- data/bin/paco_translator.rb +102 -0
- data/bin/phen2reg.rb +385 -0
- data/bin/phen2reg_predictor_check.rb +297 -0
- data/bin/plot_area.R +71 -0
- data/bin/plot_boxplot.R +21 -0
- data/bin/plot_density.R +46 -0
- data/bin/plot_scatterplot.R +25 -0
- data/bin/reg2phen.rb +116 -0
- data/bin/region_to_patients_generator.rb +84 -0
- data/bin/relate_CI_to_association_value.rb +90 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +40 -0
- data/bin/xyplot_graph.R +60 -0
- data/external_data/biosystems_gene.gz +0 -0
- data/external_data/bsid2info.gz +0 -0
- data/external_data/chromosome_sizes_hg19.txt +24 -0
- data/external_data/gene_data.gz +0 -0
- data/external_data/gene_data_with_pathways.gz +0 -0
- data/external_data/gene_location.gz +0 -0
- data/external_data/hp.obo +146363 -0
- data/external_data/remove +0 -0
- data/lib/pets.rb +6 -0
- data/lib/pets/coPatReporterMethods.rb +77 -0
- data/lib/pets/generalMethods.rb +556 -0
- data/lib/pets/phen2reg_methods.rb +432 -0
- data/lib/pets/version.rb +3 -0
- data/pets.gemspec +47 -0
- data/templates/cohort_report.erb +93 -0
- data/templates/patient_report.erb +209 -0
- metadata +183 -0
File without changes
|
data/lib/pets.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# Group patient mutation coordinates by chromosome.
# patient_data: {patientID => [phenotypes, chr, start, stop]}
# Returns {chr => [[patientID, start_i, stop_i], ...]} with coordinates coerced to Integer.
def process_patient_data(patient_data)
  parsed_patient_data = {}
  patient_data.each do |patientID, metadata|
    _phenotypes, chr, start, stop = metadata
    record = [patientID, start.to_i, stop.to_i]
    (parsed_patient_data[chr] ||= []) << record
  end
  parsed_patient_data
end
|
15
|
+
|
16
|
+
# Flatten per-chromosome coverage segments into fixed-size bins for plotting.
# raw_coverage: {chr => [[start, stop, coverage], ...]}
# Returns [[chr, bin_start, coverage], ...] — one row per bin_size-aligned bin
# fully preceding the stop's bin.
def get_final_coverage(raw_coverage, bin_size)
  plot_rows = []
  raw_coverage.each do |chr, coverages|
    coverages.each do |start, stop, coverage|
      first_bin = start - (start % bin_size)
      last_bin = stop - (stop % bin_size)
      (first_bin...last_bin).step(bin_size) do |bin_start|
        plot_rows << [chr, bin_start, coverage]
      end
    end
  end
  plot_rows
end
|
30
|
+
|
31
|
+
# Collect the length of every SOR region together with its patient count.
# raw_coverage: {chr => [[start, stop, pat_records], ...]}
# Returns [[region_length, pat_records], ...] sorted by patient count (ascending).
def get_sor_length_distribution(raw_coverage)
  lengths = []
  raw_coverage.each_value do |coords_info|
    coords_info.each do |start, stop, pat_records|
      lengths << [stop - start + 1, pat_records]
    end
  end
  lengths.sort_by! { |_length, pats| pats }
  lengths
end
|
43
|
+
|
44
|
+
# Count how many patients share each CNV length (stop - start).
# patient_data: {patient_id => [hpos, chr, start, stop]}
# Returns [[length, count], ...] sorted by count (ascending).
def get_cnvs_length(patient_data)
  length_stats = Hash.new(0)
  patient_data.each do |_pat_id, patient_record|
    _string_hpos, _chr, start, stop = patient_record
    length_stats[stop - start] += 1
  end
  # Bug fix: the original block was `sort!{|stat| stat[1] <=> stat[1]}` — a
  # single-parameter comparator that compared a value with itself, so it always
  # returned 0 and no meaningful ordering happened. Sort by patient count,
  # matching get_sor_length_distribution's convention.
  length_stats.to_a.sort_by { |_length, count| count }
end
|
52
|
+
|
53
|
+
|
54
|
+
# Summarize region coverage by chromosome.
# regions_data: [[start, stop, chr, node], ...] where node ends in ".<patient_count>".
# Regions whose patient count is <= delete_thresold are kept with count 0 and
# excluded from the summary statistics.
# Returns [raw_coverage, n_regions, nt, mean_patients_per_region].
def calculate_coverage(regions_data, delete_thresold = 0)
  raw_coverage = {}
  n_regions = 0
  patients = 0
  nt = 0
  regions_data.each do |start, stop, chr, node|
    number_of_patients = node.split('.').last.to_i
    if number_of_patients > delete_thresold
      n_regions += 1
      patients += number_of_patients
      nt += stop - start
    else
      number_of_patients = 0 # keep the region, but zero it out of the stats
    end
    (raw_coverage[chr] ||= []) << [start, stop, number_of_patients]
  end
  return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
end
|
@@ -0,0 +1,556 @@
|
|
1
|
+
require 'uri'
|
2
|
+
#Common methods for predictors
|
3
|
+
#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
|
4
|
+
#1. Indexing by chr (region)
|
5
|
+
|
6
|
+
# Index training-file records by chromosome.
# training_file: path to the tab-separated training file (see loadFile).
# Returns {chr => [[startPos, stopPos, nodeID, hpoCode, associationValue], ...]}
def load_training_file4regions(training_file)
  training_set = {}
  posInfo = loadFile(training_file)
  posInfo.each do |info|
    chr = info.shift # first field is the chromosome; the remainder stays in info
    query = training_set[chr]
    if query.nil?
      training_set[chr] = [info]
    else
      query << info
    end
  end
  return training_set
end
|
20
|
+
|
21
|
+
#2. Indexing by hpo (code)
|
22
|
+
#prepare training file for analysis using phenotype2region prediction
|
23
|
+
# Index training-file records by HPO code (for phenotype2region prediction).
# thresold: minimum association value, forwarded to loadFile.
# The HPO code is removed from each record (delete_at(4)) and used as the key.
# Returns {hpoCode => [[chr, startPos, stopPos, nodeID, associationValue], ...]}
def load_training_file4HPO(training_file, thresold=0)
  training_set = {}
  information = loadFile(training_file, thresold)
  information.each do |info|
    hpoCode = info.delete_at(4)
    query = training_set[hpoCode]
    if query.nil?
      training_set[hpoCode] = [info]
    else
      query << info
    end
  end
  # STDERR.puts training_set.keys.inspect
  return training_set
end
|
38
|
+
|
39
|
+
|
40
|
+
#3. Load training info file:
|
41
|
+
#Chr;Start;Stop;HPO;Association;node
|
42
|
+
# Load a training info file (tab-separated: Chr  Start  Stop  HPO  Association  node).
# Rows whose association value (column 4) is below thresold are discarded.
# Returns [[chr, startPos, stopPos, nodeID, hpoCode, associationValue], ...]
# NOTE: the output reorders columns — nodeID comes before hpoCode.
def loadFile(file, thresold=0)
  information = []
  File.open(file).each do |line|
    line.chomp!
    allInfo = line.split("\t")
    associationValue = allInfo[4].to_f
    if associationValue >= thresold
      chr = allInfo[0]
      startPos = allInfo[1].to_i
      stopPos = allInfo[2].to_i
      hpoCode = allInfo[3]
      nodeID = allInfo[5]
      information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
    end
  end
  return information
end
|
59
|
+
|
60
|
+
# Register one HPO term in hpo_storage under its id and every alternate id,
# unless the id itself is blacklisted. Blacklisted parents are stripped from
# is_a. All keys for one term share the same attributes array.
def add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
  return if hpo_black_list.include?(id)
  attributes = [id, name, is_a - hpo_black_list, syn]
  hpo_storage[id] = attributes
  alt_ids.each { |alt| hpo_storage[alt] = attributes }
end
|
69
|
+
|
70
|
+
# Parse an HPO .obo file into {id (or alt_id) => [id, name, parent_ids, synonyms]}.
# Terms on hpo_black_list are skipped, and blacklisted ids are removed from
# every term's is_a (parent) list. Records are flushed when the next 'id:' tag
# is seen, plus one final flush at EOF.
def load_hpo_file(hpo_file, hpo_black_list=[])
  hpo_storage = {}
  id = nil
  name = nil
  alt_ids = []
  syn = []
  is_a = []
  File.open(hpo_file).each do |line|
    line.chomp!
    tag, info = line.split(': ')
    if tag == 'id' || tag == 'name' || tag == 'is_a' || tag == 'synonym' || tag == 'alt_id'
      if tag == 'id'
        # flush the previous record before starting a new one
        add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list) if !name.nil?
        id = info
        name = nil
        # Bug fix: the original assigned a misspelled local (`alt_id = []`)
        # here, so alt_ids was never reset and alternate ids accumulated
        # across every term in the file.
        alt_ids = []
        syn = []
        is_a = []
      end
      if tag == 'alt_id'
        alt_ids << info
      elsif tag == 'is_a'
        is_a << info.split(' ! ')[0] # keep only the parent's id, drop its name
      elsif tag == 'synonym'
        syn << info.split('"')[1] #to keep only the name of the synonym
      else
        name = info
      end
    end
  end
  add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list) # flush last record
  # STDERR.puts hpo_storage.inspect
  # Process.exit
  return hpo_storage
end
|
105
|
+
|
106
|
+
# Read a file containing one excluded HPO code per line.
# Returns the codes as an array of strings.
def load_hpo_black_list(excluded_hpo_file)
  excluded_hpos = []
  File.open(excluded_hpo_file).each do |line|
    line.chomp!
    excluded_hpos << line
  end
  return excluded_hpos
end
|
114
|
+
|
115
|
+
# Build a name/synonym -> HPO code lookup from the storage produced by
# load_hpo_file. Both the canonical name and every synonym map to the code.
def create_hpo_dictionary(hpo_storage)
  hpo_dictionary = {}
  hpo_storage.each_value do |hpo_code, hpo_name, _hpo_parents, hpo_synonyms|
    hpo_dictionary[hpo_name] = hpo_code
    hpo_synonyms.each { |alias_name| hpo_dictionary[alias_name] = hpo_code }
  end
  hpo_dictionary
end
|
126
|
+
|
127
|
+
# Invert the HPO is_a relations: for every term, register it as a child of
# each of its parents.
# Returns {parent_hpo_code => [[child_id, child_name], ...]}
def get_child_parent_relations(hpo_storage)
  storage_child = {}
  hpo_storage.each do |_hpo_code, hpo_data|
    id, name, is_a, _syn = hpo_data
    child_entry = [id, name]
    is_a.each do |parent_code|
      (storage_child[parent_code] ||= []) << child_entry
    end
  end
  storage_child
end
|
144
|
+
|
145
|
+
# Compute an information-content value per HPO term: -log10(fraction of
# patients annotated with the term). Patient ids carry an "_iN" suffix
# (see load_patient_cohort); consecutive records from the same original
# patient are counted only once — assumes patient_data iterates suffixed
# copies of a patient consecutively.
def compute_IC_values(patient_data, total_patients)
  patients_per_hpo = Hash.new(0)
  last_patient_ID = ''
  patient_data.each do |patient_ID, metadata|
    patient, _suffix = patient_ID.split('_i')
    unless patient == last_patient_ID
      hpos, _chr, _start, _stop = metadata
      hpos.each { |hpo| patients_per_hpo[hpo] += 1 }
    end
    last_patient_ID = patient
  end
  patients_per_hpo.each_key do |hpo|
    patients_per_hpo[hpo] = -Math.log10(patients_per_hpo[hpo].fdiv(total_patients))
  end
  patients_per_hpo
end
|
165
|
+
|
166
|
+
# def get_child_parent_relations(hpo_storage)
|
167
|
+
# # for getting hpo childs
|
168
|
+
# storage_child = {}
|
169
|
+
# hpo_storage.each do |hpo_code, hpo_data|
|
170
|
+
# STDERR.puts hpo_data[3].inspect
|
171
|
+
# Process.exit
|
172
|
+
# main_code, hpo_name, synonyms, parents = hpo_data
|
173
|
+
# parents.each do |par_hpo_code, par_hpo_name|
|
174
|
+
# query = storage_child[par_hpo_code]
|
175
|
+
# hpo_child = [main_code, hpo_name]
|
176
|
+
# if query.nil?
|
177
|
+
# storage_child[par_hpo_code] = [par_hpo_name, [hpo_child]]
|
178
|
+
# else
|
179
|
+
# query.last << hpo_child
|
180
|
+
# end
|
181
|
+
# end
|
182
|
+
# end
|
183
|
+
|
184
|
+
# return storage_child
|
185
|
+
# end
|
186
|
+
|
187
|
+
|
188
|
+
# Load per-HPO information-coefficient values from a tab-separated file
# (hpo_code<TAB>ci per line). Returns {hpo_code => ci (Float)}.
def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.open(information_coefficient_file).each do |line|
    line.chomp!
    hpo_code, ci = line.split("\t")
    hpos_ci_values[hpo_code] = ci.to_f
  end
  return hpos_ci_values
end
|
197
|
+
|
198
|
+
# Load patient->cluster assignments (pat_id<TAB>cluster_id per line) and
# group patient ids by cluster. Returns {cluster_id => [pat_id, ...]}.
def load_clustered_patients(file)
  clusters = {}
  File.open(file).each do |line|
    line.chomp!
    pat_id, cluster_id = line.split("\t")
    query = clusters[cluster_id]
    if query.nil?
      clusters[cluster_id] = [pat_id]
    else
      query << pat_id
    end
  end
  return clusters
end
|
212
|
+
|
213
|
+
# Parse a gzipped, tab-separated gene annotation dump (GFF-like: column 2 is
# the feature type, columns 3/4 are start/stop, column 8 holds 'key=value'
# attributes separated by ';').
# Returns two hashes:
#   gene_list:     {GeneID => [geneNames, description]}
#   gene_location: {chromosome => [[GeneID, start, stop], ...]}
def load_gene_data(gene_data_path)
  gene_list = {} #geneID => attr
  gene_location = {} # chr => gene
  infile = open(gene_data_path)
  gz = Zlib::GzipReader.new(infile)
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
    next if line =~ /^#/ # skip header/comment lines
    fields = line.split("\t")
    if fields[8].include?('genome=chromosome')
      # region record announcing a new chromosome: flush genes collected so far
      chr = fields[8].split(';')[1].split('=').last
      gene_location[current_chr] = genes
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      attributes = {}
      fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
      end
      geneNames = []
      geneNames << attributes['gene'] if !attributes['gene'].nil?
      geneNames.concat(attributes['gene_synonym'].split(',')) if !attributes['gene_synonym'].nil?
      description = attributes['description']
      # NOTE(review): URI.unescape is deprecated and was removed in Ruby 3.0;
      # consider CGI.unescape or URI::DEFAULT_PARSER.unescape — confirm target Ruby version.
      description = URI.unescape(description) if !description.nil?
      attributes['Dbxref'] =~ /GeneID:(\d+)/ # captures the numeric Entrez GeneID into $1
      gene_list[$1] = [geneNames, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  gene_location[current_chr] = genes # flush genes of the last chromosome
  return gene_list, gene_location
end
|
248
|
+
|
249
|
+
# Query the KEGG REST API in batches of 10 genes and parse the flat-file
# response. Performs network I/O and consumes (shifts from) query_genes.
# Returns {geneID => [gene_names, definition, pathways]} where pathways is
# [[pathway_id, pathway_name], ...].
def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10) # batch of up to 10 genes per request
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    # squeeze runs of spaces so split(' ', n) behaves predictably on the
    # column-aligned KEGG flat-file format
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true # following indented lines continue PATHWAY
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        parsing_pathway_field = false # a new section ends the PATHWAY block
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2)
      elsif line == '///'
        # record separator: store the accumulated entry and reset accumulators
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end
|
286
|
+
|
287
|
+
# Build gene -> KEGG pathway attributes from NCBI BioSystems dump files,
# keeping only KEGG biosystems whose accession starts with 'hsa' (human).
# Returns {geneID => [[accession, name], ...]} (possibly empty arrays).
def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
  kegg_data = {}
  gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
  keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG')
  keggAttributes.select!{|kegg_id, data| data.first =~ /^hsa/}

  gene2biosystems.each do |geneID, pathways|
    kegg_pathways = []
    pathways.each do |biosystem|
      kAttrib = keggAttributes[biosystem]
      kegg_pathways << kAttrib if !kAttrib.nil?
    end
    kegg_data[geneID] = kegg_pathways
  end
  return kegg_data
end
|
303
|
+
|
304
|
+
# Load bsid -> [accession, name] from the gzipped NCBI bsid2info dump,
# keeping only rows whose source database (first remaining field after the
# bsid) equals filterDB (e.g. 'KEGG').
def loadBiosistemsInfo(biosystems_info_path, filterDB)
  bsid2attributes = {}
  infile = open(biosystems_info_path)
  gz = Zlib::GzipReader.new(infile)
  gz.each_line do |line|
    line.chomp!
    #STDERR.puts line.inspect
    # scrub invalid bytes before splitting; some dump rows are not valid UTF-8
    fields = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').split("\t")
    bsid = fields.shift
    bsid2attributes[bsid] = [fields[1], fields[2]] if filterDB == fields[0]
  end
  return bsid2attributes
end
|
317
|
+
|
318
|
+
# Load gene -> [biosystem ids] from the gzipped biosystems_gene dump
# (biosystem<TAB>gene<TAB>score per line; the score column is ignored).
def load_biosystem2gene_dictionary(biosystems_gene_path)
  gene2kegg = {}
  infile = open(biosystems_gene_path)
  gz = Zlib::GzipReader.new(infile)
  gz.each_line do |line|
    line.chomp!
    biosystem, gene, score = line.split("\t")
    query = gene2kegg[gene]
    if query.nil?
      gene2kegg[gene] = [biosystem]
    else
      query << biosystem
    end
  end
  return gene2kegg
end
|
334
|
+
|
335
|
+
# Append each gene's KEGG pathway list (or [] when the gene has none) to its
# attribute array. NOTE: mutates the attribute arrays held by gene_list, as
# the original did.
def merge_genes_with_kegg_data(gene_list, kegg_data)
  merged_data = {}
  gene_list.each do |geneID, attributes|
    pathways = kegg_data[geneID]
    attributes << (pathways.nil? ? [] : pathways)
    merged_data[geneID] = attributes
  end
  merged_data
end
|
348
|
+
|
349
|
+
# Serialize `data` as JSON and write it gzip-compressed to `path`.
def write_compressed_plain_file(data, path)
  File.open(path, 'w') do |f|
    gz = Zlib::GzipWriter.new(f)
    gz.write data.to_json
    gz.close # flush the gzip trailer before the File.open block closes f
  end
end
|
356
|
+
|
357
|
+
# Read a gzip-compressed JSON file and return the parsed object.
# NOTE(review): neither the GzipReader nor the underlying file handle is
# closed here — relies on GC/finalizers.
def read_compressed_json(path)
  infile = open(path)
  gz = Zlib::GzipReader.new(infile)
  object = JSON.parse(gz.read)
  return object
end
|
363
|
+
|
364
|
+
# One-sided Fisher-style enrichment of KEGG pathways among predicted genes.
# genes_clusters: clusters of [geneID, [geneNames, description, pathways]] pairs.
# genes_with_kegg: {geneID => [geneNames, description, pathways]} — background set.
# Returns [[pathway, predicted_genes, contingency_counts, accumulated_prob], ...]
def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
  pathways_genes_in_predictions = {}
  genes_in_predictions = []
  # collect, per pathway, the unique predicted genes annotated with it
  genes_clusters.each do |cluster|
    cluster.each do |geneID, data|
      geneNames, description, pathways = data
      pathways.each do |pathway|
        query = pathways_genes_in_predictions[pathway]
        if query.nil?
          pathways_genes_in_predictions[pathway] = [geneID]
        else
          query << geneID if !query.include?(geneID)
        end
      end
      genes_in_predictions << geneID if !genes_in_predictions.include?(geneID)
    end
  end
  genes_out_of_predictions = genes_with_kegg.keys - genes_in_predictions
  gene_number = genes_with_kegg.length
  stats = []
  pathways_genes_in_predictions.each do |pathway, pathway_predicted_genes|
    pathway_id, pathway_name = pathway
    no_pathway_predicted_genes = genes_in_predictions - pathway_predicted_genes
    pathway_no_predicted_genes_count = 0
    no_pathway_no_predicted_genes_count = 0
    # classify background (non-predicted) genes: in this pathway or not
    genes_out_of_predictions.each do |geneID|
      query = genes_with_kegg[geneID]
      if query[2].map{|pathway_info| pathway_info.first}.include?(pathway_id)
        pathway_no_predicted_genes_count += 1
      else
        no_pathway_no_predicted_genes_count += 1
      end
    end
    #Fisher => http://www.biostathandbook.com/fishers.html
    no_pathway_predicted_genes_count = no_pathway_predicted_genes.length
    pathway_predicted_genes_count = pathway_predicted_genes.length
    accumulated_prob = 0
    # accumulate hypergeometric probabilities over shifted 2x2 tables
    pathway_no_predicted_genes_count.times do |n|
      no_pathway_predicted_genes_count_shifted = no_pathway_predicted_genes_count - n
      pathway_predicted_genes_count_shifted = pathway_predicted_genes_count - n
      if no_pathway_predicted_genes_count_shifted >= 0 && pathway_predicted_genes_count_shifted >= 0
        accumulated_prob += compute_hyper_prob(
          n,
          no_pathway_predicted_genes_count_shifted,
          pathway_predicted_genes_count_shifted,
          no_pathway_no_predicted_genes_count + n,
          gene_number
        )
      else
        break
      end
    end
    contigency = [pathway_no_predicted_genes_count, no_pathway_predicted_genes_count, pathway_predicted_genes_count, no_pathway_no_predicted_genes_count]
    stats << [pathway, pathway_predicted_genes, contigency, accumulated_prob]
  end
  return stats
end
|
421
|
+
|
422
|
+
# Hypergeometric probability of the 2x2 table [[a, b], [c, d]] given total n:
# C(a+b, a) * C(c+d, c) / C(n, a+c)
def compute_hyper_prob(a, b, c, d, n)
  numerator = binom(a + b, a) * binom(c + d, c)
  numerator.fdiv(binom(n, a + c))
end
|
428
|
+
|
429
|
+
# Binomial coefficient C(n, k) via the falling-factorial ratio.
# Returns 1 for k <= 0 or k >= n (matching the original's boundary handling).
def binom(n, k)
  if k > 0 && k < n
    numerator = (1 + n - k..n).inject(:*)
    numerator / (1..k).inject(:*)
  else
    1
  end
end
|
436
|
+
|
437
|
+
# Build candidate overlap intervals from all start/stop coordinates.
# genomic_ranges: [[patientID, mut_start, mut_stop], ...]
# Returns consecutive pairs of the sorted unique coordinates:
# [[c1, c2], [c2, c3], ...]
def get_reference(genomic_ranges)
  boundaries = genomic_ranges.map { |gr| gr[1] } + genomic_ranges.map { |gr| gr[2] }
  boundaries = boundaries.uniq.sort
  boundaries.each_cons(2).to_a
end
|
452
|
+
|
453
|
+
# For every reference interval, list the (unique) patients whose mutation
# range overlaps it. Returns an array parallel to reference.
def overlap_patients(genomic_ranges, reference)
  reference.map do |start, stop|
    hits = []
    genomic_ranges.each do |pt_id, pt_start, pt_stop|
      covers   = start <= pt_start && stop >= pt_stop   # region contains patient range
      inside   = start > pt_start && stop < pt_stop     # region inside patient range
      left_in  = stop > pt_start && stop <= pt_stop     # region's right edge inside
      right_in = start >= pt_start && start < pt_stop   # region's left edge inside
      hits << pt_id if covers || inside || left_in || right_in
    end
    hits.uniq
  end
end
|
469
|
+
|
470
|
+
# Build overlap (SOR) regions per chromosome and keep those shared by more
# than pat_per_reg patients. Each kept region is extended in place to
# [start, stop, chr, "chr.clust_number.mutation_type.patient_count"].
# Returns [patients_by_cluster, sors]:
#   patients_by_cluster: {patient_id => [node_identifier, ...]}
#   sors: the labelled regions
def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
  patients_out_of_cluster = 0
  patients_by_cluster = {}
  sors = []
  patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
    reference = get_reference(genomic_ranges) # Get putative overlap regions
    overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region
    clust_number = 1
    reference.each_with_index do |ref, i|
      current_patients = overlapping_patients[i]
      if current_patients.length > pat_per_reg
        ref << chrm
        node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
        ref << node_identifier
        save_sor(current_patients, node_identifier, patients_by_cluster)
        sors << ref
        clust_number += 1
      end
    end
  end
  return patients_by_cluster, sors
end
|
492
|
+
|
493
|
+
# Record, for every patient in the cluster, membership of node_identifier
# (delegates de-duplication to add_record).
def save_sor(current_patients, node_identifier, patients_by_cluster)
  current_patients.each do |patient|
    add_record(patients_by_cluster, patient, node_identifier)
  end
end
|
498
|
+
|
499
|
+
# Append record to hash[key]'s array, creating the array on first use and
# skipping duplicates.
def add_record(hash, key, record)
  stored = hash[key]
  if stored.nil?
    hash[key] = [record]
  else
    stored << record unless stored.include?(record)
  end
end
|
507
|
+
|
508
|
+
# Load the patient cohort from options[:input_file] (tab-separated).
# Column selection comes from get_fields2extract(options); when
# options[:header] is truthy the first row maps column names to indexes via
# get_field_numbers2extract. Patients without an id column get generated ids
# ("pat_N"); otherwise the original id is suffixed with "_iN" to keep keys
# unique (compute_IC_values relies on this suffix convention).
# Returns [patient_data, patient_number].
def load_patient_cohort(options)
  patient_data = {}
  count = 0
  fields2extract = get_fields2extract(options)
  field_numbers = fields2extract.values
  original_ids = []
  File.open(options[:input_file]).each do |line|
    line.chomp!
    if options[:header] && count == 0
      line.gsub!(/#\s*/,'') # correct comment like headers
      field_names = line.split("\t")
      get_field_numbers2extract(field_names, fields2extract)
      field_numbers = fields2extract.values
    else
      fields = line.split("\t")
      pat_record = field_numbers.map{|n| fields[n]}
      if fields2extract[:pat_id_col].nil?
        pat_id = "pat_#{count}" #generate ids
      else
        original_id = pat_record.shift
        original_ids << original_id
        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
      end
      patient_data[pat_id] = pat_record
    end
    count +=1
  end
  # with generated ids every data row is a distinct patient; otherwise count
  # distinct original ids
  fields2extract[:pat_id_col].nil? ? patient_number = count : patient_number = original_ids.uniq.length
  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
  return patient_data, patient_number
end
|
539
|
+
|
540
|
+
# Pick the column specifiers present in options for the known cohort fields.
# Without a header row the specifiers are numeric strings and are coerced to
# Integer column indexes; with a header they stay as column names.
def get_fields2extract(options)
  fields2extract = {}
  %i[pat_id_col hpo_col chromosome_col start_col end_col].each do |field|
    col = options[field]
    next if col.nil?
    col = col.to_i unless options[:header]
    fields2extract[field] = col
  end
  fields2extract
end
|
551
|
+
|
552
|
+
# Replace each column-name value in fields2extract with its index in
# field_names (mutates fields2extract in place).
def get_field_numbers2extract(field_names, fields2extract)
  fields2extract.each_pair do |field, column_name|
    fields2extract[field] = field_names.index(column_name)
  end
end
|