pets 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +6 -0
- data/bin/area_under_curve_pr.rb +118 -0
- data/bin/association_metrics_average.rb +94 -0
- data/bin/coPatReporter.rb +531 -0
- data/bin/console +14 -0
- data/bin/fmeasure_index.rb +72 -0
- data/bin/get_PR_values.rb +90 -0
- data/bin/get_clusters.R +18 -0
- data/bin/get_network_nodes.rb +197 -0
- data/bin/lines.R +77 -0
- data/bin/merge_by_cluster.rb +62 -0
- data/bin/merge_pairs.rb +138 -0
- data/bin/paco_translator.rb +102 -0
- data/bin/phen2reg.rb +385 -0
- data/bin/phen2reg_predictor_check.rb +297 -0
- data/bin/plot_area.R +71 -0
- data/bin/plot_boxplot.R +21 -0
- data/bin/plot_density.R +46 -0
- data/bin/plot_scatterplot.R +25 -0
- data/bin/reg2phen.rb +116 -0
- data/bin/region_to_patients_generator.rb +84 -0
- data/bin/relate_CI_to_association_value.rb +90 -0
- data/bin/setup +8 -0
- data/bin/standardize_scores.R +40 -0
- data/bin/xyplot_graph.R +60 -0
- data/external_data/biosystems_gene.gz +0 -0
- data/external_data/bsid2info.gz +0 -0
- data/external_data/chromosome_sizes_hg19.txt +24 -0
- data/external_data/gene_data.gz +0 -0
- data/external_data/gene_data_with_pathways.gz +0 -0
- data/external_data/gene_location.gz +0 -0
- data/external_data/hp.obo +146363 -0
- data/external_data/remove +0 -0
- data/lib/pets.rb +6 -0
- data/lib/pets/coPatReporterMethods.rb +77 -0
- data/lib/pets/generalMethods.rb +556 -0
- data/lib/pets/phen2reg_methods.rb +432 -0
- data/lib/pets/version.rb +3 -0
- data/pets.gemspec +47 -0
- data/templates/cohort_report.erb +93 -0
- data/templates/patient_report.erb +209 -0
- metadata +183 -0
File without changes
|
data/lib/pets.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# Group patient mutation coordinates by chromosome.
# patient_data: patientID => [phenotypes, chr, start, stop] (coords may be strings).
# Returns chr => [[patientID, start(Integer), stop(Integer)], ...].
def process_patient_data(patient_data)
  parsed_patient_data = {}
  patient_data.each do |patient_id, metadata|
    _phenotypes, chrm, reg_start, reg_stop = metadata
    record = [patient_id, reg_start.to_i, reg_stop.to_i]
    (parsed_patient_data[chrm] ||= []) << record
  end
  parsed_patient_data
end
|
15
|
+
|
16
|
+
# Expand per-chromosome coverage intervals into fixed-size bins for plotting.
# raw_coverage: chr => [[start, stop, coverage], ...].
# Emits one [chr, bin_start, coverage] row per bin boundary strictly before
# the bin containing stop (the stop bin itself is excluded).
def get_final_coverage(raw_coverage, bin_size)
  coverage_to_plot = []
  raw_coverage.each do |chrm, coverages|
    coverages.each do |reg_start, reg_stop, coverage|
      first_bin = reg_start - (reg_start % bin_size)
      last_bin = reg_stop - (reg_stop % bin_size)
      (first_bin...last_bin).step(bin_size) do |bin_start|
        coverage_to_plot << [chrm, bin_start, coverage]
      end
    end
  end
  coverage_to_plot
end
|
30
|
+
|
31
|
+
# Collect the length of every SOR region together with its patient count,
# sorted ascending by patient count.
# raw_coverage: chr => [[start, stop, pat_records], ...].
# Returns [[region_length, pat_records], ...].
def get_sor_length_distribution(raw_coverage)
  all_cnvs_length = []
  raw_coverage.each do |_chrm, coords_info|
    coords_info.each do |reg_start, reg_stop, pat_records|
      all_cnvs_length << [reg_stop - reg_start + 1, pat_records]
    end
  end
  all_cnvs_length.sort_by! { |_length, pat_records| pat_records }
  all_cnvs_length
end
|
43
|
+
|
44
|
+
# Tally CNV lengths across the patient cohort.
# patient_data: pat_id => [string_hpos, chr, start, stop].
# Returns [[length, patient_count], ...] sorted ascending by patient_count.
# Bug fix: the original comparator was `stat[1] <=> stat[1]` — a value
# compared with itself, always 0 — so the stats were never actually sorted.
def get_cnvs_length(patient_data)
  length_stats = Hash.new(0)
  patient_data.each do |_pat_id, patient_record|
    _string_hpos, _chr, start, stop = patient_record
    length_stats[stop - start] += 1
  end
  return length_stats.to_a.sort! { |stat_a, stat_b| stat_a[1] <=> stat_b[1] }
end
|
52
|
+
|
53
|
+
|
54
|
+
# Aggregate SOR regions into per-chromosome coverage records.
# regions_data: [[start, stop, chr, node], ...] where the node id encodes the
# patient count as its last dot-separated field (e.g. "9.3.A.5" -> 5).
# Regions at or below delete_thresold patients are kept but with count 0.
# Returns [raw_coverage, region_count, total_nt, mean_patients_per_region].
def calculate_coverage(regions_data, delete_thresold = 0)
  raw_coverage = {}
  n_regions = 0
  patients = 0
  nt = 0
  regions_data.each do |reg_start, reg_stop, chrm, node|
    pat_count = node.split('.').last.to_i
    if pat_count <= delete_thresold
      pat_count = 0
    else
      n_regions += 1
      patients += pat_count
      nt += reg_stop - reg_start
    end
    (raw_coverage[chrm] ||= []) << [reg_start, reg_stop, pat_count]
  end
  # NOTE(review): if no region exceeds the threshold, fdiv(0) yields NaN —
  # this matches the original behaviour.
  return raw_coverage, n_regions, nt, patients.fdiv(n_regions)
end
|
@@ -0,0 +1,556 @@
|
|
1
|
+
require 'uri'
|
2
|
+
#Common methods for predictors
|
3
|
+
#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
|
4
|
+
#1. Indexing by chr (region)
|
5
|
+
|
6
|
+
# Index training-file records by chromosome (region-oriented view).
# Parsing is delegated to loadFile; the chromosome (first field) becomes the
# hash key and the remaining fields form the grouped record.
def load_training_file4regions(training_file)
  training_set = {}
  loadFile(training_file).each do |info|
    chrm = info.shift
    (training_set[chrm] ||= []) << info
  end
  training_set
end
|
20
|
+
|
21
|
+
#2. Indexing by hpo (code)
|
22
|
+
#prepare training file for analysis using phenotype2region prediction
|
23
|
+
# Index training-file records by HPO code for phenotype2region prediction.
# Rows whose association value falls below thresold are dropped by loadFile.
def load_training_file4HPO(training_file, thresold = 0)
  training_set = {}
  loadFile(training_file, thresold).each do |info|
    hpo_code = info.delete_at(4)
    (training_set[hpo_code] ||= []) << info
  end
  training_set
end
|
38
|
+
|
39
|
+
|
40
|
+
#3. Load training info file:
|
41
|
+
#Chr;Start;Stop;HPO;Association;node
|
42
|
+
# Load a tab-separated training/association file with columns:
# chr, start, stop, HPO, association score, node id.
# Only rows with association >= thresold are kept.
# Returns [[chr, start, stop, nodeID, hpoCode, associationValue], ...].
# Fix: File.foreach closes the file handle deterministically;
# File.open(...).each leaked the descriptor until GC.
def loadFile(file, thresold = 0)
  information = []
  File.foreach(file) do |line|
    line.chomp!
    allInfo = line.split("\t")
    associationValue = allInfo[4].to_f
    next if associationValue < thresold
    chr = allInfo[0]
    startPos = allInfo[1].to_i
    stopPos = allInfo[2].to_i
    hpoCode = allInfo[3]
    nodeID = allInfo[5]
    information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
  end
  return information
end
|
59
|
+
|
60
|
+
# Register an HPO term (and all of its alternate ids) in hpo_storage, unless
# the term id itself is black-listed. Black-listed parents are stripped from
# the is_a list. Alternate ids share the very same attribute array as the
# main id.
def add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
  return if hpo_black_list.include?(id)
  attributes = [id, name, is_a - hpo_black_list, syn]
  ([id] + alt_ids).each { |term_id| hpo_storage[term_id] = attributes }
end
|
69
|
+
|
70
|
+
# Parse an OBO-format HPO file into hpo_storage:
# id (or alt_id) => [id, name, is_a-parents, synonyms].
# Terms in hpo_black_list are skipped (see add_record2storage).
# Bug fix: the original reset `alt_id = []` (a typo for `alt_ids`) when a new
# term started, so alternate ids accumulated across terms and were attached
# to every later record. The reset now clears the right variable.
# Also uses File.foreach so the file handle is closed deterministically.
def load_hpo_file(hpo_file, hpo_black_list = [])
  hpo_storage = {}
  id = nil
  name = nil
  alt_ids = []
  syn = []
  is_a = []
  File.foreach(hpo_file) do |line|
    line.chomp!
    tag, info = line.split(': ')
    if tag == 'id' || tag == 'name' || tag == 'is_a' || tag == 'synonym' || tag == 'alt_id'
      if tag == 'id'
        # flush the previous term before starting a new one
        add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list) if !name.nil?
        id = info
        name = nil
        alt_ids = []
        syn = []
        is_a = []
      end
      if tag == 'alt_id'
        alt_ids << info
      elsif tag == 'is_a'
        is_a << info.split(' ! ')[0]
      elsif tag == 'synonym'
        syn << info.split('"')[1] # keep only the synonym text
      else
        name = info # 'name' lines; an 'id' line also lands here transiently (as in the original)
      end
    end
  end
  # flush the final term
  add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
  return hpo_storage
end
|
105
|
+
|
106
|
+
# Read one excluded HPO code per line into an array.
# Fix: File.foreach closes the handle (File.open(...).each leaked it).
def load_hpo_black_list(excluded_hpo_file)
  excluded_hpos = []
  File.foreach(excluded_hpo_file) do |line|
    excluded_hpos << line.chomp
  end
  return excluded_hpos
end
|
114
|
+
|
115
|
+
# Build a name/synonym => HPO code lookup from hpo_storage
# (key => [code, name, parents, synonyms]).
def create_hpo_dictionary(hpo_storage)
  hpo_dictionary = {}
  hpo_storage.each_value do |hpo_code, hpo_name, _hpo_parents, hpo_synonyms|
    hpo_dictionary[hpo_name] = hpo_code
    hpo_synonyms.each { |synonym| hpo_dictionary[synonym] = hpo_code }
  end
  hpo_dictionary
end
|
126
|
+
|
127
|
+
# Invert the is_a relations in hpo_storage:
# parent code => [[child_code, child_name], ...].
def get_child_parent_relations(hpo_storage)
  storage_child = {}
  hpo_storage.each_value do |hpo_data|
    id, name, is_a, _syn = hpo_data
    child_record = [id, name]
    is_a.each do |parent_code|
      (storage_child[parent_code] ||= []) << child_record
    end
  end
  storage_child
end
|
144
|
+
|
145
|
+
# Compute an information coefficient per HPO term: -log10(patient fraction).
# Patient ids carry a uniqueness suffix ('_i<count>'); consecutive records of
# the same base patient are counted only once (relies on insertion order).
def compute_IC_values(patient_data, total_patients)
  patients_per_hpo = Hash.new(0)
  last_patient_ID = ''
  patient_data.each do |patient_ID, metadata|
    base_id = patient_ID.split('_i').first
    if base_id != last_patient_ID
      hpos, _chr, _start, _stop = metadata
      hpos.each { |hpo| patients_per_hpo[hpo] += 1 }
    end
    last_patient_ID = base_id
  end
  patients_per_hpo.each_key do |hpo|
    patients_per_hpo[hpo] = -Math.log10(patients_per_hpo[hpo].fdiv(total_patients))
  end
  patients_per_hpo
end
|
165
|
+
|
166
|
+
# def get_child_parent_relations(hpo_storage)
|
167
|
+
# # for getting hpo childs
|
168
|
+
# storage_child = {}
|
169
|
+
# hpo_storage.each do |hpo_code, hpo_data|
|
170
|
+
# STDERR.puts hpo_data[3].inspect
|
171
|
+
# Process.exit
|
172
|
+
# main_code, hpo_name, synonyms, parents = hpo_data
|
173
|
+
# parents.each do |par_hpo_code, par_hpo_name|
|
174
|
+
# query = storage_child[par_hpo_code]
|
175
|
+
# hpo_child = [main_code, hpo_name]
|
176
|
+
# if query.nil?
|
177
|
+
# storage_child[par_hpo_code] = [par_hpo_name, [hpo_child]]
|
178
|
+
# else
|
179
|
+
# query.last << hpo_child
|
180
|
+
# end
|
181
|
+
# end
|
182
|
+
# end
|
183
|
+
|
184
|
+
# return storage_child
|
185
|
+
# end
|
186
|
+
|
187
|
+
|
188
|
+
# Load HPO code => information-coefficient pairs from a tab-separated file.
# Fix: File.foreach closes the handle (File.open(...).each leaked it).
def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.foreach(information_coefficient_file) do |line|
    hpo_code, ci = line.chomp.split("\t")
    hpos_ci_values[hpo_code] = ci.to_f
  end
  return hpos_ci_values
end
|
197
|
+
|
198
|
+
# Group patient ids by cluster id from a tab-separated file (pat_id, cluster_id).
# Fix: File.foreach closes the handle (File.open(...).each leaked it).
def load_clustered_patients(file)
  clusters = {}
  File.foreach(file) do |line|
    pat_id, cluster_id = line.chomp.split("\t")
    (clusters[cluster_id] ||= []) << pat_id
  end
  return clusters
end
|
212
|
+
|
213
|
+
# Parse a gzipped GFF-like gene annotation file.
# Returns:
#   gene_list:     GeneID => [[gene names], description]
#   gene_location: chromosome => [[GeneID, start, stop], ...]
# Fixes: URI.unescape was removed in Ruby 3.0 — URI::DEFAULT_PARSER.unescape
# keeps the same percent-decoding; the gzip reader is now closed via the
# block form instead of leaking to GC.
def load_gene_data(gene_data_path)
  gene_list = {} # geneID => attributes
  gene_location = {} # chr => genes
  current_chr = nil
  genes = []
  Zlib::GzipReader.open(gene_data_path) do |gz|
    gz.each_line do |line|
      line.chomp!
      next if line =~ /^#/
      fields = line.split("\t")
      if fields[8].include?('genome=chromosome')
        # chromosome header row: flush genes collected so far, switch chromosome
        chr = fields[8].split(';')[1].split('=').last
        gene_location[current_chr] = genes
        genes = []
        current_chr = chr
      elsif fields[2] == 'gene'
        attributes = {}
        fields[8].split(';').each do |pair|
          key, value = pair.split('=')
          attributes[key] = value
        end
        geneNames = []
        geneNames << attributes['gene'] if !attributes['gene'].nil?
        geneNames.concat(attributes['gene_synonym'].split(',')) if !attributes['gene_synonym'].nil?
        description = attributes['description']
        description = URI::DEFAULT_PARSER.unescape(description) if !description.nil?
        attributes['Dbxref'] =~ /GeneID:(\d+)/
        gene_list[$1] = [geneNames, description]
        genes << [$1, fields[3].to_i, fields[4].to_i]
      end
    end
  end
  gene_location[current_chr] = genes
  return gene_list, gene_location
end
|
248
|
+
|
249
|
+
# Fetch gene annotations (names, definition, KEGG pathways) from the KEGG
# REST API, querying in batches of 10 genes per request.
# query_genes: array of human gene ids — NOTE: consumed destructively (shift).
# Returns geneID => [gene_names, definition, pathways].
# NOTE(review): relies on Net::HTTP being loaded elsewhere in the file —
# confirm the prelude requires 'net/http'.
def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10)
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    # KEGG flat-file records end with '///'; PATHWAY continuation lines carry
    # no field tag, hence the parsing_pathway_field state flag.
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        # any other tagged field ends the PATHWAY block
        parsing_pathway_field = false
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2)
      elsif line == '///'
        # end of record: store the accumulated attributes
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end
286
|
+
|
287
|
+
# Build geneID => KEGG pathway attributes from NCBI BioSystems dump files
# instead of the live KEGG API. Only human ('hsa') KEGG entries are kept.
def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
  kegg_data = {}
  gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
  keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG')
  keggAttributes.select! { |_kegg_id, data| data.first =~ /^hsa/ }
  gene2biosystems.each do |geneID, pathways|
    kegg_data[geneID] = pathways.map { |biosystem| keggAttributes[biosystem] }.compact
  end
  kegg_data
end
|
303
|
+
|
304
|
+
# Load BioSystems info records from a gzipped dump, keeping only rows whose
# source database matches filterDB (e.g. 'KEGG'):
# bsid => [external accession, name].
# Fix: the gzip reader is closed via the block form instead of leaking.
def loadBiosistemsInfo(biosystems_info_path, filterDB)
  bsid2attributes = {}
  Zlib::GzipReader.open(biosystems_info_path) do |gz|
    gz.each_line do |line|
      line.chomp!
      # strip invalid bytes before splitting (the dumps contain mixed encodings)
      fields = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').split("\t")
      bsid = fields.shift
      bsid2attributes[bsid] = [fields[1], fields[2]] if filterDB == fields[0]
    end
  end
  return bsid2attributes
end
|
317
|
+
|
318
|
+
# Map gene => [biosystem ids] from the gzipped biosystems-gene table
# (columns: biosystem, gene, score).
# Fix: the gzip reader is closed via the block form instead of leaking.
def load_biosystem2gene_dictionary(biosystems_gene_path)
  gene2kegg = {}
  Zlib::GzipReader.open(biosystems_gene_path) do |gz|
    gz.each_line do |line|
      biosystem, gene, _score = line.chomp.split("\t")
      (gene2kegg[gene] ||= []) << biosystem
    end
  end
  return gene2kegg
end
|
334
|
+
|
335
|
+
# Append each gene's KEGG pathway data (or an empty array) to its attribute
# list. NOTE: appends onto the attribute arrays held in gene_list (in-place
# mutation) and returns a new geneID => attributes hash, exactly as before.
def merge_genes_with_kegg_data(gene_list, kegg_data)
  merged_data = {}
  gene_list.each do |geneID, attributes|
    kegg_entry = kegg_data[geneID]
    attributes << (kegg_entry.nil? ? [] : kegg_entry)
    merged_data[geneID] = attributes
  end
  merged_data
end
|
348
|
+
|
349
|
+
# Serialise data as JSON and write it gzip-compressed to path.
def write_compressed_plain_file(data, path)
  File.open(path, 'w') do |file|
    writer = Zlib::GzipWriter.new(file)
    writer.write(data.to_json)
    writer.close
  end
end
|
356
|
+
|
357
|
+
# Read a gzip-compressed JSON file and return the parsed object.
# Fix: the block form of GzipReader closes the stream; the original leaked
# both the File handle and the gzip reader until GC.
def read_compressed_json(path)
  object = Zlib::GzipReader.open(path) { |gz| JSON.parse(gz.read) }
  return object
end
|
363
|
+
|
364
|
+
# Fisher-style enrichment of KEGG pathways among predicted gene clusters.
# genes_clusters:  array of clusters, each geneID => [geneNames, description, pathways].
# genes_with_kegg: geneID => [geneNames, definition, pathways] background set.
# Returns [[pathway, predicted_genes, contingency_counts, p_value], ...].
def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
  pathways_genes_in_predictions = {}
  genes_in_predictions = []
  # Collect, per pathway, the unique predicted genes annotated to it.
  genes_clusters.each do |cluster|
    cluster.each do |geneID, data|
      geneNames, description, pathways = data
      pathways.each do |pathway|
        query = pathways_genes_in_predictions[pathway]
        if query.nil?
          pathways_genes_in_predictions[pathway] = [geneID]
        else
          query << geneID if !query.include?(geneID)
        end
      end
      genes_in_predictions << geneID if !genes_in_predictions.include?(geneID)
    end
  end
  genes_out_of_predictions = genes_with_kegg.keys - genes_in_predictions
  gene_number = genes_with_kegg.length
  stats = []
  pathways_genes_in_predictions.each do |pathway, pathway_predicted_genes|
    pathway_id, pathway_name = pathway
    no_pathway_predicted_genes = genes_in_predictions - pathway_predicted_genes
    pathway_no_predicted_genes_count = 0
    no_pathway_no_predicted_genes_count = 0
    # Split the non-predicted background genes by pathway membership.
    genes_out_of_predictions.each do |geneID|
      query = genes_with_kegg[geneID]
      if query[2].map{|pathway_info| pathway_info.first}.include?(pathway_id)
        pathway_no_predicted_genes_count += 1
      else
        no_pathway_no_predicted_genes_count += 1
      end
    end
    #Fisher => http://www.biostathandbook.com/fishers.html
    no_pathway_predicted_genes_count = no_pathway_predicted_genes.length
    pathway_predicted_genes_count = pathway_predicted_genes.length
    accumulated_prob = 0
    # Sum hypergeometric probabilities over progressively more extreme 2x2
    # tables (shifting counts between cells while keeping marginals).
    pathway_no_predicted_genes_count.times do |n|
      no_pathway_predicted_genes_count_shifted = no_pathway_predicted_genes_count - n
      pathway_predicted_genes_count_shifted = pathway_predicted_genes_count - n
      if no_pathway_predicted_genes_count_shifted >= 0 && pathway_predicted_genes_count_shifted >= 0
        accumulated_prob += compute_hyper_prob(
          n,
          no_pathway_predicted_genes_count_shifted,
          pathway_predicted_genes_count_shifted,
          no_pathway_no_predicted_genes_count + n,
          gene_number
        )
      else
        break
      end
    end
    contigency = [pathway_no_predicted_genes_count, no_pathway_predicted_genes_count, pathway_predicted_genes_count, no_pathway_no_predicted_genes_count]
    stats << [pathway, pathway_predicted_genes, contigency, accumulated_prob]
  end
  return stats
end
|
421
|
+
|
422
|
+
# Hypergeometric probability of one 2x2 table [a b; c d] over n items:
# C(a+b, a) * C(c+d, c) / C(n, a+c).
def compute_hyper_prob(a, b, c, d, n)
  numerator = binom(a + b, a) * binom(c + d, c)
  numerator.fdiv(binom(n, a + c))
end
|
428
|
+
|
429
|
+
# Binomial coefficient C(n, k) via the falling-factorial product.
# Out-of-range k (k <= 0 or k >= n) yields 1, matching the original.
def binom(n, k)
  return 1 unless k > 0 && k < n
  (1 + n - k..n).inject(:*) / (1..k).inject(:*)
end
|
436
|
+
|
437
|
+
# Build consecutive breakpoint intervals from patient mutation ranges.
# genomic_ranges: [[patientID, mut_start, mut_stop], ...].
# Collects every start/stop coordinate, dedups and sorts them, then pairs
# each coordinate with its successor to define the candidate overlap regions.
def get_reference(genomic_ranges)
  coords = genomic_ranges.map { |gr| gr[1] } + genomic_ranges.map { |gr| gr[2] }
  coords.uniq.sort.each_cons(2).to_a
end
|
452
|
+
|
453
|
+
# For each reference interval, list the unique patients whose mutation range
# overlaps it. The four clauses reproduce the original containment /
# partial-overlap conditions exactly.
def overlap_patients(genomic_ranges, reference)
  reference.map do |start, stop|
    hits = genomic_ranges.select do |_pt_id, pt_start, pt_stop|
      (start <= pt_start && stop >= pt_stop) ||
        (start > pt_start && stop < pt_stop) ||
        (stop > pt_start && stop <= pt_stop) ||
        (start >= pt_start && start < pt_stop)
    end
    hits.map { |pt_id, _s, _e| pt_id }.uniq
  end
end
|
469
|
+
|
470
|
+
# Detect shared overlap regions (SORs) covered by more than pat_per_reg
# patients, per chromosome, labelling each with a node id of the form
# "<chr>.<cluster>.<mutation_type>.<patient count>".
# Returns [patients_by_cluster (patient => [node ids]), sors (region rows)].
def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
  patients_by_cluster = {}
  sors = []
  patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
    reference = get_reference(genomic_ranges) # putative overlap regions
    overlapping_patients = overlap_patients(genomic_ranges, reference) # patients matched to each region
    clust_number = 1
    reference.each_with_index do |ref, i|
      current_patients = overlapping_patients[i]
      next unless current_patients.length > pat_per_reg
      node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
      ref << chrm << node_identifier # annotate the region row in place
      save_sor(current_patients, node_identifier, patients_by_cluster)
      sors << ref
      clust_number += 1
    end
  end
  return patients_by_cluster, sors
end
|
492
|
+
|
493
|
+
# Record the SOR node id against every patient in the cluster.
def save_sor(current_patients, node_identifier, patients_by_cluster)
  current_patients.each { |patient| add_record(patients_by_cluster, patient, node_identifier) }
end
|
498
|
+
|
499
|
+
# Append record to hash[key], creating the bucket on first use and skipping
# duplicates.
def add_record(hash, key, record)
  bucket = hash[key]
  if bucket.nil?
    hash[key] = [record]
  elsif !bucket.include?(record)
    bucket << record
  end
end
|
507
|
+
|
508
|
+
# Load the patient cohort table described by options:
#   :input_file  path to the tab-separated table
#   :header      when true, the first line names the columns
#   :pat_id_col/:hpo_col/:chromosome_col/:start_col/:end_col  column selectors
#                (names when :header is set, numeric indices otherwise)
# Returns [patient_data (unique pat_id => record fields), patient_number].
# Input ids get an '_i<row>' suffix so duplicated ids stay distinguishable.
def load_patient_cohort(options)
  patient_data = {}
  count = 0
  fields2extract = get_fields2extract(options)
  field_numbers = fields2extract.values
  original_ids = []
  File.open(options[:input_file]).each do |line|
    line.chomp!
    if options[:header] && count == 0
      line.gsub!(/#\s*/,'') # correct comment like headers
      field_names = line.split("\t")
      # resolve column names to numeric indices in place
      get_field_numbers2extract(field_names, fields2extract)
      field_numbers = fields2extract.values
    else
      fields = line.split("\t")
      pat_record = field_numbers.map{|n| fields[n]}
      if fields2extract[:pat_id_col].nil?
        pat_id = "pat_#{count}" #generate ids
      else
        original_id = pat_record.shift
        original_ids << original_id
        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
      end
      patient_data[pat_id] = pat_record
    end
    count +=1
  end
  # With generated ids every data row is one patient; otherwise count the
  # distinct input ids.
  # NOTE(review): when :header is set AND ids are generated, `count` also
  # includes the header row — confirm this is intended.
  fields2extract[:pat_id_col].nil? ? patient_number = count : patient_number = original_ids.uniq.length
  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
  return patient_data, patient_number
end
|
539
|
+
|
540
|
+
# Collect the configured column selectors into a field => column hash.
# Without a header row the selectors are converted to integer indices;
# with a header they stay as names, to be resolved later.
def get_fields2extract(options)
  fields2extract = {}
  %i[pat_id_col hpo_col chromosome_col start_col end_col].each do |field|
    col = options[field]
    next if col.nil?
    col = col.to_i unless options[:header]
    fields2extract[field] = col
  end
  fields2extract
end
|
551
|
+
|
552
|
+
# Replace each configured column name in fields2extract with its index in
# field_names (in-place mutation).
def get_field_numbers2extract(field_names, fields2extract)
  fields2extract.each_key do |field|
    fields2extract[field] = field_names.index(fields2extract[field])
  end
end