pets 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
@@ -0,0 +1,144 @@
|
|
1
|
+
# Container for genomic regions (chromosome + start/stop intervals), indexed
# by chromosome. Each region is tagged with a sequential internal id (:to),
# later used to connect overlapping regions into clusters.
class Genomic_Feature
	# If any method uses gen_fet as a parameter name, it expects another Genomic_Feature.

	# feat_array: [[chr, start, stop], [chr, start, stop], ...]
	def initialize(feat_array)
		@regions = {}
		@reg_id = -1
		load_features(feat_array)
	end

	# Index each [chr, start, stop] triplet by chromosome (symbolized),
	# tagging it with the next sequential id under :to.
	def load_features(feat_array)
		feat_array.each do |chr, start, stop|
			chr = chr.to_sym
			region = {start: start, stop: stop, to: @reg_id += 1}
			add_record(@regions, chr, region)
		end
	end

	# NOTE: number of chromosomes holding regions, not the total region count.
	def length
		return @regions.length
	end

	# Yields (chr, region_hash) for every stored region.
	def each()
		@regions.each do |chr, regs|
			regs.each do |region|
				yield(chr, region)
			end
		end
	end

	def get_chr
		return @regions.keys
	end

	# Inclusive length of every region, in storage order.
	def get_sizes
		sizes = []
		each do |chr, region|
			sizes << region[:stop] - region[:start] + 1
		end
		return sizes
	end

	# Returns [[size, count], ...] sorted by count, most frequent first.
	def get_summary_sizes
		sizes = Hash.new(0)
		each do |chr, region|
			sizes[region[:stop] - region[:start] + 1] += 1
		end
		# BUGFIX: the original sorted with {|s| s[1] <=> s[1]}, comparing each
		# element against itself, so the list was never actually ordered.
		return sizes.to_a.sort{|s1, s2| s2[1] <=> s1[1]}
	end

	# Absorb the regions of another Genomic_Feature. When 'to' is given, the
	# incoming regions are connected "to" that id; otherwise each gets a fresh id.
	def merge(gen_fet, to = nil)
		gen_fet.each do |chr, region|
			region[:to] = to.nil? ? @reg_id += 1 : to # handle id custom or default
			add_record(@regions, chr, region)
		end
	end

	# For each [start, stop] window in reference, collect the ids (:to) of the
	# genomic_ranges overlapping it. Returns one uniq id array per window.
	def get_reference_overlaps(genomic_ranges, reference)
		overlaps = []
		reference.each do |start, stop|
			reg_ids = []
			genomic_ranges.each do |reg|
				reg_ids << reg[:to] if coor_overlap?(start, stop, reg)
			end
			overlaps << reg_ids.uniq
		end
		return overlaps
	end

	# Cluster regions that share a genome window held by more than ids_per_reg
	# distinct region ids. Returns:
	#   ids_by_cluster:     region id => [cluster ids]
	#   annotated_full_ref: [start, stop, chr, cluster_id] per cluster window
	def generate_cluster_regions(meth, tag, ids_per_reg = 1)
		compute_windows(meth) # Get putative genome windows
		ids_by_cluster = {}
		annotated_full_ref = [] # All reference windows with uniq id and chr tagged
		@regions.each do |chr, regs|
			reference = @windows[chr]
			overlaps = get_reference_overlaps(regs, reference) # region ids per window
			clust_numb = 0
			reference.each_with_index do |ref, i|
				current_ids = overlaps[i]
				next if current_ids.length <= ids_per_reg
				clust_id = "#{chr}.#{clust_numb += 1}.#{tag}.#{current_ids.length}"
				current_ids.each do |curr_id|
					add_record(ids_by_cluster, curr_id, clust_id, true)
				end
				annotated_full_ref << ref.dup.concat([chr, clust_id])
			end
		end
		return ids_by_cluster, annotated_full_ref
	end

	# Build per-chromosome windows with the strategy named by meth
	# (only :reg_overlap is implemented; other values leave nil windows).
	def compute_windows(meth)
		@windows = {}
		@regions.each do |chr, regs|
			chr_windows = nil
			chr_windows = compute_region_overlap_windows(regs) if meth == :reg_overlap
			@windows[chr] = chr_windows
		end
	end

	private

	# Append record under hash[key]; when uniq is true, skip duplicates.
	def add_record(hash, key, record, uniq = false)
		query = hash[key]
		if query.nil?
			hash[key] = [record]
		elsif !uniq # repeated entries are acceptable
			query << record
		elsif !query.include?(record) # we want uniq entries
			query << record
		end
	end

	# Split the chromosome into consecutive [start, stop] windows bounded by
	# every distinct region start/stop coordinate.
	def compute_region_overlap_windows(genomic_ranges)
		reference = []
		reference.concat(genomic_ranges.map{|gr| gr[:start]}) # get start
		reference.concat(genomic_ranges.map{|gr| gr[:stop]})  # get stop
		reference.uniq!
		reference.sort!
		# Define overlap ranges
		final_reference = []
		reference.each_with_index do |coord, i|
			next_coord = reference[i + 1]
			final_reference << [coord, next_coord] if !next_coord.nil?
		end
		return final_reference
	end

	# True when [start, stop] overlaps reg ({start:, stop:}). Intervals that
	# only touch at a single boundary coordinate do not count as overlapping.
	def coor_overlap?(start, stop, reg)
		overlap = false
		reg_start = reg[:start]
		reg_stop = reg[:stop]
		if (start <= reg_start && stop >= reg_stop) ||
			(start > reg_start && stop < reg_stop) ||
			(stop > reg_start && stop <= reg_stop) ||
			(start >= reg_start && start < reg_stop)
			overlap = true
		end
		return overlap
	end
end
|
data/lib/pets/io.rb
ADDED
@@ -0,0 +1,457 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
# Build an Ontology object either from a precomputed JSON dump ('.json' in the
# filename) or from a raw ontology file, optionally removing the terms listed
# in excluded_hpo_file.
def load_hpo_ontology(hpo_file, excluded_hpo_file)
	if hpo_file.include?('.json')
		hpo = Ontology.new
		hpo.read(hpo_file)
		if !excluded_hpo_file.nil?
			hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
			hpo.remove_removable()
			hpo.build_index()
		end
	else
		if excluded_hpo_file.nil?
			hpo = Ontology.new(file: hpo_file, load_file: true)
		else
			hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
		end
	end
	return hpo
end
|
22
|
+
|
23
|
+
# Read a file with one excluded HPO code per line into an array.
# Uses File.foreach so the descriptor is closed deterministically
# (the original File.open(...).each left it open until GC).
def read_excluded_hpo_file(file)
	excluded_hpo = []
	File.foreach(file) do |line|
		excluded_hpo << line.chomp
	end
	return excluded_hpo
end
|
30
|
+
|
31
|
+
# Dump a {key => array} hash as TSV, one key per row, preceded by an
# optional header row.
def write_hash(hash, file_path, header = [])
	File.open(file_path, 'w') do |out|
		out.puts(header.join("\t")) unless header.empty?
		hash.each { |key, array| out.puts "#{key}\t#{array.join("\t")}" }
	end
end
|
39
|
+
|
40
|
+
# Dump an array as lines: string records verbatim, other records joined
# with tabs.
def write_array(array, file_path)
	File.open(file_path, 'w') do |handler|
		array.each do |record|
			# is_a? instead of `record.class == String`: also accepts String subclasses.
			line = record.is_a?(String) ? record : record.join("\t")
			handler.puts line
		end
	end
end
|
52
|
+
|
53
|
+
# Write a matrix as TSV for R: column names in the first row, each data row
# prefixed with its row name.
def write_matrix_for_R(matrix, x_names, y_names, file)
	File.open(file, 'w') do |out|
		out.puts x_names.join("\t")
		matrix.each_with_index do |row, idx|
			out.puts([y_names[idx], *row].join("\t"))
		end
	end
end
|
61
|
+
|
62
|
+
# Write one TSV row per profile IC value, grouped by cluster. Row ids are
# "<cluster size>_<cluster index>". Stops once limit clusters were written.
def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
	File.open(cluster_ic_data_file, 'w') do |out|
		out.puts %w[cluster_id ic Plen].join("\t")
		all_ics.each_with_index do |cluster_ics, cl_idx|
			break if cl_idx == limit
			cluster_ics.each_with_index do |ic, member_idx|
				out.puts "#{cluster_ics.length}_#{cl_idx}\t#{ic}\t#{profile_lengths[cl_idx][member_idx]}"
			end
		end
	end
end
|
74
|
+
|
75
|
+
# Write per-cluster chromosome counts as TSV. Consecutive rows sharing a
# cluster_id belong to the same cluster; the running cluster index only
# advances when the id changes, and writing stops at limit clusters.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
	File.open(cluster_chromosome_data_file, 'w') do |out|
		out.puts %w[cluster_id chr count].join("\t")
		cluster_index = 0
		previous_id = cluster_data.first.first unless cluster_data.empty?
		cluster_data.each do |cluster_id, patient_number, chr, count|
			cluster_index += 1 unless cluster_id == previous_id
			break if cluster_index == limit
			out.puts ["#{patient_number}_#{cluster_index}", chr, count].join("\t")
			previous_id = cluster_id
		end
	end
end
|
88
|
+
|
89
|
+
# Write [chr, position, freq] triplets as a TSV coverage track.
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
	File.open(coverage_to_plot_file, 'w') do |out|
		coverage_to_plot.each { |chr, position, freq| out.puts [chr, position, freq].join("\t") }
	end
end
|
96
|
+
|
97
|
+
|
98
|
+
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
|
99
|
+
CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
|
100
|
+
suggested_childs.each do |pat_id, suggestions|
|
101
|
+
warning = nil
|
102
|
+
warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
|
103
|
+
csv << ["PATIENT #{pat_id}", "#{warning}"]
|
104
|
+
csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
|
105
|
+
suggestions.each do |parent, childs|
|
106
|
+
parent_code, parent_name = parent
|
107
|
+
if childs.empty?
|
108
|
+
csv << ["#{parent_name} (#{parent_code})", '-']
|
109
|
+
else
|
110
|
+
parent_writed = false
|
111
|
+
childs.each do |child_code, child_name|
|
112
|
+
if !parent_writed
|
113
|
+
parent_field = "#{parent_name} (#{parent_code})"
|
114
|
+
parent_writed = true
|
115
|
+
else
|
116
|
+
parent_field = ""
|
117
|
+
end
|
118
|
+
csv << [parent_field, "#{child_name} (#{child_code})"]
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
csv << ["", ""]
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
# Write two parallel arrays as a two-column TSV for scatterplots.
# Raises when y_axis_value is shorter than x_axis_value.
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
	File.open(filename, 'w') do |out|
		out.puts [x_axis_name, y_axis_name].join("\t")
		x_axis_value.each_with_index do |x_val, idx|
			y_val = y_axis_value[idx]
			raise("The #{idx} position is not presented in y_axis_value") if y_val.nil?
			out.puts "#{x_val}\t#{y_val}"
		end
	end
end
|
137
|
+
|
138
|
+
|
139
|
+
# Write a matrix (array of row arrays) as headerless TSV.
def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
	File.open(similarity_matrix_file, 'w') do |out|
		similarity_matrix.each { |row| out.puts row.join("\t") }
	end
end
|
146
|
+
|
147
|
+
# Flatten a nested {a => {b => value}} structure into "a<TAB>b<TAB>value" lines.
def write_profile_pairs(similarity_pairs, filename)
	File.open(filename, 'w') do |out|
		similarity_pairs.each do |pairsA, pairsB_and_values|
			pairsB_and_values.each { |pairsB, values| out.puts [pairsA, pairsB, values].join("\t") }
		end
	end
end
|
156
|
+
|
157
|
+
# Write the HPO-per-patient average distribution as a two-column TSV.
def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
	File.open(output_file, 'w') do |out|
		out.puts "PatientsNumber\tHPOAverage"
		average_hp_per_pat_distribution.each do |patient_num, ave|
			out.puts "#{patient_num}\t#{ave}"
		end
	end
end
|
165
|
+
|
166
|
+
# Parse a patientID<TAB>clusterID file, attaching each patient's HPO profile
# via patient_data.get_profile. Returns [clusters_table, clusters_info]:
#   clusters_info:  clusterID => {patientID => profile}
#   clusters_table: [clusterID, n_patients, patient_ids, profiles] rows
def parse_clusters_file(clusters_file, patient_data)
	clusters_info = {}
	File.foreach(clusters_file) do |line| # foreach closes the handle (File.open(...).each leaked it)
		patientID, clusterID = line.chomp.split("\t")
		patientHPOProfile = patient_data.get_profile(patientID)
		(clusters_info[clusterID] ||= {})[patientID] = patientHPOProfile
	end
	clusters_table = clusters_info.map do |clusterID, patients_info|
		[clusterID, patients_info.keys.length, patients_info.keys, patients_info.values]
	end
	return clusters_table, clusters_info
end
|
186
|
+
|
187
|
+
# Load {id => HPO profile} from a TSV file (id <TAB> comma-separated HPO ids).
# Profiles are validated (hpo.check_ids) and cleaned (hpo.clean_profile);
# ids whose profile ends up empty are skipped; rejected terms are discarded.
def load_profiles(file_path, hpo)
	profiles = {}
	File.foreach(file_path) do |line| # foreach closes the handle (File.open(...).each leaked it)
		id, profile = line.chomp.split("\t")
		hpos = profile.split(',').map{|a| a.to_sym}
		hpos, rejected_hpos = hpo.check_ids(hpos)
		if !hpos.empty?
			hpos = hpo.clean_profile(hpos)
			profiles[id] = hpos if !hpos.empty?
		end
	end
	return profiles
end
|
201
|
+
|
202
|
+
# Load one Genomic_Feature per '*.tab' file in variant_folder, keyed by the
# file basename. Each line gives chr<TAB>position; variants are stored as
# single-position regions (start == stop).
def load_variants(variant_folder)
	variants = {}
	Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
		vars = []
		File.foreach(path) do |line| # foreach closes the handle (File.open(...).each leaked it)
			fields = line.chomp.split("\t")
			chr = fields[0]
			start = fields[1].to_i
			vars << [chr, start, start]
		end
		variants[File.basename(path, '.tab')] = Genomic_Feature.new(vars)
	end
	return variants
end
|
217
|
+
|
218
|
+
# Gather evidence data from a folder: '*.coords' files give genomic
# coordinates per entity and '*_HP.txt' files give HPO profiles plus labels.
# Returns [evidences, genomic_coordinates].
def load_evidences(evidences_path, hpo)
	genomic_coordinates = {}
	Dir.glob(File.join(evidences_path, '*.coords')).each do |cd_f|
		entity = File.basename(cd_f, '.coords')
		genomic_coordinates[entity] = load_coordinates(cd_f)
	end
	evidences = {}
	Dir.glob(File.join(evidences_path, '*_HP.txt')).each do |e_f|
		profiles, id2label = load_evidence_profiles(e_f, hpo)
		evidences[File.basename(e_f, '.txt')] = {prof: profiles, id2lab: id2label}
	end
	return evidences, genomic_coordinates
end
|
235
|
+
|
236
|
+
# Load {entity => [chr, start, stop, strand]} from a headered TSV
# (entity chr strand start stop). The first line is skipped as a header.
def load_coordinates(file_path)
	coordinates = {}
	header = true
	File.foreach(file_path) do |line| # foreach closes the handle (File.open(...).each leaked it)
		fields = line.chomp.split("\t")
		if header
			header = false
		else
			entity, chr, strand, start, stop = fields
			coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
		end
	end
	return coordinates
end
|
250
|
+
|
251
|
+
# Load evidence profiles from a TSV (id <TAB> label <TAB> comma-separated HPO
# ids). Returns [profiles, id2label]; profiles are validated and cleaned with
# the hpo object, and labels are only stored for ids passing validation.
def load_evidence_profiles(file_path, hpo)
	profiles = {}
	id2label = {}
	File.foreach(file_path) do |line| # foreach closes the handle (File.open(...).each leaked it)
		id, label, profile = line.chomp.split("\t")
		hpos = profile.split(',').map{|a| a.to_sym}
		hpos, rejected_hpos = hpo.check_ids(hpos)
		if !hpos.empty?
			hpos = hpo.clean_profile(hpos)
			profiles[id] = hpos if !hpos.empty?
			id2label[id] = label
		end
	end
	return profiles, id2label
end
|
267
|
+
|
268
|
+
#Common methods for predictors
|
269
|
+
#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
|
270
|
+
#1. Indexing by chr (region)
|
271
|
+
# True when the interval [start, stop] overlaps the reference interval
# [ref_start, ref_stop]. The boundary handling is deliberately asymmetric
# (kept exactly as before): intervals that only touch at one coordinate
# (stop == ref_start, or start == ref_stop) do not count as overlapping.
def coor_overlap?(ref_start, ref_stop, start, stop)
	contains   = start <= ref_start && stop >= ref_stop
	contained  = start > ref_start && stop < ref_stop
	right_edge = stop > ref_start && stop <= ref_stop
	left_edge  = start >= ref_start && start < ref_stop
	return contains || contained || right_edge || left_edge
end
|
281
|
+
|
282
|
+
# Index training rows (see loadFile) by chromosome: the chromosome is removed
# from each row and used as the hash key.
def load_training_file4regions(training_file)
	training_set = {}
	loadFile(training_file).each do |info|
		chr = info.shift
		(training_set[chr] ||= []) << info
	end
	return training_set
end
|
296
|
+
|
297
|
+
#2. Indexing by hpo (code)
|
298
|
+
#prepare training file for analysis using phenotype2region prediction
|
299
|
+
# Index training rows (see loadFile) by HPO code: the code at position 4 is
# removed from each row and used as the hash key. Rows below thresold are
# filtered by loadFile. (Parameter name kept for caller compatibility.)
def load_training_file4HPO(training_file, thresold=0)
	training_set = {}
	loadFile(training_file, thresold).each do |info|
		hpoCode = info.delete_at(4)
		(training_set[hpoCode] ||= []) << info
	end
	return training_set
end
|
314
|
+
|
315
|
+
|
316
|
+
#3. Load training info file:
|
317
|
+
#Chr;Start;Stop;HPO;Association;node
|
318
|
+
# Load a training association file (Chr Start Stop HPO Association node),
# keeping rows whose association value is >= thresold.
# Returns rows as [chr, start, stop, nodeID, hpoCode, associationValue].
# (camelCase name and 'thresold' spelling kept for caller compatibility.)
def loadFile(file, thresold=0)
	information = []
	File.foreach(file) do |line| # foreach closes the handle (File.open(...).each leaked it)
		allInfo = line.chomp.split("\t")
		associationValue = allInfo[4].to_f
		if associationValue >= thresold
			chr = allInfo[0]
			startPos = allInfo[1].to_i
			stopPos = allInfo[2].to_i
			hpoCode = allInfo[3]
			nodeID = allInfo[5]
			information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
		end
	end
	return information
end
|
335
|
+
|
336
|
+
# Load {hpo_code (Symbol) => information coefficient (Float)} from a
# two-column TSV file.
def load_hpo_ci_values(information_coefficient_file)
	hpos_ci_values = {}
	File.foreach(information_coefficient_file) do |line| # foreach closes the handle
		hpo_code, ci = line.chomp.split("\t")
		hpos_ci_values[hpo_code.to_sym] = ci.to_f
	end
	return hpos_ci_values
end
|
345
|
+
|
346
|
+
# Load {cluster_id => [patient ids]} from a patient<TAB>cluster TSV file.
def load_clustered_patients(file)
	clusters = {}
	File.foreach(file) do |line| # foreach closes the handle (File.open(...).each leaked it)
		pat_id, cluster_id = line.chomp.split("\t")
		(clusters[cluster_id] ||= []) << pat_id
	end
	return clusters
end
|
360
|
+
|
361
|
+
# Parse a gzipped NCBI-style GFF annotation file.
# Returns:
#   gene_list:     geneID => [name, synonyms, description]
#   gene_location: chr => [[geneID, start, stop], ...]
# Lines with 'genome=chromosome' in column 9 switch the current chromosome
# (its name is taken from the second ';'-separated attribute).
def load_gene_data(gene_data_path)
	gene_list = {} # geneID => attr
	gene_location = {} # chr => gene
	current_chr = nil
	genes = []
	# GzipReader.open closes both the gzip stream and the underlying file;
	# the original Kernel#open + GzipReader.new leaked both handles.
	Zlib::GzipReader.open(gene_data_path) do |gz|
		gz.each_line do |line|
			line.chomp!
			next if line =~ /^#/
			fields = line.split("\t")
			if fields[8].include?('genome=chromosome')
				chr = fields[8].split(';')[1].split('=').last
				gene_location[current_chr] = genes
				genes = []
				current_chr = chr
			elsif fields[2] == 'gene'
				attributes = {}
				fields[8].split(';').each do |pair|
					key, value = pair.split('=')
					attributes[key] = value
				end
				geneName = attributes['gene']
				geneSyns = attributes['gene_synonym'].nil? ? [] : attributes['gene_synonym'].split(',')
				description = attributes['description']
				# BUGFIX: URI.unescape was removed in Ruby 3.0; decode the %XX
				# percent-escapes directly (same behavior, '+' left untouched).
				description = description.gsub(/%([0-9A-Fa-f]{2})/) { $1.hex.chr } if !description.nil?
				attributes['Dbxref'] =~ /GeneID:(\d+)/
				gene_list[$1] = [geneName, geneSyns, description]
				genes << [$1, fields[3].to_i, fields[4].to_i]
			end
		end
	end
	gene_location[current_chr] = genes
	return gene_list, gene_location
end
|
397
|
+
|
398
|
+
# Query the KEGG REST API for human ('hsa:') genes in batches of 10 and parse
# the flat-file responses into {geneID => [gene_names, definition, pathways]}.
# NOTE: performs live HTTP requests and consumes query_genes (shift).
def parse_kegg_data(query_genes)
	kegg_data = {} # gene => attb
	until query_genes.empty?
		gene_set = query_genes.shift(10)
		url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
		response = Net::HTTP.get(URI(url))
		geneID = nil
		gene_names = []
		definition = nil
		pathways = []
		parsing_pathway_field = false
		response.squeeze(' ').each_line do |line|
			line.chomp!
			case line
			when /^ENTRY/
				geneID = line.split(' ')[1]
			when /^NAME/
				gene_names = line.split(' ', 2).last.split(', ')
			when /^DEFINITION/
				definition = line.split(' ', 2)[1]
			when /^PATHWAY/
				pathways << line.split(' ', 3)[1..2]
				parsing_pathway_field = true
			when /^BRITE/, /^POSITION/, /^DISEASE/, /^MODULE/, /^DRUG_TARGET/, /^NETWORK/
				# Any of these sections terminates the PATHWAY continuation lines.
				parsing_pathway_field = false
			else
				if parsing_pathway_field
					pathways << line.strip.split(' ', 2)
				elsif line == '///'
					# End of one entry: flush the accumulated record.
					parsing_pathway_field = false
					kegg_data[geneID] = [gene_names, definition, pathways]
					pathways = []
					gene_names = []
				end
			end
		end
	end
	return kegg_data
end
|
435
|
+
|
436
|
+
# Serialize data as JSON and write it gzip-compressed to path.
# GzipWriter.open guarantees the stream is finalized even if serialization
# raises (the original only closed it on the happy path).
def write_compressed_plain_file(data, path)
	Zlib::GzipWriter.open(path) do |gz|
		gz.write data.to_json
	end
end
|
443
|
+
|
444
|
+
# Read a gzip-compressed JSON file and return the parsed object.
# GzipReader.open closes both the gzip stream and the underlying file
# (the original left both handles open until GC).
def read_compressed_json(path)
	object = nil
	Zlib::GzipReader.open(path) do |gz|
		object = JSON.parse(gz.read)
	end
	return object
end
|
450
|
+
|
451
|
+
# Download path from ftp_server (anonymous login) into local file name.
# The connection is now closed even when connect/login/transfer fails.
def download(ftp_server, path, name)
	ftp = Net::FTP.new()
	begin
		ftp.connect(ftp_server)
		ftp.login
		ftp.getbinaryfile(path, name)
	ensure
		ftp.close
	end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# Parser that turns a tabular cohort file (one row per record/variant) into a
# Cohort object plus the lists of rejected ontology terms and records.
class Cohort_Parser
	# Entry point. options keys used: :input_file, :header, :separator, :names,
	# :id_col, :ont_col, :chromosome_col, :start_col, :end_col.
	# Returns [cohort, rejected_terms, rejected_recs].
	def self.load(options)
		fields2extract = get_fields2extract(options)
		field_numbers = fields2extract.values
		records = read_records(options, fields2extract, field_numbers)
		cohort, rejected_terms, rejected_recs = create_cohort(records, options)
		return cohort, rejected_terms, rejected_recs
	end

	# Read the input table into {id => [[terms, chr, start, stop], ...]}.
	# With options[:header], the first line maps column names to indexes.
	def self.read_records(options, fields2extract, field_numbers)
		records = {}
		count = 0
		File.foreach(options[:input_file]) do |line| # foreach closes the handle (File.open(...).each leaked it)
			line.chomp!
			if options[:header] && count == 0
				line.gsub!(/#\s*/,'') # correct comment like headers
				field_names = line.split("\t")
				get_field_numbers2extract(field_names, fields2extract)
				field_numbers = fields2extract.values
			else
				fields = line.split("\t")
				record = field_numbers.map{|n| fields[n]}
				if fields2extract[:id_col].nil?
					id = "rec_#{count}" # generate ids when no id column is configured
				else
					id = record.shift
				end
				if !record[0].nil?
					record[0] = record[0].split(options[:separator])
				else
					record[0] = []
				end
				record[2] = record[2].to_i if !options[:start_col].nil?
				record[3] = record[3].to_i if !options[:end_col].nil?
				query = records[id]
				if query.nil?
					records[id] = [record]
				else
					query << record
				end
			end
			count += 1
		end
		return records
	end

	# Collect configured column ids. Without a header they are numeric column
	# indexes (coerced with to_i); with a header they stay as names until
	# get_field_numbers2extract resolves them.
	def self.get_fields2extract(options)
		fields2extract = {}
		[:id_col, :ont_col, :chromosome_col, :start_col, :end_col].each do |field|
			col = options[field]
			if !col.nil?
				col = col.to_i if !options[:header]
				fields2extract[field] = col
			end
		end
		return fields2extract
	end

	# Resolve header column names to their indexes, in place.
	def self.get_field_numbers2extract(field_names, fields2extract)
		fields2extract.each do |field, name|
			fields2extract[field] = field_names.index(name)
		end
	end

	# Build the Cohort, translating term names when options[:names] is set and
	# dropping records left without any valid term.
	def self.create_cohort(records, options)
		ont = Cohort.get_ontology(Cohort.act_ont)
		rejected_terms = []
		rejected_recs = []
		cohort = Cohort.new()
		records.each do |id, record|
			rec = record.first
			terms = rec.first
			if options[:names]
				init_term_number = terms.length
				terms, rec_rejected_terms = ont.translate_names(terms)
				if !rec_rejected_terms.empty?
					STDERR.puts "WARNING: record #{id} has the unknown term NAMES '#{rec_rejected_terms.join(',')}'. Terms removed."
					rejected_terms.concat(rec_rejected_terms)
				end
				if terms.empty? && init_term_number > 0
					rejected_recs << id
					next
				end
			end
			if rec.length > 1 # there is genomic region attributes
				variants = record.map{|v| v[1..3] }
			else
				variants = [] # Not exists genomic region attributes so we create a empty array
			end
			cohort.add_record([id, terms, check_variants(variants)])
		end
		return cohort, rejected_terms.uniq, rejected_recs
	end

	# Drop variants without a defined chromosome ('-'), warning on STDERR.
	def self.check_variants(vars)
		checked_vars = []
		vars.each do |var| # [chr, start, stop]
			if var.first == '-' # the chr must be defined
				STDERR.puts "WARNING: variant #{var.join(',')} has been removed"
			else
				checked_vars << var
			end
		end
		# BUGFIX: the original returned the unfiltered input (vars), making the
		# filtering above a no-op.
		return checked_vars
	end
end
|
data/lib/pets/version.rb
CHANGED