pets 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/README.md +79 -5
- data/bin/coPatReporter.rb +63 -156
- data/bin/comPatMondo.rb +1 -4
- data/bin/evidence_profiler.rb +38 -151
- data/bin/get_network_nodes.rb +79 -132
- data/bin/get_sorted_profs.rb +25 -36
- data/bin/install_deps.rb +7 -0
- data/bin/paco_translator.rb +29 -72
- data/bin/phen2reg.rb +1 -4
- data/bin/profiles2phenopacket.rb +110 -0
- data/bin/reg2phen.rb +1 -3
- data/example_datasets/associations_file.txt +757 -0
- data/example_datasets/example_patient.txt +6 -0
- data/example_datasets/example_patient_hpos.txt +15 -0
- data/example_datasets/genes.txt +8 -0
- data/example_datasets/hpo2ci.txt +2798 -0
- data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
- data/example_datasets/launch.sh +20 -0
- data/external_code/generate_boxpot.R +51 -21
- data/external_code/get_clusters.R +2 -2
- data/external_code/install_R_dependencies.R +11 -0
- data/external_code/plot_heatmap.R +34 -30
- data/lib/pets/coPatReporterMethods.rb +143 -441
- data/lib/pets/cohort.rb +307 -0
- data/lib/pets/constants.rb +7 -0
- data/lib/pets/generalMethods.rb +8 -317
- data/lib/pets/genomic_features.rb +144 -0
- data/lib/pets/io.rb +457 -0
- data/lib/pets/parsers/cohort_parser.rb +106 -0
- data/lib/pets/version.rb +1 -1
- data/lib/pets.rb +8 -0
- data/pets.gemspec +1 -0
- data/templates/cohort_report.erb +5 -7
- data/templates/patient_report.erb +1 -1
- metadata +34 -3
@@ -0,0 +1,144 @@
|
|
1
|
+
class Genomic_Feature
  # Container of genomic regions indexed by chromosome. Each region is a hash
  # {start:, stop:, to:} where :to is a sequential numeric id assigned at load
  # time. If any method uses gen_fet as a parameter name, it expects another
  # Genomic_Feature object.

  def initialize(feat_array) # [[chr1, start1, stop1],[chr2, start2, stop2]]
    @regions = {}
    @reg_id = -1
    load_features(feat_array)
  end

  # Load [chr, start, stop] triplets into @regions, tagging each with an id.
  def load_features(feat_array)
    feat_array.each do |chr, start, stop|
      chr = chr.to_sym
      region = {start: start, stop: stop, to: @reg_id +=1 }
      add_record(@regions, chr, region)
    end
  end

  # Number of chromosomes holding at least one region.
  # NOTE(review): this is NOT the total region count — confirm callers expect
  # per-chromosome cardinality.
  def length
    return @regions.length
  end

  # Yield every (chr, region) pair.
  def each()
    @regions.each do |chr, regs|
      regs.each do |region|
        yield(chr, region)
      end
    end
  end

  def get_chr
    return @regions.keys
  end

  # Sizes (bp, inclusive of both ends) of all loaded regions.
  def get_sizes
    sizes = []
    each do |chr, region|
      size = region[:stop] - region[:start] + 1
      sizes << size
    end
    return sizes
  end

  # Size histogram as [[size, count], ...] sorted by count, descending.
  def get_summary_sizes
    sizes = Hash.new(0)
    each do |chr, region|
      size = region[:stop] - region[:start] + 1
      sizes[size] += 1
    end
    # BUGFIX: the previous sort block took a single parameter and compared
    # s[1] with itself, so the comparison was always 0 and the sort never
    # reordered anything. Sort descending by occurrence count.
    return sizes.to_a.sort!{|s1, s2| s2[1] <=> s1[1] }
  end

  # Import every region from another Genomic_Feature. When 'to' is given, the
  # regions must be connected "to" the given id; otherwise fresh sequential
  # ids are assigned.
  def merge(gen_fet, to = nil)
    gen_fet.each do |chr, region|
      to.nil? ? region[:to] = @reg_id +=1 : region[:to] = to # handle id custom or default
      add_record(@regions, chr, region)
    end
  end

  # For each [start, stop] window of 'reference', collect the unique :to ids
  # of the regions in 'genomic_ranges' that overlap it.
  def get_reference_overlaps(genomic_ranges, reference)
    overlaps = []
    reference.each do |start, stop|
      reg_ids = []
      genomic_ranges.each do |reg|
        reg_ids << reg[:to] if coor_overlap?(start, stop, reg)
      end
      overlaps << reg_ids.uniq
    end
    return overlaps
  end

  # Cluster regions that share genome windows. A window becomes a cluster
  # when more than 'ids_per_reg' distinct region ids overlap it.
  # Returns [ids_by_cluster, annotated_full_ref]:
  #   ids_by_cluster     - region id => [cluster ids]
  #   annotated_full_ref - reference windows with uniq id and chr tagged
  def generate_cluster_regions(meth, tag, ids_per_reg = 1)
    compute_windows(meth) # Get putative genome windows
    ids_by_cluster = {}
    annotated_full_ref = []
    @regions.each do |chr, regs|
      reference = @windows[chr]
      overlaps = get_reference_overlaps(regs, reference) # See which regions match each window
      clust_numb = 0
      reference.each_with_index do |ref, i|
        current_ids = overlaps[i]
        if current_ids.length > ids_per_reg
          clust_id = "#{chr}.#{clust_numb +=1}.#{tag}.#{current_ids.length}"
          current_ids.each do |curr_id|
            add_record(ids_by_cluster, curr_id, clust_id, true)
          end
          annotated_full_ref << ref.dup.concat([chr, clust_id])
        end
      end
    end
    return ids_by_cluster, annotated_full_ref
  end

  # Build @windows (chr => [[start, stop], ...]) with the given method.
  # Only :reg_overlap is implemented; other values leave nil windows.
  def compute_windows(meth)
    @windows = {}
    @regions.each do |chr, regs|
      chr_windows = nil
      if meth == :reg_overlap
        chr_windows = compute_region_overlap_windows(regs)
      end
      @windows[chr] = chr_windows
    end
  end

  private

  # Append 'record' to hash[key], creating the array on first use.
  # With uniq=true, repeated entries are skipped.
  def add_record(hash, key, record, uniq=false)
    query = hash[key]
    if query.nil?
      hash[key] = [record]
    elsif !uniq # We do not care about repeated entries
      query << record
    elsif !query.include?(record) # We want uniq entries
      query << record
    end
  end

  # Build consecutive windows from the sorted, unique set of all region
  # boundaries (starts and stops).
  def compute_region_overlap_windows(genomic_ranges)
    reference = []
    reference.concat(genomic_ranges.map{|gr| gr[:start]}) # get start
    reference.concat(genomic_ranges.map{|gr| gr[:stop]}) # get stop
    reference.uniq!
    reference.sort!
    # Define overlap ranges
    final_reference = []
    reference.each_with_index do |coord, i|
      next_coord = reference[i + 1]
      final_reference << [coord, next_coord] if !next_coord.nil?
    end
    return final_reference
  end

  # True when [start, stop] overlaps region 'reg' ({start:, stop:}).
  def coor_overlap?(start, stop, reg)
    overlap = false
    reg_start = reg[:start]
    reg_stop = reg[:stop]
    if (start <= reg_start && stop >= reg_stop) ||
       (start > reg_start && stop < reg_stop) ||
       (stop > reg_start && stop <= reg_stop) ||
       (start >= reg_start && start < reg_stop)
      overlap = true
    end
    return overlap
  end
end
|
data/lib/pets/io.rb
ADDED
@@ -0,0 +1,457 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
# Builds an HPO Ontology object from either a raw OBO file or a precomputed
# JSON dump, optionally pruning the terms listed in excluded_hpo_file.
def load_hpo_ontology(hpo_file, excluded_hpo_file)
  if hpo_file.include?('.json')
    # Precomputed ontology: load it, then prune excluded terms if requested.
    hpo = Ontology.new
    hpo.read(hpo_file)
    unless excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  else
    # Raw ontology file: removable terms can be given at construction time.
    hpo = if excluded_hpo_file.nil?
      Ontology.new(file: hpo_file, load_file: true)
    else
      Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    end
  end
  return hpo
end
|
22
|
+
|
23
|
+
# Reads one HPO code per line; returns the array of chomped codes.
def read_excluded_hpo_file(file)
  File.readlines(file).map { |line| line.chomp }
end
|
30
|
+
|
31
|
+
# Writes each hash entry as a tab-separated line: key, then the array values.
# An optional header row is emitted first.
def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |handler|
    handler.puts header.join("\t") unless header.empty?
    hash.each { |key, array| handler.puts "#{key}\t#{array.join("\t")}" }
  end
end
|
39
|
+
|
40
|
+
# Dumps an array to a file, one record per line. String records are written
# verbatim; any other record is tab-joined.
def write_array(array, file_path)
  File.open(file_path, 'w') do |handler|
    array.each do |record|
      handler.puts(record.class == String ? record : record.join("\t"))
    end
  end
end
|
52
|
+
|
53
|
+
# Writes a matrix in the layout R expects: a header row with the column (x)
# names, then one row per line prefixed with its y name.
def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |f|
    f.puts x_names.join("\t")
    matrix.each_with_index do |row, i|
      f.puts [y_names[i], *row].join("\t")
    end
  end
end
|
61
|
+
|
62
|
+
# Dumps per-cluster IC values for plotting: one row per profile term with the
# cluster label (<cluster size>_<index>), the IC and the profile length.
# Stops once 'limit' clusters have been written.
def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |f|
    f.puts ['cluster_id', 'ic', 'Plen'].join("\t")
    all_ics.each_with_index do |cluster_ics, i|
      break if i == limit
      label = "#{cluster_ics.length}_#{i}"
      cluster_ics.each_with_index do |clust_ic, j|
        f.puts [label, clust_ic, profile_lengths[i][j]].join("\t")
      end
    end
  end
end
|
74
|
+
|
75
|
+
# Writes chromosome counts per cluster for plotting. Clusters are renumbered
# with a running index that advances whenever the incoming cluster id
# changes; at most 'limit' clusters are emitted.
def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |f|
    f.puts ['cluster_id', 'chr', 'count'].join("\t")
    index = 0
    last_id = cluster_data.first.first unless cluster_data.empty?
    cluster_data.each do |cluster_id, patient_number, chr, count|
      index += 1 unless cluster_id == last_id
      break if index == limit
      f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
      last_id = cluster_id
    end
  end
end
|
88
|
+
|
89
|
+
# One tab-separated line per (chr, position, freq) triplet.
def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |f|
    coverage_to_plot.each do |chr, position, freq|
      f.puts [chr, position, freq].join("\t")
    end
  end
end
|
96
|
+
|
97
|
+
|
98
|
+
# Writes a per-patient CSV report pairing each observed HPO term with the
# more specific child terms suggested for it. Patients with fewer than 4
# suggestions get a warning marker on their header row.
# NOTE(review): summary_stats is accepted but never used in this body —
# confirm whether it can be dropped from callers.
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = nil
      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
      csv << ["PATIENT #{pat_id}", "#{warning}"]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |parent, childs|
        parent_code, parent_name = parent
        if childs.empty?
          # No more specific terms available for this parent.
          csv << ["#{parent_name} (#{parent_code})", '-']
        else
          # Emit the parent label only on its first child row; later rows
          # leave the first column blank.
          parent_writed = false
          childs.each do |child_code, child_name|
            if !parent_writed
              parent_field = "#{parent_name} (#{parent_code})"
              parent_writed = true
            else
              parent_field = ""
            end
            csv << [parent_field, "#{child_name} (#{child_code})"]
          end
        end
      end
      # Blank separator row between patients.
      csv << ["", ""]
    end
  end
end
|
126
|
+
|
127
|
+
# Writes paired x/y values as a two-column TSV for scatter plotting.
# Raises when y_axis_value has no value at some x position.
def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |f|
    f.puts "#{x_axis_name}\t#{y_axis_name}"
    x_axis_value.each_with_index do |value, i|
      y_value = y_axis_value[i]
      raise("The #{i} position is not presented in y_axis_value") if y_value.nil?
      f.puts "#{value}\t#{y_value}"
    end
  end
end
|
137
|
+
|
138
|
+
|
139
|
+
# Plain TSV dump of a matrix: one row per line, no header.
def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
  File.open(similarity_matrix_file, 'w') do |f|
    similarity_matrix.each { |row| f.puts row.join("\t") }
  end
end
|
146
|
+
|
147
|
+
# Flattens a nested {a => {b => value}} map into "a<TAB>b<TAB>value" lines.
def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |f|
    similarity_pairs.each do |pairsA, pairsB_and_values|
      pairsB_and_values.each do |pairsB, values|
        f.puts [pairsA, pairsB, values].join("\t")
      end
    end
  end
end
|
156
|
+
|
157
|
+
# Writes the average number of HPO terms per patient-count bucket as a TSV.
def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |f|
    f.puts "PatientsNumber\tHPOAverage" # plain literal (was pointless interpolation)
    average_hp_per_pat_distribution.each do |patient_num, ave|
      f.puts [patient_num, ave].join("\t")
    end
  end
end
|
165
|
+
|
166
|
+
# Reads a "patientID<TAB>clusterID" file and groups patient HPO profiles by
# cluster. Returns a summary table ([clusterID, size, ids, profiles]) and the
# raw cluster => {patient => profile} map.
def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  File.open(clusters_file).each do |line|
    patientID, clusterID = line.chomp.split("\t")
    patientHPOProfile = patient_data.get_profile(patientID)
    bucket = clusters_info[clusterID]
    if bucket.nil?
      clusters_info[clusterID] = {patientID => patientHPOProfile}
    else
      bucket[patientID] = patientHPOProfile
    end
  end
  clusters_table = clusters_info.map do |clusterID, patients_info|
    [clusterID, patients_info.keys.length, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end
|
186
|
+
|
187
|
+
# Loads "id<TAB>hpo1,hpo2,..." profiles, validating (check_ids) and cleaning
# (clean_profile) the HPO codes against the given ontology. Profiles that end
# up empty are skipped.
def load_profiles(file_path, hpo)
  profiles = {}
  File.open(file_path).each do |line|
    id, profile = line.chomp.split("\t")
    hpos = profile.split(',').map(&:to_sym)
    hpos, _rejected_hpos = hpo.check_ids(hpos)
    next if hpos.empty?
    hpos = hpo.clean_profile(hpos)
    profiles[id] = hpos unless hpos.empty?
  end
  return profiles
end
|
201
|
+
|
202
|
+
# Builds one Genomic_Feature per "*.tab" file under variant_folder. Each line
# holds "chr<TAB>position"; variants are stored as single-base regions
# [chr, pos, pos], keyed by the file basename.
def load_variants(variant_folder)
  variants = {}
  Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
    vars = File.open(path).map do |line|
      chr, pos = line.chomp.split("\t")
      pos = pos.to_i
      [chr, pos, pos]
    end
    variants[File.basename(path, '.tab')] = Genomic_Feature.new(vars)
  end
  return variants
end
|
217
|
+
|
218
|
+
# Scans evidences_path for coordinate files (*.coords) and HPO evidence
# profile files (*_HP.txt). Returns evidence profiles ({pair => {prof:,
# id2lab:}}) and genomic coordinates, both keyed by names derived from the
# filenames.
def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  Dir.glob(File.join(evidences_path, '*.coords')).each do |cd_f|
    genomic_coordinates[File.basename(cd_f, '.coords')] = load_coordinates(cd_f)
  end
  evidences = {}
  Dir.glob(File.join(evidences_path, '*_HP.txt')).each do |e_f|
    profiles, id2label = load_evidence_profiles(e_f, hpo)
    evidences[File.basename(e_f, '.txt')] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end
|
235
|
+
|
236
|
+
# Parses a coordinates TSV (entity, chr, strand, start, stop), skipping the
# first (header) line. Returns entity => [chr, start, stop, strand].
def load_coordinates(file_path)
  coordinates = {}
  File.open(file_path).each_with_index do |line, i|
    next if i == 0 # skip header line
    entity, chr, strand, start, stop = line.chomp.split("\t")
    coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
  end
  return coordinates
end
|
250
|
+
|
251
|
+
# Like load_profiles but for "id<TAB>label<TAB>hpo1,hpo2,..." files.
# Returns the cleaned profiles plus an id => label map (labels are recorded
# whenever the id passed validation, even if cleaning emptied the profile).
def load_evidence_profiles(file_path, hpo)
  profiles = {}
  id2label = {}
  File.open(file_path).each do |line|
    id, label, profile = line.chomp.split("\t")
    hpos = profile.split(',').map(&:to_sym)
    hpos, _rejected = hpo.check_ids(hpos)
    next if hpos.empty?
    hpos = hpo.clean_profile(hpos)
    profiles[id] = hpos unless hpos.empty?
    id2label[id] = label
  end
  return profiles, id2label
end
|
267
|
+
|
268
|
+
#Common methods for predictors
|
269
|
+
#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
|
270
|
+
#1. Indexing by chr (region)
|
271
|
+
# True when [start, stop] overlaps the reference interval [ref_start, ref_stop].
# Boundary semantics are asymmetric on purpose: an interval whose stop equals
# ref_start (or whose start equals ref_stop) does NOT count as overlapping.
def coor_overlap?(ref_start, ref_stop, start, stop)
  stop_inside = stop > ref_start && stop <= ref_stop
  start_inside = start >= ref_start && start < ref_stop
  covers = start <= ref_start && stop >= ref_stop
  contained = start > ref_start && stop < ref_stop
  return stop_inside || start_inside || covers || contained
end
|
281
|
+
|
282
|
+
# Groups the records returned by loadFile by chromosome (the first field of
# each record, removed while grouping).
def load_training_file4regions(training_file)
  training_set = {}
  loadFile(training_file).each do |info|
    chr = info.shift
    (training_set[chr] ||= []) << info
  end
  return training_set
end
|
296
|
+
|
297
|
+
#2. Indexing by hpo (code)
|
298
|
+
#prepare training file for analysis using phenotype2region prediction
|
299
|
+
# Groups loadFile records by HPO code (removed from position 4 of each
# record). Rows below 'thresold' are already filtered out by loadFile.
def load_training_file4HPO(training_file, thresold=0)
  training_set = {}
  loadFile(training_file, thresold).each do |info|
    hpoCode = info.delete_at(4)
    (training_set[hpoCode] ||= []) << info
  end
  return training_set
end
|
314
|
+
|
315
|
+
|
316
|
+
#3. Load training info file:
|
317
|
+
#Chr;Start;Stop;HPO;Association;node
|
318
|
+
# Parses a training file (Chr;Start;Stop;HPO;Association;node, tab-separated)
# keeping rows whose association value is >= thresold. Rows are returned as
# [chr, start, stop, node, hpo, association].
def loadFile(file, thresold=0)
  information = []
  File.open(file).each do |line|
    allInfo = line.chomp.split("\t")
    associationValue = allInfo[4].to_f
    next if associationValue < thresold
    information << [allInfo[0], allInfo[1].to_i, allInfo[2].to_i, allInfo[5], allInfo[3], associationValue]
  end
  return information
end
|
335
|
+
|
336
|
+
# Reads "HPO_code<TAB>ci" lines into a {hpo_symbol => ci_float} hash.
def load_hpo_ci_values(information_coefficient_file)
  File.readlines(information_coefficient_file).each_with_object({}) do |line, ci_map|
    code, ci = line.chomp.split("\t")
    ci_map[code.to_sym] = ci.to_f
  end
end
|
345
|
+
|
346
|
+
# Reads "patient<TAB>cluster" lines into a {cluster_id => [patient_ids]} hash.
def load_clustered_patients(file)
  clusters = {}
  File.open(file).each do |line|
    pat_id, cluster_id = line.chomp.split("\t")
    (clusters[cluster_id] ||= []) << pat_id
  end
  return clusters
end
|
360
|
+
|
361
|
+
# Parses a gzip-compressed NCBI-style GFF gene annotation file.
# Returns [gene_list, gene_location]:
#   gene_list     - GeneID => [name, synonyms, description]
#   gene_location - chromosome => [[GeneID, start, stop], ...]
# NOTE(review): the file handle / GzipReader are never closed here; and
# URI.unescape was removed in Ruby 3.0 — this needs a replacement (e.g.
# CGI.unescape) to run on modern Rubies. TODO confirm target Ruby version.
def load_gene_data(gene_data_path)
  gene_list = {} #geneID => attr
  gene_location = {} # chr => gene
  infile = open(gene_data_path)
  gz = Zlib::GzipReader.new(infile)
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
    next if line =~ /^#/ # skip GFF comment/pragma lines
    fields = line.split("\t")
    if fields[8].include?('genome=chromosome')
      # Region record announcing a new chromosome: flush the genes collected
      # for the previous chromosome and start a new bucket.
      chr = fields[8].split(';')[1].split('=').last
      gene_location[current_chr] = genes
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      # Parse the attribute column (key=value pairs separated by ';').
      attributes = {}
      fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
      end
      geneName = nil
      geneName = attributes['gene'] if !attributes['gene'].nil?
      geneSyns = []
      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
      description = attributes['description']
      description = URI.unescape(description) if !description.nil?
      # Extract the numeric GeneID from the Dbxref attribute ($1).
      attributes['Dbxref'] =~ /GeneID:(\d+)/
      gene_list[$1] = [geneName, geneSyns, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  # Flush the genes of the last chromosome seen.
  gene_location[current_chr] = genes
  return gene_list, gene_location
end
|
397
|
+
|
398
|
+
# Fetches gene annotations from the KEGG REST API in batches of 10 (the API
# limit per request). Returns geneID => [gene_names, definition, pathways].
# NOTE(review): destructively consumes query_genes via shift — callers keep
# an emptied array. Performs network I/O (Net::HTTP).
def parse_kegg_data(query_genes)
  kegg_data = {} #gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10)
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    # KEGG flat-file format: records separated by '///', fields keyed by an
    # uppercase tag in the first column; continuation lines are indented.
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        # Any other field tag ends the PATHWAY continuation block.
        parsing_pathway_field = false
      elsif parsing_pathway_field
        # Indented continuation line of the PATHWAY field.
        pathways << line.strip.split(' ', 2)
      elsif line == '///'
        # End of one gene record: store it and reset the accumulators.
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end
|
435
|
+
|
436
|
+
# Serializes 'data' as JSON and writes it gzip-compressed to 'path'.
def write_compressed_plain_file(data, path)
  File.open(path, 'w') do |file_handler|
    writer = Zlib::GzipWriter.new(file_handler)
    writer.write(data.to_json)
    writer.close # flush the gzip trailer before the File block closes
  end
end
|
443
|
+
|
444
|
+
# Reads a gzip-compressed JSON file and returns the parsed object.
# BUGFIX: the previous version opened the file and a GzipReader without ever
# closing them (file-descriptor leak); the block form of
# Zlib::GzipReader.open closes both deterministically.
def read_compressed_json(path)
  object = nil
  Zlib::GzipReader.open(path) { |gz| object = JSON.parse(gz.read) }
  return object
end
|
450
|
+
|
451
|
+
# Downloads 'path' from the given FTP server (anonymous login) into the
# local file 'name' using binary transfer mode.
def download(ftp_server, path, name)
  connection = Net::FTP.new
  connection.connect(ftp_server)
  connection.login # anonymous login (no credentials)
  connection.getbinaryfile(path, name)
  connection.close
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
class Cohort_Parser
  # Loads a cohort file according to the column options given, returning the
  # built Cohort plus the ontology terms and record ids that were rejected.
  def self.load(options)
    fields2extract = get_fields2extract(options)
    field_numbers = fields2extract.values
    records = read_records(options, fields2extract, field_numbers)
    cohort, rejected_terms, rejected_recs = create_cohort(records, options)
    return cohort, rejected_terms, rejected_recs
  end

  # Reads the tab-separated input file into id => [records]. When a header is
  # present, column names are resolved to indexes from the first line. Each
  # record is [terms_array, chr, start, stop] (genomic fields optional).
  def self.read_records(options, fields2extract, field_numbers)
    records = {}
    count = 0
    File.open(options[:input_file]).each do |line|
      line.chomp!
      if options[:header] && count == 0
        line.gsub!(/#\s*/,'') # correct comment like headers
        field_names = line.split("\t")
        get_field_numbers2extract(field_names, fields2extract)
        field_numbers = fields2extract.values
      else
        fields = line.split("\t")
        record = field_numbers.map{|n| fields[n]}
        if fields2extract[:id_col].nil?
          id = "rec_#{count}" #generate ids when no id column is configured
        else
          id = record.shift
        end
        # First remaining field holds the ontology terms (separator-joined).
        if !record[0].nil?
          record[0] = record[0].split(options[:separator])
        else
          record[0] = []
        end
        record[2] = record[2].to_i if !options[:start_col].nil?
        record[3] = record[3].to_i if !options[:end_col].nil?
        query = records[id]
        if query.nil?
          records[id] = [record]
        else
          query << record
        end
      end
      count +=1
    end
    return records
  end

  # Maps the configured column options to their raw values; with a header the
  # values are column names (resolved later), without one they are indexes.
  def self.get_fields2extract(options)
    fields2extract = {}
    [:id_col, :ont_col, :chromosome_col, :start_col, :end_col].each do |field|
      col = options[field]
      if !col.nil?
        col = col.to_i if !options[:header]
        fields2extract[field] = col
      end
    end
    return fields2extract
  end

  # Resolves header column names to their numeric indexes (in place).
  def self.get_field_numbers2extract(field_names, fields2extract)
    fields2extract.each do |field, name|
      fields2extract[field] = field_names.index(name)
    end
  end

  # Builds a Cohort from the parsed records, translating term names to codes
  # when options[:names] is set. Returns the cohort plus the unique rejected
  # term names and the ids of records left with no valid terms.
  def self.create_cohort(records, options)
    ont = Cohort.get_ontology(Cohort.act_ont)
    rejected_terms = []
    rejected_recs = []
    cohort = Cohort.new()
    records.each do |id, record|
      rec = record.first
      terms = rec.first
      if options[:names]
        init_term_number = terms.length
        terms, rec_rejected_terms = ont.translate_names(terms)
        if !rec_rejected_terms.empty?
          STDERR.puts "WARNING: record #{id} has the unknown term NAMES '#{rec_rejected_terms.join(',')}'. Terms removed."
          rejected_terms.concat(rec_rejected_terms)
        end
        if terms.empty? && init_term_number > 0
          rejected_recs << id
          next
        end
      end
      if rec.length > 1 # there is genomic region attributes
        variants = record.map{|v| v[1..3] }
      else
        variants = [] # Not exists genomic region attributes so we create a empty array
      end
      cohort.add_record([id, terms, check_variants(variants)])
    end
    return cohort, rejected_terms.uniq, rejected_recs
  end

  # Drops variants whose chromosome is undefined ('-'), warning for each.
  def self.check_variants(vars)
    checked_vars = []
    vars.each do |var| #[chr, start, stop]
      if var.first == '-' # the chr must be defined
        STDERR.puts "WARNING: variant #{var.join(',')} has been removed"
      else
        checked_vars << var
      end
    end
    # BUGFIX: previously returned the unfiltered 'vars', so variants without
    # a chromosome were warned about but never actually removed.
    return checked_vars
  end
end
|
data/lib/pets/version.rb
CHANGED