pets 0.2.3 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
@@ -1,166 +1,102 @@
1
1
  #! /usr/bin/env ruby
2
2
 
3
3
  ROOT_PATH = File.dirname(__FILE__)
4
- REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
5
- EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
6
- EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
7
- HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
8
4
  $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
9
5
 
10
6
  require 'fileutils'
11
7
  require 'optparse'
12
8
  require 'report_html'
13
9
  require 'semtools'
14
- require 'generalMethods.rb'
15
-
16
-
17
- class Report_html
18
- def circular_genome(user_options = {}, &block)
19
- default_options = {}.merge!(user_options)
20
- coordinates = user_options[:genomic_coordinates]
21
- html_string = canvasXpress_main(default_options, block) do |options, config, samples, vars, values, object_id, x, z|
22
- config['graphType'] = 'Circular'
23
- config["arcSegmentsSeparation"] = 3
24
- config["colorScheme"] = "Tableau"
25
- config["colors"] = ["#332288","#6699CC","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#AA4466","#882255","#AA4499"]
26
- config["showIdeogram"] = true
27
- chr = []
28
- pos = []
29
- tags2remove = []
30
- vars.each_with_index do |var, i|
31
- coord = coordinates[var]
32
- if !coord.nil?
33
- tag = coord.first.gsub(/[^\dXY]/,'')
34
- if tag == 'X' || tag == 'Y' || (tag.to_i > 0 && tag.to_i <= 22)
35
- chr << coord.first.gsub(/[^\dXY]/,'')
36
- pos << coord.last - 1
37
- else
38
- tags2remove << i
39
- end
40
- else
41
- tags2remove << i
42
- end
43
- end
44
- tags2remove.reverse_each{|i| ent = vars.delete_at(i); warn("Feature #{ent} has not valid coordinates")} # Remove entities with invalid coordinates
45
- z['chr'] = chr
46
- z['pos'] = pos
47
- end
48
- return html_string
49
- end
50
- end
10
+ require 'pets'
51
11
 
52
12
  #############################################################################################
53
13
  ## METHODS
54
14
  ############################################################################################
55
- def load_profiles(file_path, hpo)
56
- profiles = {}
57
- #count = 0
58
- File.open(file_path).each do |line|
59
- id, profile = line.chomp.split("\t")
60
- hpos = profile.split(',').map{|a| a.to_sym}
61
- hpos, rejected_hpos = hpo.check_ids(hpos)
62
- if !hpos.empty?
63
- hpos = hpo.clean_profile(hpos)
64
- profiles[id] = hpos if !hpos.empty?
65
- end
15
+ def load_pathogenic_scores(path)
16
+ scores = {}
17
+ File.open(path).each do |line|
18
+ feature, score = line.split("\t")
19
+ scores[feature] = score.to_f
66
20
  end
67
- return profiles
21
+ return scores
68
22
  end
69
23
 
70
- def load_variants(variant_folder)
71
- variants = {}
72
- coordinates = {}
73
- count = 0
74
- all_vars = {}
75
- Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
76
- profile_id = File.basename(path, '.tab')
77
- vars = {}
78
- File.open(path).each do |line|
79
- fields = line.chomp.split("\t")
80
- chr = fields[0]
81
- start = fields[1].to_i
82
- query = coordinates[chr]
83
- if query.nil?
84
- coordinates[chr] = [start]
85
- count += 1
86
- id = "var_#{count}"
87
- else
88
- if !query.include?(start)
89
- query << start
90
- count += 1
91
- id = "var_#{count}"
92
- else
93
- id = all_vars.key([chr, start])
94
- end
95
- end
96
- vars[id] = [chr, start]
97
- end
98
- all_vars.merge!(vars)
99
- variants[profile_id] = vars
100
- end
101
- return variants
24
+ def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
25
+ coords = nil
26
+ all_coordinates = genomic_coordinates[entity]
27
+ coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)} if !all_coordinates.nil?
28
+ return coords
102
29
  end
103
30
 
104
- def load_evidences(evidences_path, hpo)
105
- genomic_coordinates = {}
106
- coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
107
- coord_files.each do |cd_f|
108
- entity = File.basename(cd_f, '.coords')
109
- coordinates = load_coordinates(cd_f)
110
- genomic_coordinates[entity] = coordinates
111
- end
112
- evidences = {}
113
- evidence_files = Dir.glob(File.join(evidences_path, '*_HP.txt'))
114
- evidence_files.each do |e_f|
115
- pair = File.basename(e_f, '.txt')
116
- profiles, id2label = load_evidence_profiles(e_f, hpo)
117
- evidences[pair] = {prof: profiles, id2lab: id2label}
118
- end
119
- return evidences, genomic_coordinates
31
+ def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs,
32
+ evidences, prof_vars, hotspots_with_pat_vars, template, output)
33
+ var_ids, var_coors = format_variants4report(prof_vars)
34
+ container = {
35
+ profile_id: profile_id,
36
+ candidates: all_candidates.each{|c| c[0] = c.first.to_s},
37
+ genomic_coordinates: all_genomic_coordinates.transform_values{|c| c.first(2) },
38
+ similarity_matrixs: similarity_matrixs,
39
+ evidences: evidences,
40
+ var_ids: var_ids,
41
+ var_coordinates: var_coors,
42
+ hotspot_table: hotspots_with_pat_vars
43
+ }
44
+ report = Report_html.new(container, 'Evidence profile report')
45
+ report.build(template)
46
+ report.write(File.join(output, profile_id.to_s + '.html'))
120
47
  end
121
48
 
122
- def load_coordinates(file_path)
123
- coordinates = {}
124
- header = true
125
- File.open(file_path).each do |line|
126
- fields = line.chomp.split("\t")
127
- if header
128
- header = false
129
- else
130
- entity, chr, strand, start, stop = fields
131
- coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
49
+ def format_variants4report(var_data)
50
+ if var_data.nil?
51
+ var_ids, var_coors = nil
52
+ else
53
+ var_ids = []
54
+ var_coors = {}
55
+ count = 0
56
+ var_data.each do |chr, reg|
57
+ var_id = "var_#{count}"
58
+ var_ids << [var_id, 0]
59
+ var_coors[var_id] = [chr.to_s, reg[:start]]
60
+ count += 1
132
61
  end
133
62
  end
134
- return coordinates
63
+ return var_ids, var_coors
135
64
  end
136
65
 
137
- def load_evidence_profiles(file_path, hpo)
138
- profiles = {}
139
- id2label = {}
140
- #count = 0
141
- File.open(file_path).each do |line|
142
- id, label, profile = line.chomp.split("\t")
143
- hpos = profile.split(',').map{|a| a.to_sym}
144
- hpos, rejected_hpos = hpo.check_ids(hpos)
145
- if !hpos.empty?
146
- hpos = hpo.clean_profile(hpos)
147
- profiles[id] = hpos if !hpos.empty?
148
- id2label[id] = label
66
+
67
+ def generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
68
+ hotspots_with_pat_vars = []
69
+ if !prof_vars.nil?
70
+ phen_regions = Genomic_Feature.hash2genomic_feature(all_genomic_coordinates){|k, v| v[0..2].concat([k])}
71
+ phen_candidates_by_hotspot, phen_genome_hotspots = phen_regions.generate_cluster_regions(:reg_overlap, 'A', 0, true)
72
+ genome_matches = phen_genome_hotspots.match(prof_vars)
73
+ hotspot_with_phen_candidates = invert_hash(phen_candidates_by_hotspot)
74
+ genome_matches.each do |hotspot_id, pat_vars|
75
+ reg = phen_genome_hotspots.region_by_to(hotspot_id)
76
+ coords = [reg[:chr], reg[:start], reg[:stop]]
77
+ hotspots_with_pat_vars << [hotspot_id, coords, hotspot_with_phen_candidates[hotspot_id], pat_vars]
149
78
  end
79
+ # TODO: consider using the original similarities instead of only the top candidates in similarity_matrixs
80
+ # TODO: COMPLETE UNTIL FULL PREDICTOR
150
81
  end
151
- return profiles, id2label
82
+ return hotspots_with_pat_vars
152
83
  end
153
84
 
154
-
155
-
156
- def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
157
- all_coordinates = genomic_coordinates[entity]
158
- coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)}
159
- return coords
85
+ def invert_hash(h)
86
+ new_h = {}
87
+ h.each do |k, vals|
88
+ vals.each do |v|
89
+ query = new_h[v]
90
+ if query.nil?
91
+ new_h[v] = [k]
92
+ else
93
+ query << k
94
+ end
95
+ end
96
+ end
97
+ return new_h
160
98
  end
161
99
 
162
-
163
-
164
100
  #############################################################################################
165
101
  ## OPTPARSE
166
102
  ############################################################################################
@@ -199,6 +135,11 @@ OptionParser.new do |opts|
199
135
  options[:variant_data] = item
200
136
  end
201
137
 
138
+ options[:pathogenic_scores] = nil # TODO: Generalize to a folder with a table per patient
139
+ opts.on("-P", "--pathogenic_scores PATH", 'File with genome features an their pathogenic scores') do |item|
140
+ options[:pathogenic_scores] = item
141
+ end
142
+
202
143
  opts.on_tail("-h", "--help", "Show this message") do
203
144
  puts opts
204
145
  exit
@@ -211,12 +152,12 @@ end.parse!
211
152
  ############################################################################################
212
153
 
213
154
  hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
214
- hpo = Ontology.new
215
- hpo.read(hpo_file)
155
+ hpo = Ontology.new(file: hpo_file, load_file: true)
216
156
 
217
157
  profiles = load_profiles(options[:profiles_file], hpo)
218
158
  profile_variants = options[:variant_data].nil? ? {} : load_variants(options[:variant_data])
219
159
  evidences, genomic_coordinates = load_evidences(options[:evidences], hpo)
160
+ pathogenic_scores = options[:pathogenic_scores].nil? ? {} : load_pathogenic_scores(options[:pathogenic_scores])
220
161
 
221
162
  hpo.load_profiles(profiles)
222
163
  evidences_similarity = {}
@@ -225,7 +166,8 @@ evidences.each do |pair, data|
225
166
  if profile_type == 'HP'
226
167
  evidence_profiles = data[:prof]
227
168
  evidence_profiles.transform_keys!{|prof_id, terms| prof_id.to_sym}
228
- evidences_similarity[pair] = hpo.compare_profiles(external_profiles: evidence_profiles, sim_type: :lin, bidirectional: false)
169
+ similarities = hpo.compare_profiles(external_profiles: evidence_profiles, sim_type: :lin, bidirectional: false)
170
+ evidences_similarity[pair] = similarities if !similarities.empty?
229
171
  end
230
172
  end
231
173
 
@@ -239,23 +181,33 @@ profiles.each do |profile_id, reference_prof|
239
181
  entity = pair.split('_').first
240
182
  similarities = ev_profiles_similarity[profile_id.to_sym]
241
183
  candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(reference_prof, similarities, evidences[pair][:prof], hpo, 40, 40)
242
- candidate_sim_matrix.unshift(['HP'] + candidates_ids)
184
+ coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
185
+ candidate_sim_matrix.unshift(['HP'] + candidates_ids)
186
+ if !pathogenic_scores.empty? # prioritize by pathogenic scores
187
+ candidate_sim_matrix_patho, candidates_patho, candidates_ids_patho = get_similarity_matrix(
188
+ reference_prof, similarities,
189
+ evidences[pair][:prof], hpo, 40, 40,
190
+ other_scores = pathogenic_scores, id2label = evidences[pair][:id2lab])
191
+ if !candidate_sim_matrix_patho.empty?
192
+ candidate_sim_matrix_patho.unshift(['HP'] + candidates_ids_patho)
193
+ similarity_matrixs[pair + '_path_vars'] = candidate_sim_matrix_patho
194
+ evidences[pair + '_path_vars'] = evidences[pair]
195
+ end
196
+ end
197
+ next if coords.nil?
243
198
  all_candidates.concat(candidates)
244
199
  similarity_matrixs[pair] = candidate_sim_matrix
245
- coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
246
200
  all_genomic_coordinates.merge!(coords)
247
201
  end
248
202
  prof_vars = profile_variants[profile_id]
249
- container = {
250
- profile_id: profile_id,
251
- candidates: all_candidates.each{|c| c[0] = c.first.to_s},
252
- genomic_coordinates: all_genomic_coordinates.transform_values{|c| c.first(2) },
253
- similarity_matrixs: similarity_matrixs,
254
- evidences: evidences,
255
- var_ids: prof_vars.nil? ? nil : prof_vars.keys.map{|i| [i, 0]},
256
- var_coordinates: prof_vars
257
- }
258
- report = Report_html.new(container, 'Evidence profile report')
259
- report.build(template)
260
- report.write(File.join(options[:output_folder], profile_id.to_s + '.html'))
203
+ hotspots_with_pat_vars = generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
204
+ make_report(
205
+ profile_id,
206
+ all_candidates,
207
+ all_genomic_coordinates,
208
+ similarity_matrixs,
209
+ evidences, prof_vars,
210
+ hotspots_with_pat_vars,
211
+ template, options[:output_folder]
212
+ )
261
213
  end
@@ -0,0 +1,146 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ ROOT_PATH = File.dirname(__FILE__)
4
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
5
+
6
+ require 'optparse'
7
+ require 'pets'
8
+
9
+ ##################################
10
+ ## METHODS
11
+ ##################################
12
+
13
+ def get_data(options)
14
+ fields2extract = get_fields2extract(options)
15
+ field_numbers = fields2extract.values
16
+ records = read_records(options, fields2extract, field_numbers)
17
+ end
18
+
19
+ def read_records(options, fields2extract, field_numbers) # Modified from cohort_parser
20
+ records = []
21
+ count = 0
22
+ File.open(options[:input_file]).each do |line|
23
+ line.chomp!
24
+ if options[:header] && count == 0
25
+ line.gsub!(/#\s*/,'') # correct comment like headers
26
+ field_names = line.split("\t")
27
+ get_field_numbers2extract(field_names, fields2extract)
28
+ field_numbers = fields2extract.values
29
+ else
30
+ fields = line.split("\t")
31
+ record = field_numbers.map{|n| fields[n]}
32
+ if fields2extract[:id_col].nil?
33
+ id = "rec_#{count}" #generate ids
34
+ else
35
+ id = record.shift
36
+ end
37
+ record[1] = record[1].to_i
38
+ record[2] = record[2].to_i
39
+ record << id
40
+ records << record
41
+ end
42
+ count +=1
43
+ end
44
+ return records
45
+ end
46
+
47
+ def get_fields2extract(options)
48
+ fields2extract = {}
49
+ [:id_col, :chromosome_col, :start_col, :end_col].each do |field|
50
+ col = options[field]
51
+ if !col.nil?
52
+ col = col.to_i if !options[:header]
53
+ fields2extract[field] = col
54
+ end
55
+ end
56
+ return fields2extract
57
+ end
58
+
59
+ def get_field_numbers2extract(field_names, fields2extract)
60
+ fields2extract.each do |field, name|
61
+ fields2extract[field] = field_names.index(name)
62
+ end
63
+ end
64
+
65
+ ##########################
66
+ #OPT-PARSER
67
+ ##########################
68
+
69
+ options = {}
70
+ OptionParser.new do |opts|
71
+ opts.banner = "Usage: #{__FILE__} [options]"
72
+
73
+ options[:chromosome_col] = nil
74
+ opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
75
+ options[:chromosome_col] = data
76
+ end
77
+
78
+ options[:id_col] = nil
79
+ opts.on("-d", "--id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the id") do |data|
80
+ options[:id_col] = data
81
+ end
82
+
83
+ options[:end_col] = nil
84
+ opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
85
+ options[:end_col] = data
86
+ end
87
+
88
+ options[:header] = true
89
+ #chr\tstart\tstop
90
+ opts.on("-H", "--header", "Set if the file has a line header. Default true") do
91
+ options[:header] = false
92
+ end
93
+
94
+ options[:input_file] = nil
95
+ opts.on("-i", "--input_file PATH", "Input file path") do |data|
96
+ options[:input_file] = data
97
+ end
98
+
99
+ options[:reference_file] = nil
100
+ opts.on("-r", "--reference_file PATH", "Reference file with genome annotation") do |data|
101
+ options[:reference_file] = data
102
+ end
103
+
104
+ options[:output_file] = nil
105
+ opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
106
+ options[:output_file] = data
107
+ end
108
+
109
+ options[:start_col] = nil
110
+ opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
111
+ options[:start_col] = data
112
+ end
113
+
114
+ options[:feature_type] = nil
115
+ opts.on("-t", "--feature_type STRING", "Keep features from reference whose are tagged with this feature type") do |data|
116
+ options[:feature_type] = data
117
+ end
118
+
119
+ options[:feature_name] = nil
120
+ opts.on("-n", "--feature_name STRING", "Use this feature id that is present in attributes/annotation field of reference") do |data|
121
+ options[:feature_name] = data
122
+ end
123
+
124
+ opts.on_tail("-h", "--help", "Show this message") do
125
+ puts opts
126
+ exit
127
+ end
128
+
129
+ end.parse!
130
+
131
+ regions = Genomic_Feature.new(get_data(options))
132
+ Genomic_Feature.add_reference(
133
+ Reference_parser.load(
134
+ options[:reference_file],
135
+ feature_type: options[:feature_type]
136
+ )
137
+ )
138
+ gene_features = regions.get_features(attr_type: options[:feature_name])
139
+
140
+ File.open(options[:output_file], 'w') do |f|
141
+ gene_features.each do |id, feat_ids|
142
+ feat_ids.each do |ft_id|
143
+ f.puts "#{id}\t#{ft_id}"
144
+ end
145
+ end
146
+ end