pets 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +2 -0
  3. data/README.md +79 -5
  4. data/bin/coPatReporter.rb +68 -156
  5. data/bin/comPatMondo.rb +1 -4
  6. data/bin/evidence_profiler.rb +102 -150
  7. data/bin/get_gen_features.rb +146 -0
  8. data/bin/get_network_nodes.rb +79 -132
  9. data/bin/get_sorted_profs.rb +25 -36
  10. data/bin/install_deps.rb +8 -0
  11. data/bin/paco_translator.rb +29 -72
  12. data/bin/phen2reg.rb +1 -4
  13. data/bin/profiles2phenopacket.rb +86 -0
  14. data/bin/reg2phen.rb +1 -3
  15. data/example_datasets/associations_file.txt +757 -0
  16. data/example_datasets/example_patient.txt +6 -0
  17. data/example_datasets/example_patient_hpos.txt +15 -0
  18. data/example_datasets/genes.txt +8 -0
  19. data/example_datasets/hpo2ci.txt +2798 -0
  20. data/example_datasets/hummu_congenital_full_dataset.txt +4183 -0
  21. data/example_datasets/launch.sh +20 -0
  22. data/external_code/generate_boxpot.R +51 -21
  23. data/external_code/get_clusters.R +2 -2
  24. data/external_code/install_R_dependencies.R +16 -0
  25. data/external_code/plot_heatmap.R +34 -30
  26. data/lib/pets/coPatReporterMethods.rb +172 -424
  27. data/lib/pets/cohort.rb +309 -0
  28. data/lib/pets/common_optparse.rb +30 -0
  29. data/lib/pets/constants.rb +8 -0
  30. data/lib/pets/generalMethods.rb +29 -319
  31. data/lib/pets/genomic_features.rb +240 -0
  32. data/lib/pets/io.rb +481 -0
  33. data/lib/pets/parsers/cohort_parser.rb +111 -0
  34. data/lib/pets/parsers/reference_parser.rb +39 -0
  35. data/lib/pets/version.rb +1 -1
  36. data/lib/pets.rb +9 -0
  37. data/pets.gemspec +7 -3
  38. data/templates/cluster_report.erb +25 -5
  39. data/templates/cohort_report.erb +5 -7
  40. data/templates/evidence_profile.erb +20 -4
  41. data/templates/patient_report.erb +1 -1
  42. metadata +96 -5
@@ -1,166 +1,102 @@
1
1
  #! /usr/bin/env ruby
2
2
 
3
3
  ROOT_PATH = File.dirname(__FILE__)
4
- REPORT_FOLDER = File.expand_path(File.join(ROOT_PATH, '..', 'templates'))
5
- EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
6
- EXTERNAL_CODE = File.expand_path(File.join(ROOT_PATH, '..', 'external_code'))
7
- HPO_FILE = File.join(EXTERNAL_DATA, 'hp.json')
8
4
  $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
9
5
 
10
6
  require 'fileutils'
11
7
  require 'optparse'
12
8
  require 'report_html'
13
9
  require 'semtools'
14
- require 'generalMethods.rb'
15
-
16
-
17
- class Report_html
18
- def circular_genome(user_options = {}, &block)
19
- default_options = {}.merge!(user_options)
20
- coordinates = user_options[:genomic_coordinates]
21
- html_string = canvasXpress_main(default_options, block) do |options, config, samples, vars, values, object_id, x, z|
22
- config['graphType'] = 'Circular'
23
- config["arcSegmentsSeparation"] = 3
24
- config["colorScheme"] = "Tableau"
25
- config["colors"] = ["#332288","#6699CC","#88CCEE","#44AA99","#117733","#999933","#DDCC77","#661100","#CC6677","#AA4466","#882255","#AA4499"]
26
- config["showIdeogram"] = true
27
- chr = []
28
- pos = []
29
- tags2remove = []
30
- vars.each_with_index do |var, i|
31
- coord = coordinates[var]
32
- if !coord.nil?
33
- tag = coord.first.gsub(/[^\dXY]/,'')
34
- if tag == 'X' || tag == 'Y' || (tag.to_i > 0 && tag.to_i <= 22)
35
- chr << coord.first.gsub(/[^\dXY]/,'')
36
- pos << coord.last - 1
37
- else
38
- tags2remove << i
39
- end
40
- else
41
- tags2remove << i
42
- end
43
- end
44
- tags2remove.reverse_each{|i| ent = vars.delete_at(i); warn("Feature #{ent} has not valid coordinates")} # Remove entities with invalid coordinates
45
- z['chr'] = chr
46
- z['pos'] = pos
47
- end
48
- return html_string
49
- end
50
- end
10
+ require 'pets'
51
11
 
52
12
  #############################################################################################
53
13
  ## METHODS
54
14
  ############################################################################################
55
- def load_profiles(file_path, hpo)
56
- profiles = {}
57
- #count = 0
58
- File.open(file_path).each do |line|
59
- id, profile = line.chomp.split("\t")
60
- hpos = profile.split(',').map{|a| a.to_sym}
61
- hpos, rejected_hpos = hpo.check_ids(hpos)
62
- if !hpos.empty?
63
- hpos = hpo.clean_profile(hpos)
64
- profiles[id] = hpos if !hpos.empty?
65
- end
15
# Load a tab-separated table of genome features and their pathogenic scores.
#
# Each line is split on tabs: the first field is used as the feature id and
# the second one is converted with #to_f (a missing or non-numeric score
# therefore becomes 0.0). Blank lines are ignored.
#
# @param path [String] path of the scores file
# @return [Hash{String => Float}] feature id => pathogenic score
def load_pathogenic_scores(path)
  scores = {}
  # File.foreach closes the file once the iteration finishes, whereas
  # File.open(path).each kept the handle open until garbage collection.
  File.foreach(path) do |line|
    # chomp first so that a line with a single field does not keep its
    # trailing newline inside the feature id
    feature, score = line.chomp.split("\t")
    next if feature.nil? # blank line: nothing to record
    scores[feature] = score.to_f
  end
  return scores
end
69
23
 
70
- def load_variants(variant_folder)
71
- variants = {}
72
- coordinates = {}
73
- count = 0
74
- all_vars = {}
75
- Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
76
- profile_id = File.basename(path, '.tab')
77
- vars = {}
78
- File.open(path).each do |line|
79
- fields = line.chomp.split("\t")
80
- chr = fields[0]
81
- start = fields[1].to_i
82
- query = coordinates[chr]
83
- if query.nil?
84
- coordinates[chr] = [start]
85
- count += 1
86
- id = "var_#{count}"
87
- else
88
- if !query.include?(start)
89
- query << start
90
- count += 1
91
- id = "var_#{count}"
92
- else
93
- id = all_vars.key([chr, start])
94
- end
95
- end
96
- vars[id] = [chr, start]
97
- end
98
- all_vars.merge!(vars)
99
- variants[profile_id] = vars
100
- end
101
- return variants
24
# Select the genomic coordinates of the candidates of one entity type.
#
# @param entity [Object] key used to look up genomic_coordinates
# @param genomic_coordinates [Hash] entity => {id => coordinates}
# @param candidates_ids [#include?] candidate ids; every coordinate id is
#   converted with #to_sym before being tested against this collection
# @return [Hash, nil] the id => coordinates pairs whose id is a candidate, or
#   nil when there are no coordinates registered for entity
def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
  all_coordinates = genomic_coordinates[entity]
  return nil if all_coordinates.nil?
  all_coordinates.select { |id, _coordinates| candidates_ids.include?(id.to_sym) }
end
103
30
 
104
- def load_evidences(evidences_path, hpo)
105
- genomic_coordinates = {}
106
- coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
107
- coord_files.each do |cd_f|
108
- entity = File.basename(cd_f, '.coords')
109
- coordinates = load_coordinates(cd_f)
110
- genomic_coordinates[entity] = coordinates
111
- end
112
- evidences = {}
113
- evidence_files = Dir.glob(File.join(evidences_path, '*_HP.txt'))
114
- evidence_files.each do |e_f|
115
- pair = File.basename(e_f, '.txt')
116
- profiles, id2label = load_evidence_profiles(e_f, hpo)
117
- evidences[pair] = {prof: profiles, id2lab: id2label}
118
- end
119
- return evidences, genomic_coordinates
31
# Render the evidence report of one profile and write it as
# "<profile_id>.html" inside the output folder.
#
# Side effect: the first element of every entry of all_candidates is replaced
# in place by its string form before being handed to the template.
#
# @param profile_id [#to_s] identifier of the profile; also names the file
# @param all_candidates [Array<Array>] candidate records (mutated, see above)
# @param all_genomic_coordinates [Hash] id => coordinates; only the first two
#   items of each value are passed to the report
# @param similarity_matrixs [Object] passed through to the template
# @param evidences [Object] passed through to the template
# @param prof_vars [Object, nil] variants of the profile, formatted with
#   format_variants4report
# @param hotspots_with_pat_vars [Object] passed through as :hotspot_table
# @param template [Object] template given to Report_html#build
# @param output [String] folder where the report is written
def make_report(profile_id, all_candidates, all_genomic_coordinates, similarity_matrixs,
      evidences, prof_vars, hotspots_with_pat_vars, template, output)
  variant_ids, variant_coordinates = format_variants4report(prof_vars)
  all_candidates.each { |candidate| candidate[0] = candidate.first.to_s }
  trimmed_coordinates = all_genomic_coordinates.transform_values { |coord| coord.first(2) }
  report_data = {
    profile_id: profile_id,
    candidates: all_candidates,
    genomic_coordinates: trimmed_coordinates,
    similarity_matrixs: similarity_matrixs,
    evidences: evidences,
    var_ids: variant_ids,
    var_coordinates: variant_coordinates,
    hotspot_table: hotspots_with_pat_vars
  }
  html_report = Report_html.new(report_data, 'Evidence profile report')
  html_report.build(template)
  html_report.write(File.join(output, "#{profile_id}.html"))
end
121
48
 
122
- def load_coordinates(file_path)
123
- coordinates = {}
124
- header = true
125
- File.open(file_path).each do |line|
126
- fields = line.chomp.split("\t")
127
- if header
128
- header = false
129
- else
130
- entity, chr, strand, start, stop = fields
131
- coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
49
# Give the variants of a profile the structure expected by the report.
#
# Each (chr, reg) pair yielded by var_data.each gets a sequential id
# ("var_0", "var_1", ...).
#
# @param var_data [#each, nil] collection yielding a chromosome and a region
#   that answers to [:start]
# @return [Array(Array, Hash), Array(nil, nil)] the list of [var_id, 0] pairs
#   and the var_id => [chromosome as string, start] table, or two nils when
#   var_data is nil
#   NOTE(review): the meaning of the constant 0 paired with each id is not
#   visible here - presumably a placeholder value for the report; confirm.
def format_variants4report(var_data)
  return nil, nil if var_data.nil?
  var_ids = []
  var_coors = {}
  var_data.each do |chr, reg|
    # the number of ids already stored is the index of the current variant
    var_id = "var_#{var_ids.length}"
    var_ids << [var_id, 0]
    var_coors[var_id] = [chr.to_s, reg[:start]]
  end
  return var_ids, var_coors
end
136
65
 
137
- def load_evidence_profiles(file_path, hpo)
138
- profiles = {}
139
- id2label = {}
140
- #count = 0
141
- File.open(file_path).each do |line|
142
- id, label, profile = line.chomp.split("\t")
143
- hpos = profile.split(',').map{|a| a.to_sym}
144
- hpos, rejected_hpos = hpo.check_ids(hpos)
145
- if !hpos.empty?
146
- hpos = hpo.clean_profile(hpos)
147
- profiles[id] = hpos if !hpos.empty?
148
- id2label[id] = label
66
+
67
# Match the variants of a profile against the genomic hotspots built from the
# coordinates of the phenotype candidates.
#
# @param similarity_matrixs [Object] currently unused (see TODOs below)
# @param all_genomic_coordinates [Hash] id => coordinates; the first three
#   items of each value plus the id are used to build the genomic features
# @param prof_vars [Object, nil] variants of the profile, given to the
#   hotspots' #match
# @return [Array<Array>] one [hotspot_id, [chr, start, stop], candidates,
#   matched variants] entry per matched hotspot; empty when prof_vars is nil
def generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
  predictions = []
  return predictions if prof_vars.nil?
  candidate_regions = Genomic_Feature.hash2genomic_feature(all_genomic_coordinates) do |id, coordinates|
    coordinates[0..2].concat([id])
  end
  candidates_by_hotspot, genome_hotspots = candidate_regions.generate_cluster_regions(:reg_overlap, 'A', 0, true)
  matched_hotspots = genome_hotspots.match(prof_vars)
  # presumably turns a candidate => hotspots table into hotspot => candidates; verify
  candidates_of_hotspot = invert_hash(candidates_by_hotspot)
  matched_hotspots.each do |hotspot_id, pat_vars|
    region = genome_hotspots.region_by_to(hotspot_id)
    location = [region[:chr], region[:start], region[:stop]]
    predictions << [hotspot_id, location, candidates_of_hotspot[hotspot_id], pat_vars]
  end
  # TODO: see to use original similarities without use top candidates in similarity_matrixs
  # TODO: COMPLETE UNTIL FULL PREDICTOR
  return predictions
end
153
84
 
154
-
155
-
156
- def get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
157
- all_coordinates = genomic_coordinates[entity]
158
- coords = all_coordinates.select{|id, coordinates| candidates_ids.include?(id.to_sym)}
159
- return coords
85
# Invert a key => list-of-values hash into a value => list-of-keys hash.
#
# @param h [Hash{Object => Array}] hash whose values are collections
# @return [Hash{Object => Array}] every item found in the values mapped to
#   the keys it appeared under, in iteration order; keys whose collection is
#   empty do not appear in the result
def invert_hash(h)
  new_h = {}
  h.each do |k, vals|
    # create the list the first time a value is seen, then append the key
    vals.each { |v| (new_h[v] ||= []) << k }
  end
  return new_h
end
161
99
 
162
-
163
-
164
100
  #############################################################################################
165
101
  ## OPTPARSE
166
102
  ############################################################################################
@@ -199,6 +135,11 @@ OptionParser.new do |opts|
199
135
  options[:variant_data] = item
200
136
  end
201
137
 
138
+ options[:pathogenic_scores] = nil # TODO: Generalize to a folder with a table per patient
139
+ opts.on("-P", "--pathogenic_scores PATH", 'File with genome features an their pathogenic scores') do |item|
140
+ options[:pathogenic_scores] = item
141
+ end
142
+
202
143
  opts.on_tail("-h", "--help", "Show this message") do
203
144
  puts opts
204
145
  exit
@@ -211,12 +152,12 @@ end.parse!
211
152
  ############################################################################################
212
153
 
213
154
  hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE
214
- hpo = Ontology.new
215
- hpo.read(hpo_file)
155
+ hpo = Ontology.new(file: hpo_file, load_file: true)
216
156
 
217
157
  profiles = load_profiles(options[:profiles_file], hpo)
218
158
  profile_variants = options[:variant_data].nil? ? {} : load_variants(options[:variant_data])
219
159
  evidences, genomic_coordinates = load_evidences(options[:evidences], hpo)
160
+ pathogenic_scores = options[:pathogenic_scores].nil? ? {} : load_pathogenic_scores(options[:pathogenic_scores])
220
161
 
221
162
  hpo.load_profiles(profiles)
222
163
  evidences_similarity = {}
@@ -225,7 +166,8 @@ evidences.each do |pair, data|
225
166
  if profile_type == 'HP'
226
167
  evidence_profiles = data[:prof]
227
168
  evidence_profiles.transform_keys!{|prof_id, terms| prof_id.to_sym}
228
- evidences_similarity[pair] = hpo.compare_profiles(external_profiles: evidence_profiles, sim_type: :lin, bidirectional: false)
169
+ similarities = hpo.compare_profiles(external_profiles: evidence_profiles, sim_type: :lin, bidirectional: false)
170
+ evidences_similarity[pair] = similarities if !similarities.empty?
229
171
  end
230
172
  end
231
173
 
@@ -239,23 +181,33 @@ profiles.each do |profile_id, reference_prof|
239
181
  entity = pair.split('_').first
240
182
  similarities = ev_profiles_similarity[profile_id.to_sym]
241
183
  candidate_sim_matrix, candidates, candidates_ids = get_similarity_matrix(reference_prof, similarities, evidences[pair][:prof], hpo, 40, 40)
242
- candidate_sim_matrix.unshift(['HP'] + candidates_ids)
184
+ coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
185
+ candidate_sim_matrix.unshift(['HP'] + candidates_ids)
186
+ if !pathogenic_scores.empty? # priorize by pathogenic scores
187
+ candidate_sim_matrix_patho, candidates_patho, candidates_ids_patho = get_similarity_matrix(
188
+ reference_prof, similarities,
189
+ evidences[pair][:prof], hpo, 40, 40,
190
+ other_scores = pathogenic_scores, id2label = evidences[pair][:id2lab])
191
+ if !candidate_sim_matrix_patho.empty?
192
+ candidate_sim_matrix_patho.unshift(['HP'] + candidates_ids_patho)
193
+ similarity_matrixs[pair + '_path_vars'] = candidate_sim_matrix_patho
194
+ evidences[pair + '_path_vars'] = evidences[pair]
195
+ end
196
+ end
197
+ next if coords.nil?
243
198
  all_candidates.concat(candidates)
244
199
  similarity_matrixs[pair] = candidate_sim_matrix
245
- coords = get_evidence_coordinates(entity, genomic_coordinates, candidates_ids)
246
200
  all_genomic_coordinates.merge!(coords)
247
201
  end
248
202
  prof_vars = profile_variants[profile_id]
249
- container = {
250
- profile_id: profile_id,
251
- candidates: all_candidates.each{|c| c[0] = c.first.to_s},
252
- genomic_coordinates: all_genomic_coordinates.transform_values{|c| c.first(2) },
253
- similarity_matrixs: similarity_matrixs,
254
- evidences: evidences,
255
- var_ids: prof_vars.nil? ? nil : prof_vars.keys.map{|i| [i, 0]},
256
- var_coordinates: prof_vars
257
- }
258
- report = Report_html.new(container, 'Evidence profile report')
259
- report.build(template)
260
- report.write(File.join(options[:output_folder], profile_id.to_s + '.html'))
203
+ hotspots_with_pat_vars = generate_prediction(similarity_matrixs, all_genomic_coordinates, prof_vars)
204
+ make_report(
205
+ profile_id,
206
+ all_candidates,
207
+ all_genomic_coordinates,
208
+ similarity_matrixs,
209
+ evidences, prof_vars,
210
+ hotspots_with_pat_vars,
211
+ template, options[:output_folder]
212
+ )
261
213
  end
@@ -0,0 +1,146 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ ROOT_PATH = File.dirname(__FILE__)
4
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
5
+
6
+ require 'optparse'
7
+ require 'pets'
8
+
9
+ ##################################
10
+ ## METHODS
11
+ ##################################
12
+
13
# Read the genomic records described by the command line options.
#
# @param options [Hash] parsed options; forwarded to get_fields2extract and
#   read_records
# @return [Array<Array>] the records returned by read_records
def get_data(options)
  fields2extract = get_fields2extract(options)
  read_records(options, fields2extract, fields2extract.values)
end
18
+
19
# Read genomic records from a tab-separated file (modified from cohort_parser).
#
# When options[:header] is truthy the first line is taken as the header:
# every "#" followed by optional whitespace is removed from it, the column
# names are resolved to positions by get_field_numbers2extract (which updates
# fields2extract in place) and the positions to extract are re-read from
# fields2extract. Every other non-blank line yields one record.
#
# @param options [Hash] uses :input_file (path) and :header (boolean)
# @param fields2extract [Hash] field => column position, or column name when
#   a header is expected; :id_col decides whether ids are read or generated
# @param field_numbers [Array<Integer>] column positions to extract; replaced
#   after the header has been parsed
# @return [Array<Array>] one array per record: the extracted fields without
#   the id, with the items at positions 1 and 2 converted with #to_i,
#   followed by the record id
#   NOTE(review): assumes chromosome, start and end columns are all given, so
#   positions 1 and 2 hold start and end - confirm against callers.
def read_records(options, fields2extract, field_numbers) # Modified from cohort_parser
  records = []
  # File.foreach closes the file once the iteration finishes
  File.foreach(options[:input_file]).with_index do |line, count|
    line = line.chomp
    if options[:header] && count == 0
      field_names = line.gsub(/#\s*/, '').split("\t") # correct comment like headers
      get_field_numbers2extract(field_names, fields2extract)
      field_numbers = fields2extract.values
      next
    end
    next if line.empty? # a blank line would only produce a record without coordinates
    fields = line.split("\t")
    record = field_numbers.map { |n| fields[n] }
    # without an id column the id is generated from the 0-based line number
    id = fields2extract[:id_col].nil? ? "rec_#{count}" : record.shift
    record[1] = record[1].to_i
    record[2] = record[2].to_i
    record << id
    records << record
  end
  return records
end
46
+
47
# Collect the columns requested through the options.
#
# Only the fields whose option is not nil are kept, always in the order
# :id_col, :chromosome_col, :start_col, :end_col. With a header the values
# are left untouched (column names); without it they are converted to
# integer column positions.
#
# @param options [Hash] uses :header and the four *_col entries
# @return [Hash{Symbol => String, Integer}] field => column name or position
# @raise [ArgumentError] when the file has no header and a column is not an
#   integer (String#to_i would silently turn it into column 0)
def get_fields2extract(options)
  [:id_col, :chromosome_col, :start_col, :end_col].each_with_object({}) do |field, fields2extract|
    col = options[field]
    next if col.nil?
    unless options[:header]
      begin
        # base 10 is forced so that values such as "010" are not read as octal
        col = col.is_a?(String) ? Integer(col, 10) : Integer(col)
      rescue ArgumentError, TypeError
        raise ArgumentError, "--#{field} must be an integer column position when the file has no header (got #{col.inspect})"
      end
    end
    fields2extract[field] = col
  end
end
58
+
59
# Replace, in place, every column name stored in fields2extract by its
# position in the header.
#
# @param field_names [Array<String>] column names read from the header line
# @param fields2extract [Hash] field => column name; updated in place to
#   field => 0-based column position
# @return [Hash] fields2extract itself
# @raise [ArgumentError] when a requested column is not present in the
#   header (a nil position would otherwise fail later, while indexing the
#   fields of each record, with an unrelated TypeError)
def get_field_numbers2extract(field_names, fields2extract)
  missing = fields2extract.values.reject { |name| field_names.include?(name) }
  unless missing.empty?
    raise ArgumentError, "Columns not found in header: #{missing.join(', ')}"
  end
  fields2extract.each do |field, name|
    fields2extract[field] = field_names.index(name)
  end
end
64
+
65
+ ##########################
66
+ #OPT-PARSER
67
+ ##########################
68
+
69
# Command line options. Every key is initialised (nil, or true for :header)
# before ARGV is parsed, so the rest of the script can read them
# unconditionally.
options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:chromosome_col] = nil
  opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
    options[:chromosome_col] = data
  end

  options[:id_col] = nil
  opts.on("-d", "--id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the id") do |data|
    options[:id_col] = data
  end

  options[:end_col] = nil
  opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
    options[:end_col] = data
  end

  options[:header] = true
  #chr\tstart\tstop
  # Giving the flag DISABLES header handling; the help text previously
  # stated the opposite.
  opts.on("-H", "--header", "Set if the file has NO header line. By default the first line is read as header") do
    options[:header] = false
  end

  options[:input_file] = nil
  opts.on("-i", "--input_file PATH", "Input file path") do |data|
    options[:input_file] = data
  end

  options[:reference_file] = nil
  opts.on("-r", "--reference_file PATH", "Reference file with genome annotation") do |data|
    options[:reference_file] = data
  end

  options[:output_file] = nil
  opts.on("-o", "--output_file PATH", "Output file with patient data") do |data|
    options[:output_file] = data
  end

  options[:start_col] = nil
  opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
    options[:start_col] = data
  end

  options[:feature_type] = nil
  opts.on("-t", "--feature_type STRING", "Keep features from reference whose are tagged with this feature type") do |data|
    options[:feature_type] = data
  end

  options[:feature_name] = nil
  opts.on("-n", "--feature_name STRING", "Use this feature id that is present in attributes/annotation field of reference") do |data|
    options[:feature_name] = data
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end

end
option_parser.parse!
130
+
131
# Main: load the input regions, register the reference annotation and write
# one "region id<TAB>feature id" line per feature returned for each region.
regions = Genomic_Feature.new(get_data(options))
reference = Reference_parser.load(options[:reference_file], feature_type: options[:feature_type])
Genomic_Feature.add_reference(reference)
gene_features = regions.get_features(attr_type: options[:feature_name])

File.open(options[:output_file], 'w') do |output|
  gene_features.each do |region_id, feature_ids|
    feature_ids.each { |feature_id| output.puts "#{region_id}\t#{feature_id}" }
  end
end