pets 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
@@ -0,0 +1,297 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ ##########################
4
+ #LIBRARIES
5
+ ##########################
6
+ require 'optparse'
7
+
8
+ ##########################
9
+ #METHODS
10
+ ##########################
11
# Script-wide accumulators. They live on the top-level object so that every
# method defined below can read and update them.
@success_percentage_distribution = [] # one value appended per process_in_out_regions call
@prediction_vector = []               # true/false per prediction; flushed by compute_rankings
@rankings = { in: [], out: [] }       # percentile positions of hits (:in) and misses (:out)
@genome_fraction_predicted = 0        # summed length of "out" segments for the current profile
@good_predicted_subregions = 0        # summed length of "in" segments for the current profile
16
+
17
# Flushes @prediction_vector into @rankings.
#
# Each prediction's 1-based position is expressed as a percentage of the
# number of predictions in the vector and appended to @rankings[:in] when the
# prediction was flagged true, or to @rankings[:out] otherwise. The vector is
# then emptied. Does nothing (and returns nil) when the vector is empty.
def compute_rankings
  return if @prediction_vector.empty?

  percent_unit = @prediction_vector.length.fdiv(100)
  @prediction_vector.each.with_index(1) do |hit, position|
    bucket = hit ? :in : :out
    @rankings[bucket] << position.fdiv(percent_unit)
  end
  @prediction_vector = []
end
31
+
32
# Loads a tab-separated prediction file.
#
# Columns read from each row:
#   [0] profile id, written as "ProfID:<n>"
#   [1] chromosome, or the literal 'Results not found'
#   [2] start, [3] stop (converted with to_i)
#   [4] comma-separated list whose item count becomes predicted_hpos_number
#   [6] score (converted with to_f); column [5] is not used
#
# input_file - path of the prediction file.
#
# Returns an array with one entry per non-blank row:
#   [profile_index, pred_chr, pred_start, pred_stop, score, predicted_hpos_number]
# or just [profile_index] for rows marked 'Results not found'.
def load_prediction_file(input_file)
  predicted_regions = []
  # File.foreach closes the file when iteration ends (File.open(...).each left it open).
  File.foreach(input_file) do |line|
    predicted_info = line.chomp.split("\t")
    next if predicted_info.empty? # blank line: nothing to parse

    profile_index = predicted_info[0].gsub('ProfID:', '').to_i
    if predicted_info[1] != 'Results not found'
      predicted_hpos_number = predicted_info[4].split(',').length
      predicted_regions << [
        profile_index,
        predicted_info[1],
        predicted_info[2].to_i,
        predicted_info[3].to_i,
        predicted_info[6].to_f,
        predicted_hpos_number
      ]
    else
      predicted_regions << [profile_index]
    end
  end
  return predicted_regions
end
47
+
48
# Derives the score used for imputation from the best-scored predictions.
#
# Only entries whose score field (index 4) is a Float are considered. They are
# ordered by score (ascending, or descending when integration_method is
# 'fisher') and consumed until the accumulated region length reaches one
# thousandth (integer division) of the total length of all scored regions.
#
# predicted_regions  - entries as produced by load_prediction_file.
# integration_method - String; only 'fisher' changes the ordering.
#
# Returns the length-weighted mean score of the consumed regions.
def get_imputation_scores(predicted_regions, integration_method)
  scored = predicted_regions.select { |region| region[4].is_a?(Float) }
  score_length_pairs = scored.map { |region| [region[4], region[3] - region[2]] }
  ordered = score_length_pairs.sort { |a, b| a.first <=> b.first }
  ordered = ordered.reverse if integration_method == 'fisher'

  total_length = ordered.map(&:last).reduce(:+)
  length_budget = total_length / 1000

  weighted_sum = 0
  covered_length = 0
  ordered.each do |score, length|
    weighted_sum += score * length
    covered_length += length
    break if covered_length >= length_budget
  end
  weighted_sum.fdiv(covered_length)
end
67
+
68
# Perturbs a score by a random amount.
#
# The offset is imputation_score * desv * rand, and a random coin flip decides
# whether it is subtracted from or added to imputation_score.
#
# Returns the perturbed score.
def generate_random_imp_score(imputation_score, desv)
  offset = imputation_score * desv * rand
  subtract = [true, false].sample
  subtract ? imputation_score - offset : imputation_score + offset
end
77
+
78
# Loads the control (true affected region) file.
#
# Each row is "<chr>:<start>:<stop>\t<phenotype>|<phenotype>|...".
# Rows are kept in file order; get_perfomance_table looks entries up by
# position (ctrl_regions[profile_index]), so no row is skipped here.
#
# input_data_file - path of the control file.
#
# Returns an array of [chr, start, stop, number_of_phenotypes], with start and
# stop converted via to_i.
def load_patient_data(input_data_file)
  patient_data = []
  # File.foreach closes the file when iteration ends (File.open(...).each left it open).
  File.foreach(input_data_file) do |line|
    mutation_coords, hpo_profile = line.chomp.split("\t")
    number_of_phenotypes = hpo_profile.split('|').length
    chr, start_pos, stop_pos = mutation_coords.split(':')
    patient_data << [chr, start_pos.to_i, stop_pos.to_i, number_of_phenotypes]
  end
  return patient_data
end
89
+
90
# Builds the group/score table used for the precision-recall curve.
#
# Predictions are walked in order. A prediction is kept only when the share of
# recovered phenotypes (predicted_hpos_number / number_of_phenotypes * 100) is
# strictly greater than hpo_min_recovery. Whenever the profile index differs
# from the last profile that had a kept prediction, the regions accumulated so
# far are turned into table rows, the per-profile counters are reset and the
# rankings are flushed.
#
# ctrl_regions      - array indexed by profile index:
#                     [chr, start, stop, number_of_phenotypes].
# predicted_regions - entries as produced by load_prediction_file.
# scale, imputation_score, apply_imputation - forwarded to process_in_out_regions.
# hpo_min_recovery  - minimum recovery percentage (exclusive).
#
# Returns an array of [group, score] rows.
def get_perfomance_table(ctrl_regions, predicted_regions, scale, imputation_score, hpo_min_recovery, apply_imputation)
  table = []
  # ctrl_start/ctrl_stop must outlive the block: they describe the profile
  # that is flushed on the next profile change and after the loop.
  last_profile_id = ctrl_start = ctrl_stop = nil
  in_out_regions = []
  predicted_regions.each do |profile_index, pred_chr, pred_start, pred_stop, score, predicted_hpos_number|
    if last_profile_id != profile_index && !last_profile_id.nil?
      table.concat(process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation))
      @genome_fraction_predicted = 0
      @good_predicted_subregions = 0
      in_out_regions = []
      compute_rankings
    end
    ctrl_chr, ctrl_start, ctrl_stop, number_of_phenotypes = ctrl_regions[profile_index]
    next if predicted_hpos_number.nil? || number_of_phenotypes.nil?

    hpo_recovery_percentage = (predicted_hpos_number / number_of_phenotypes.to_f) * 100
    if hpo_recovery_percentage > hpo_min_recovery
      in_out_regions.concat(get_in_out_regions(ctrl_chr, ctrl_start, ctrl_stop, pred_chr, pred_start, pred_stop, score))
      last_profile_id = profile_index
    end
  end
  table.concat(process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation))
  # The final profile has no following "profile change" to trigger the flush,
  # so its rankings must be computed here or they are never recorded.
  compute_rankings
  return table
end
115
+
116
# Converts the in/out segments of one profile into scaled table rows.
#
# Side effects: appends get_sucess_percentage(in_out_regions) to
# @success_percentage_distribution (before any imputed segment is added) and,
# when imputation applies, appends an imputed "in" segment to the caller's
# in_out_regions array.
#
# ctrl_start, ctrl_stop - limits of the control region.
# scale                 - number of table rows a full-length segment would get.
# in_out_regions        - array of [group, score, region_length].
# imputation_score      - base score for the imputed segment.
# apply_imputation      - whether the part of the control region not covered
#                         by "in" segments gets an imputed "in" segment.
#
# Returns an array of [group, score] rows; each segment contributes
# ceil(region_length / evaluated_length * scale) copies.
def process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation)
  @success_percentage_distribution << get_sucess_percentage(in_out_regions)

  ctrl_length = ctrl_stop - ctrl_start
  uncovered_length = ctrl_length - @good_predicted_subregions
  if uncovered_length > 0 && apply_imputation
    in_out_regions << ["in", generate_random_imp_score(imputation_score, 0.35), uncovered_length]
  end

  evaluated_length = @genome_fraction_predicted + ctrl_length
  in_out_regions.flat_map do |group, score, region_length|
    copies = (region_length.fdiv(evaluated_length) * scale).ceil
    Array.new(copies, [group, score])
  end
end
140
+
141
# Splits one predicted region into the part that overlaps the control region
# ("in") and the parts that do not ("out").
#
# Side effects: adds "in" lengths to @good_predicted_subregions, "out" lengths
# to @genome_fraction_predicted, and appends true to @prediction_vector when
# at least one "in" segment was produced (false otherwise).
#
# Returns an array of [group, score, region_length]; empty when pred_chr is
# nil and differs from ctrl_chr.
def get_in_out_regions(ctrl_chr, ctrl_start, ctrl_stop, pred_chr, pred_start, pred_stop, score)
  segments =
    if ctrl_chr == pred_chr
      if pred_start < ctrl_start && pred_stop > ctrl_stop
        # prediction spans beyond both ends of the control region
        [['in', ctrl_stop - ctrl_start], ['out', ctrl_start - pred_start], ['out', pred_stop - ctrl_stop]]
      elsif pred_start >= ctrl_start && pred_stop <= ctrl_stop
        # prediction lies inside the control region
        [['in', pred_stop - pred_start]]
      elsif ctrl_start < pred_stop && ctrl_stop >= pred_stop
        # prediction starts before the control region and ends inside it
        [['in', pred_stop - ctrl_start], ['out', ctrl_start - pred_start]]
      elsif ctrl_start <= pred_start && ctrl_stop > pred_start
        # prediction starts inside the control region and ends after it
        [['in', ctrl_stop - pred_start], ['out', pred_stop - ctrl_stop]]
      else
        # same chromosome, no overlap
        [['out', pred_stop - pred_start]]
      end
    elsif !pred_chr.nil?
      # different chromosome
      [['out', pred_stop - pred_start]]
    else
      []
    end

  segments.each do |group, region_length|
    if group == 'in'
      @good_predicted_subregions += region_length
    else
      @genome_fraction_predicted += region_length
    end
  end
  @prediction_vector << segments.any? { |group, _length| group == 'in' }

  segments.map { |group, region_length| [group, score, region_length] }
end
189
+
190
# Computes which fraction of the total segment length belongs to "in" segments.
#
# in_out_regions - array of [group, score, region_length].
#
# Returns the Integer 0 for an empty array, otherwise in_length / total_length
# as a Float (a fraction, not multiplied by 100).
def get_sucess_percentage(in_out_regions)
  return 0 if in_out_regions.empty?

  flagged_lengths = in_out_regions.map { |group, _score, reg_length| [group == 'in', reg_length] }
  in_length = flagged_lengths.select(&:first).reduce(0) { |acc, (_hit, len)| acc + len }
  total_length = flagged_lengths.reduce(0) { |acc, (_hit, len)| acc + len }
  in_length.fdiv(total_length)
end
203
+
204
+
205
+ ##########################
206
+ #OPT-PARSE
207
+ ##########################
208
+
209
# Defaults for every command-line option; the parser below overrides them.
options = {
  input_prediction: nil,
  meth: nil,
  output_file: 'final_values_for_pr_curve.txt',
  hpo_recovery: 0,
  input_regions: nil,
  success_percentage: 'success_percentage',
  apply_imputation: false,
  scale_size: 100
}

option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-i", "--input_prediction PATH", "Input prediction file for checking") do |path|
    options[:input_prediction] = path
  end

  opts.on("-m", "--meth STRING", "Method used in score integration calculation, affects to the imputation algorithm (if used)") do |name|
    options[:meth] = name
  end

  opts.on("-o", "--output_file PATH", "Output results for PR curve") do |path|
    options[:output_file] = path
  end

  # The value is stored as a Float; zero or negative values stop the script.
  opts.on("-p", "--hpo_recovery INTEGER", "Minimum percentage of HPO terms to consider predictions") do |value|
    options[:hpo_recovery] = value.to_f
    abort("Please, choose a recovery value higher than 0") if options[:hpo_recovery] <= 0
  end

  opts.on("-r", "--input_regions PATH", "Input patients true affected regions (ctrl file)") do |path|
    options[:input_regions] = path
  end

  opts.on("-s", "--success_percentage PATH", "Output results with success percentage for each prediction") do |path|
    options[:success_percentage] = path
  end

  opts.on("-y", "--apply_imputation", "Activates imputation") do
    options[:apply_imputation] = true
  end

  # The value is stored as an Integer; zero or negative values stop the script.
  opts.on("-z", "--scale_size INTEGER", "Scale region size to avoid long range regions") do |value|
    options[:scale_size] = value.to_i
    abort("Please, choose a scale value higher than 0") if options[:scale_size] <= 0
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
option_parser.parse!
261
+
262
+ ##########################
263
+ #MAIN
264
+ ##########################
265
+
266
# Both input files are mandatory; stop with a readable message instead of
# letting File I/O fail on a nil path.
abort("Please, provide an input prediction file (-i)") if options[:input_prediction].nil?
abort("Please, provide the patients true affected regions file (-r)") if options[:input_regions].nil?

predicted_regions = load_prediction_file(options[:input_prediction])
imputation_score = get_imputation_scores(predicted_regions, options[:meth])
patient_data = load_patient_data(options[:input_regions])

table = get_perfomance_table(patient_data, predicted_regions, options[:scale_size], imputation_score, options[:hpo_recovery], options[:apply_imputation])

# Group/score rows for the precision-recall curve.
File.open(options[:output_file], 'w') do |f|
  f.puts "group\tscore"
  table.each do |output, score|
    if score.nil?
      # Rows without a score are dropped unless imputation is active, in
      # which case they receive a randomised imputed score.
      next unless options[:apply_imputation]

      score = generate_random_imp_score(imputation_score, 0.35)
    end
    f.puts "#{output}\t#{score}"
  end
end

# One success value per processed profile.
File.open(options[:success_percentage], 'w') do |f|
  f.puts 'perc'
  @success_percentage_distribution.each { |pg| f.puts pg }
end

# Percentile ranking of every prediction; written to a fixed file name in the
# current working directory.
File.open('ranking', 'w') do |f|
  f.puts "reg\tranking"
  @rankings.each do |reg, ranks|
    ranks.each { |rank| f.puts "#{reg.to_s}\t#{rank}" }
  end
end
297
+
data/bin/plot_area.R ADDED
@@ -0,0 +1,71 @@
1
+ #! /usr/bin/env Rscript
2
+ # x,y graph
3
+
4
+ library(ggplot2)
5
+ library(optparse)
6
+
7
+ ################################################################
8
+ # OPTPARSE
9
+ ################################################################
10
# Command-line options. Only data_file, output, header, x_title, y_title,
# output_format, maxs_file and graph_title are read by the code below;
# x_values, y_values and density_values are accepted but not used.
option_list <- list(
  make_option(c("-d", "--data_file"), type="character",
    help="Tabulated file with information about each sample"),
  make_option(c("-o", "--output"), type="character", default="results",
    help="Output figure file"),
  make_option(c("-x", "--x_values"), type="character",
    help="Name of column with values to be plotted"),
  make_option(c("-y", "--y_values"), type="character",
    help="Name of column with values to be plotted"),
  make_option(c("-f", "--density_values"), type="character",
    help="Name of column to be used as density values"),
  make_option(c("-H", "--header"), action="store_false", default=TRUE,
    help="The input table not have header line"),
  make_option(c("-X", "--x_title"), type="character",
    help="Name of column to be used for bars titles"),
  make_option(c("-Y", "--y_title"), type="character",
    help="Title of y axis"),
  make_option(c("-F", "--output_format"), type="character", default="pdf",
    help="pdf or jpeg file output format"),
  make_option(c("-m", "--maxs_file"), type="character", default="",
    help="Tabulated file maximum of each sample"),
  make_option(c("-t", "--graph_title"), type="character", default="",
    help="Title of the graph")
)
opt <- parse_args(OptionParser(option_list=option_list))


################################################################
## MAIN
################################################################

data <- read.table(opt$data_file, sep="\t", header=opt$header)

# Open the graphics device; any other format value opens no device here.
if (opt$output_format == "pdf"){
  pdf(paste(opt$output, '.pdf', sep=""))
}else if(opt$output_format == "jpeg"){
  jpeg(paste(opt$output, '.jpeg', sep=""))
}

# NOTE(review): V1/V2/V3 are read.table's default names for header-less
# input; with a header row the file must name its columns V1, V2 and V3.
# V1 is ordered as chromosomes 1..22, X, Y; values outside that set become NA.
goodChrOrder <- c(1:22,"X","Y")
data$V1 <- factor(data$V1,levels=goodChrOrder)

# Optional table whose V2 column gives the positions of vertical marker lines.
maxs <- c()
if(opt$maxs_file != ""){
  maxs <- read.table(opt$maxs_file, sep="\t", header=FALSE)
}

# One filled area panel per V1 level, laid out in two columns.
obj <- ggplot(data=data, aes(x=V2, y=V3))
obj <- obj + geom_area(aes(fill=V1))
obj <- obj + facet_wrap(~ V1, ncol=2, strip.position = "right" )
if(length(maxs) > 0){
  obj <- obj + geom_vline(data = maxs, aes(xintercept = V2))
}
obj <- obj + xlab(opt$x_title)
obj <- obj + ylab(opt$y_title)
obj <- obj + theme(axis.text.x = element_text(angle = 45, hjust = 1))
# "none" hides the fill legend; guides(fill=FALSE) is deprecated in ggplot2.
obj <- obj + guides(fill="none")
obj <- obj + ggtitle(label = opt$graph_title)
# Print explicitly so the plot is drawn even when the script is source()d.
print(obj)
dev.off()
@@ -0,0 +1,21 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
# Positional arguments:
#   1 input table (read with a header row)   2 output directory
#   3 column for the x axis                  4 column for the y axis
#   5 x axis label                           6 y axis label
args <- commandArgs(trailingOnly = TRUE)

data <- read.table(args[1], header=TRUE)
output <- args[2]
x_axis <- args[3]
y_axis <- args[4]
x_tag <- args[5]
y_tag <- args[6]

# Keep the x categories in their order of first appearance in the file.
data[[x_axis]] <- factor(data[[x_axis]], levels = unique(data[[x_axis]]))

# The figure is always written as boxplot.pdf inside the output directory.
pdf(file.path(output, 'boxplot.pdf'))
boxplot_obj <- ggplot(data, aes(x=data[[x_axis]], y=data[[y_axis]]))
boxplot_obj <- boxplot_obj + geom_boxplot()
boxplot_obj <- boxplot_obj + xlab(x_tag)
boxplot_obj <- boxplot_obj + ylab(y_tag)
boxplot_obj <- boxplot_obj + theme(axis.text.x = element_text(angle = 45, hjust = 1))
boxplot_obj
dev.off()
@@ -0,0 +1,46 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
+
5
# Positional arguments:
#   1 input table (tab separated, with header)
#   2 column holding the values
#   3 upper x axis limit (optional)
#   4 column used to group/fill the densities (optional)
#   5 lower x axis limit (optional; 0 is used when absent)
args <- commandArgs(trailingOnly = TRUE)
file <- args[1]
values <- args[2]
x_axis_limit <- as.numeric(args[3])
groups <- args[4]
x_axis_limit_min <- as.numeric(args[5])

data <- read.table(file, header = TRUE , sep="\t")

# The figure is always written to out.pdf in the working directory.
pdf('out.pdf')

# A single density curve, or one semi-transparent filled curve per group.
grouped <- !is.na(groups)
if(grouped){
  obj <- ggplot(data, aes(x=data[[values]], fill=data[[groups]])) + geom_density(alpha=.3)
}else{
  obj <- ggplot(data, aes(x=data[[values]])) + geom_density()
}
obj <- obj + theme(legend.title=element_blank())

# Axis limits are applied only when an upper limit was supplied.
if(!is.na(x_axis_limit)){
  xmin <- if(is.na(x_axis_limit_min)) 0 else x_axis_limit_min
  obj <- obj + xlim(xmin, x_axis_limit)
}
obj
dev.off()
@@ -0,0 +1,25 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
# Positional arguments:
#   1 input table (read with a header row)   2 output directory
#   3 column for the x axis                  4 column for the y axis
#   5 column mapped to point size            6 x axis label
#   7 y axis label                           8 title of the size legend
args <- commandArgs(trailingOnly = TRUE)

data <- read.table(args[1], header=TRUE)
output <- args[2]
x_axis <- args[3]
y_axis <- args[4]
density <- args[5]
x_tag <- args[6]
y_tag <- args[7]
size_tag <- args[8]

# Keep the x categories in their order of first appearance in the file.
data[[x_axis]] <- factor(data[[x_axis]], levels = unique(data[[x_axis]]))

# The figure is always written as scatterplot.pdf inside the output directory.
pdf(file.path(output, 'scatterplot.pdf'))
scatter_obj <- ggplot(data, aes(x=data[[x_axis]], y=data[[y_axis]]))
scatter_obj <- scatter_obj + geom_point(aes(size=data[[density]]))
scatter_obj <- scatter_obj + xlab(x_tag)
scatter_obj <- scatter_obj + ylab(y_tag)
scatter_obj <- scatter_obj + labs(size = size_tag)
scatter_obj <- scatter_obj + theme(axis.text.x = element_text(angle = 45, hjust = 1))
scatter_obj
dev.off()
25
+