pets 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
@@ -0,0 +1,297 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ ##########################
4
+ #LIBRARIES
5
+ ##########################
6
+ require 'optparse'
7
+
8
+ ##########################
9
+ #METHODS
10
+ ##########################
11
# Script-level accumulators shared by the evaluation methods below.
@success_percentage_distribution, @prediction_vector = [], [] # per-profile success values / hit flags awaiting ranking
@rankings = { :in => [], :out => [] } # ranks of predictions that did / did not touch the control region
@genome_fraction_predicted = @good_predicted_subregions = 0 # all positive cases / true positive cases
16
+
17
# Converts the hit/miss flags stored in @prediction_vector into ranks:
# each 1-based position is divided by one hundredth of the vector length,
# and the result is appended to @rankings[:in] (flag true) or
# @rankings[:out] (flag false). The vector is emptied afterwards.
# Does nothing (and returns nil) when the vector is already empty.
def compute_rankings
  return if @prediction_vector.empty?

  hundredth = @prediction_vector.length.fdiv(100)
  @prediction_vector.each.with_index(1) do |hit, position|
    bucket = hit ? :in : :out
    @rankings[bucket] << position.fdiv(hundredth)
  end
  @prediction_vector = []
end
31
+
32
# Reads a tab-separated prediction file.
#
# Column 0 holds the profile id (an optional 'ProfID:' prefix is stripped).
# When column 1 is the literal 'Results not found' the row is stored as
# [profile_index] only; otherwise columns 1-3 are chromosome, start and stop,
# column 4 is a comma-separated list whose size is recorded, and column 6 is
# the score.
#
# input_file - path of the prediction file.
#
# Returns an array of rows:
#   [profile_index, pred_chr, pred_start, pred_stop, score, predicted_hpos_number]
#   or [profile_index] for profiles without results.
def load_prediction_file(input_file)
  predicted_regions = []
  # File.foreach closes the handle when iteration ends (File.open(...).each did not).
  File.foreach(input_file) do |line|
    line.chomp!
    next if line.empty? # tolerate blank/trailing lines instead of failing on nil

    predicted_info = line.split("\t")
    profile_index = predicted_info[0].gsub('ProfID:', '').to_i
    if predicted_info[1] != 'Results not found'
      predicted_hpos_number = predicted_info[4].split(',').length
      predicted_regions << [
        profile_index,
        predicted_info[1],
        predicted_info[2].to_i,
        predicted_info[3].to_i,
        predicted_info[6].to_f,
        predicted_hpos_number
      ]
    else
      predicted_regions << [profile_index]
    end
  end
  predicted_regions
end
47
+
48
# Computes a length-weighted mean score over the first regions of the sorted
# prediction list, to be used as the imputation score.
#
# Only rows whose score (index 4) is a Float are used. Pairs of
# [score, region length] are sorted by ascending score, or descending when
# integration_method is 'fisher'. Regions are then accumulated until the
# inspected length reaches one thousandth (integer division) of the total
# length; at least one region is always inspected.
#
# predicted_regions  - rows as produced by load_prediction_file.
# integration_method - String; only 'fisher' changes the ordering.
#
# Returns the weighted mean as a Float, or nil when no row carries a score
# (previously this case raised NoMethodError on nil).
def get_imputation_scores(predicted_regions, integration_method)
  scored_regions = predicted_regions.select { |r| r[4].is_a?(Float) }
  return nil if scored_regions.empty?

  score_regionLength_pairs = scored_regions.map { |r| [r[4], r[3] - r[2]] } # combined score and region length
  score_regionLength_pairs.sort! { |p1, p2| p1.first <=> p2.first }
  score_regionLength_pairs.reverse! if integration_method == 'fisher'
  total_region_length = score_regionLength_pairs.inject(0) { |sum, pair| sum + pair.last }

  length2inspect = total_region_length / 1000
  acumulated_score = 0
  inspected_length = 0
  score_regionLength_pairs.each do |score, length|
    acumulated_score += score * length
    inspected_length += length
    break if inspected_length >= length2inspect
  end
  acumulated_score.fdiv(inspected_length)
end
67
+
68
# Returns imputation_score shifted by a random amount.
#
# The shift is imputation_score * desv * rand (rand is in [0, 1)), and a
# coin flip decides whether it is subtracted or added.
#
# imputation_score - base score.
# desv             - maximum relative deviation (e.g. 0.35).
def generate_random_imp_score(imputation_score, desv)
  deviation = imputation_score * desv * rand
  subtract = [true, false].sample
  subtract ? imputation_score - deviation : imputation_score + deviation
end
77
+
78
# Reads the control (patient) file: one patient per line, with two
# tab-separated fields, 'chr:start:stop' and a '|'-separated phenotype list.
#
# Row order is preserved because get_perfomance_table looks patients up by
# array position (ctrl_regions[profile_index]).
#
# input_data_file - path of the control file.
#
# Returns an array of [chr, start, stop, number_of_phenotypes].
def load_patient_data(input_data_file)
  patient_data = []
  # File.foreach closes the handle when iteration ends (File.open(...).each did not).
  File.foreach(input_data_file) do |line|
    line.chomp!
    mutation_coords, hpo_profile = line.split("\t")
    number_of_phenotypes = hpo_profile.split('|').length
    chr, start_pos, stop_pos = mutation_coords.split(':')
    patient_data << [chr, start_pos.to_i, stop_pos.to_i, number_of_phenotypes]
  end
  patient_data
end
89
+
90
# Builds the [group, score] table used for the PR curve by walking the
# predictions profile by profile.
#
# For every prediction, the control region of its profile is looked up by
# position in ctrl_regions. Predictions whose recovered-HPO percentage
# (predicted_hpos_number / number_of_phenotypes * 100) exceeds
# hpo_min_recovery are split into in/out pieces by get_in_out_regions. When
# the profile id changes, the accumulated pieces are handed to
# process_in_out_regions, the per-profile counters are reset and
# compute_rankings is run.
#
# Fixes over the previous version:
# * the final profile is now ranked too (compute_rankings was only called on
#   profile changes, so the last profile never reached @rankings);
# * a flush is skipped when no control coordinates are known (empty
#   prediction list or a profile index without control row) instead of
#   raising NoMethodError on nil.
#
# ctrl_regions      - rows from load_patient_data.
# predicted_regions - rows from load_prediction_file.
# scale, imputation_score, apply_imputation - forwarded to process_in_out_regions.
# hpo_min_recovery  - minimum HPO recovery percentage (exclusive).
#
# Returns an array of [group, score] pairs.
def get_perfomance_table(ctrl_regions, predicted_regions, scale, imputation_score, hpo_min_recovery, apply_imputation)
  table = []
  last_profile_id = ctrl_start = ctrl_stop = nil
  in_out_regions = []

  # Closes the profile accumulated so far and resets the per-profile state.
  flush_profile = lambda do
    unless ctrl_start.nil? || ctrl_stop.nil?
      table.concat(process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation))
    end
    @genome_fraction_predicted = 0
    @good_predicted_subregions = 0
    in_out_regions = []
    compute_rankings
  end

  predicted_regions.each do |profile_index, pred_chr, pred_start, pred_stop, score, predicted_hpos_number|
    flush_profile.call if last_profile_id != profile_index && !last_profile_id.nil?
    ctrl_chr, ctrl_start, ctrl_stop, number_of_phenotypes = ctrl_regions[profile_index] # positional lookup
    next if predicted_hpos_number.nil? || number_of_phenotypes.nil?

    hpo_recovery_percentage = (predicted_hpos_number / number_of_phenotypes.to_f) * 100
    next unless hpo_recovery_percentage > hpo_min_recovery

    in_out_regions.concat(get_in_out_regions(ctrl_chr, ctrl_start, ctrl_stop, pred_chr, pred_start, pred_stop, score))
    last_profile_id = profile_index # only profiles with an accepted prediction move the marker
  end
  flush_profile.call # last profile: emit its rows and rank it
  table
end
115
+
116
# Expands the in/out pieces of one profile into table rows.
#
# Side effects: appends get_sucess_percentage(in_out_regions) to
# @success_percentage_distribution and, when apply_imputation is set and part
# of the control region was not covered by any prediction
# (ctrl length - @good_predicted_subregions > 0), appends an extra "in" piece
# with a randomised imputation score to in_out_regions.
#
# Each piece yields ceil(region_length / evaluated_length * scale) copies of
# [group, score], where evaluated_length is @genome_fraction_predicted plus
# the control region length.
#
# Returns the array of [group, score] rows.
def process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation)
  @success_percentage_distribution << get_sucess_percentage(in_out_regions)
  ctrl_length = ctrl_stop - ctrl_start
  uncovered_length = ctrl_length - @good_predicted_subregions
  if uncovered_length > 0 && apply_imputation
    in_out_regions << ["in", generate_random_imp_score(imputation_score, 0.35), uncovered_length]
  end
  evaluated_genome_fraction = @genome_fraction_predicted + ctrl_length
  in_out_regions.flat_map do |group, score, region_length|
    list_entries = (region_length.fdiv(evaluated_genome_fraction) * scale).ceil
    Array.new(list_entries, [group, score])
  end
end
140
+
141
# Splits one predicted region into the part overlapping the control region
# ("in") and the part(s) outside it ("out").
#
# Side effects: "in" lengths are added to @good_predicted_subregions, "out"
# lengths to @genome_fraction_predicted, and @prediction_vector receives true
# when at least one "in" piece was produced, false otherwise.
#
# Returns an array of [group, score, region_length]; empty when pred_chr is
# nil and differs from ctrl_chr.
def get_in_out_regions(ctrl_chr, ctrl_start, ctrl_stop, pred_chr, pred_start, pred_stop, score)
  pieces =
    if ctrl_chr == pred_chr
      if pred_start < ctrl_start && pred_stop > ctrl_stop # prediction extends past both ends of the control region
        [["in", ctrl_stop - ctrl_start], ["out", ctrl_start - pred_start], ["out", pred_stop - ctrl_stop]]
      elsif pred_start >= ctrl_start && pred_stop <= ctrl_stop # prediction inside the control region
        [["in", pred_stop - pred_start]]
      elsif ctrl_start < pred_stop && ctrl_stop >= pred_stop # prediction starts before the control region
        [["in", pred_stop - ctrl_start], ["out", ctrl_start - pred_start]]
      elsif ctrl_start <= pred_start && ctrl_stop > pred_start # prediction ends after the control region
        [["in", ctrl_stop - pred_start], ["out", pred_stop - ctrl_stop]]
      else # same chromosome, no overlap
        [["out", pred_stop - pred_start]]
      end
    elsif !pred_chr.nil? # different chromosome
      [["out", pred_stop - pred_start]]
    else
      []
    end

  pieces.each do |group, region_length|
    if group == "in"
      @good_predicted_subregions += region_length
    else
      @genome_fraction_predicted += region_length
    end
  end
  @prediction_vector << (pieces.any? { |group, _length| group == "in" })
  pieces.map { |group, region_length| [group, score, region_length] }
end
189
+
190
# Returns the fraction of the total piece length that belongs to "in"
# pieces (length of 'in' pieces divided by the length of all pieces), or 0
# when in_out_regions is empty.
#
# in_out_regions - array of [group, score, region_length].
def get_sucess_percentage(in_out_regions)
  return 0 if in_out_regions.empty?

  inside_length = in_out_regions.select { |piece| piece[0] == 'in' }.inject(0) { |acc, piece| acc + piece[2] }
  whole_length = in_out_regions.inject(0) { |acc, piece| acc + piece[2] }
  inside_length.fdiv(whole_length)
end
203
+
204
+
205
+ ##########################
206
+ #OPT-PARSE
207
+ ##########################
208
+
209
# Command-line options with their defaults; the parser below overwrites
# the entries for which a flag is given.
options = {
  :input_prediction => nil,
  :meth => nil,
  :output_file => 'final_values_for_pr_curve.txt',
  :hpo_recovery => 0,
  :input_regions => nil,
  :success_percentage => 'success_percentage',
  :apply_imputation => false,
  :scale_size => 100
}

OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-i", "--input_prediction PATH", "Input prediction file for checking") do |path|
    options[:input_prediction] = path
  end

  opts.on("-m", "--meth STRING", "Method used in score integration calculation, affects to the imputation algorithm (if used)") do |name|
    options[:meth] = name
  end

  opts.on("-o", "--output_file PATH", "Output results for PR curve") do |path|
    options[:output_file] = path
  end

  # Parsed as a float; explicit values must be strictly positive.
  opts.on("-p", "--hpo_recovery INTEGER", "Minimum percentage of HPO terms to consider predictions") do |value|
    options[:hpo_recovery] = value.to_f
    abort("Please, choose a recovery value higher than 0") if options[:hpo_recovery] <= 0
  end

  opts.on("-r", "--input_regions PATH", "Input patients true affected regions (ctrl file)") do |path|
    options[:input_regions] = path
  end

  opts.on("-s", "--success_percentage PATH", "Output results with success percentage for each prediction") do |path|
    options[:success_percentage] = path
  end

  opts.on("-y", "--apply_imputation", "Activates imputation") do
    options[:apply_imputation] = true
  end

  opts.on("-z", "--scale_size INTEGER", "Scale region size to avoid long range regions") do |value|
    options[:scale_size] = value.to_i
    abort("Please, choose a scale value higher than 0") if options[:scale_size] <= 0
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!
261
+
262
+ ##########################
263
+ #MAIN
264
+ ##########################
265
+
266
# Both input files are mandatory; stop with a readable message instead of
# letting File raise a TypeError on a nil path.
abort("Please, provide an input prediction file (-i)") if options[:input_prediction].nil?
abort("Please, provide the patients true affected regions file (-r)") if options[:input_regions].nil?

predicted_regions = load_prediction_file(options[:input_prediction])
imputation_score = get_imputation_scores(predicted_regions, options[:meth])
patient_data = load_patient_data(options[:input_regions])

table = get_perfomance_table(patient_data, predicted_regions, options[:scale_size], imputation_score, options[:hpo_recovery], options[:apply_imputation])

# Table for the PR curve: one "group<TAB>score" row per table entry.
File.open(options[:output_file], 'w') do |f|
  f.puts "group\tscore"
  table.each do |output, score|
    if options[:apply_imputation]
      # Rows without score get a randomised imputed one.
      score = generate_random_imp_score(imputation_score, 0.35) if score.nil?
    else
      next if score.nil? # when no imputation
    end
    f.puts "#{output}\t#{score}"
  end
end

# One success value per processed profile.
File.open(options[:success_percentage], 'w') do |f|
  f.puts 'perc'
  @success_percentage_distribution.each do |pg|
    f.puts pg
  end
end

# Ranks of the predictions, labelled 'in' or 'out'; written to a fixed
# file name in the working directory.
File.open('ranking', 'w') do |f|
  f.puts "reg\tranking"
  @rankings.each do |reg, ranks|
    ranks.each do |rank|
      f.puts "#{reg.to_s}\t#{rank}"
    end
  end
end
297
+
data/bin/plot_area.R ADDED
@@ -0,0 +1,71 @@
1
+ #! /usr/bin/env Rscript
2
+ # x,y graph
3
+
4
+ library(ggplot2)
5
+ library(optparse)
6
+
7
+ ################################################################
8
+ # OPTPARSE
9
+ ################################################################
10
################################################################
# OPTPARSE
################################################################
option_list <- list(
  make_option(c("-d", "--data_file"), type="character",
    help="Tabulated file with information about each sample"),
  make_option(c("-o", "--output"), type="character", default="results",
    help="Output figure file"),
  make_option(c("-x", "--x_values"), type="character",
    help="Name of column with values to be plotted"),
  make_option(c("-y", "--y_values"), type="character",
    help="Name of column with values to be plotted"),
  make_option(c("-f", "--density_values"), type="character",
    help="Name of column to be used as density values"),
  make_option(c("-H", "--header"), action="store_false", default=TRUE,
    help="The input table not have header line"),
  make_option(c("-X", "--x_title"), type="character",
    help="Name of column to be used for bars titles"),
  make_option(c("-Y", "--y_title"), type="character",
    help="Title of y axis"),
  make_option(c("-F", "--output_format"), type="character", default="pdf",
    help="pdf or jpeg file output format"),
  make_option(c("-m", "--maxs_file"), type="character", default="",
    help="Tabulated file maximum of each sample"),
  make_option(c("-t", "--graph_title"), type="character", default="",
    help="Title of the graph")
)
opt <- parse_args(OptionParser(option_list=option_list))

################################################################
## MAIN
################################################################

data <- read.table(opt$data_file, sep="\t", header=opt$header)

# Open the graphics device matching the requested format.
if (opt$output_format == "pdf"){
  pdf(paste(opt$output, '.pdf', sep=""))
}else if(opt$output_format == "jpeg"){
  jpeg(paste(opt$output, '.jpeg', sep=""))
}

# Facets follow the karyotype order instead of the alphabetical one.
goodChrOrder <- c(1:22,"X","Y")
data$V1 <- factor(data$V1,levels=goodChrOrder)

# Optional table with one vertical marker (column V2) per facet (column V1).
maxs <- c()
if(opt$maxs_file != ""){
  maxs <- read.table(opt$maxs_file, sep="\t", header=FALSE)
}

# NOTE(review): --x_values, --y_values and --density_values are parsed but not
# used; the plot always reads columns V1 (facet/fill), V2 (x) and V3 (y), which
# read.table only names that way when the table has no header line (-H) --
# confirm the intended usage.
obj <- ggplot(data=data, aes(x=V2, y=V3 ))
obj <- obj + geom_area(aes(fill=V1, ))
obj <- obj + facet_wrap(~ V1, ncol=2, strip.position = "right" )
if(length(maxs) > 0){
  obj <- obj + geom_vline(data = maxs, aes(xintercept = V2))
}
obj <- obj + xlab(opt$x_title)
obj <- obj + ylab(opt$y_title)
obj <- obj + theme(axis.text.x = element_text(angle = 45, hjust = 1))
# guides(fill=FALSE) is deprecated in ggplot2; "none" is the supported spelling.
obj <- obj + guides(fill="none")
obj <- obj + ggtitle(label = opt$graph_title)
# Explicit print so the figure is drawn even when the script is source()d.
print(obj)
dev.off()
@@ -0,0 +1,21 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
# Positional arguments: table file, output directory, x column, y column,
# x axis label, y axis label.
args <- commandArgs(trailingOnly = TRUE)
output <- args[2]
x_axis <- args[3]
y_axis <- args[4]
x_tag <- args[5]
y_tag <- args[6]

data <- read.table(args[1], header=TRUE)

# Keep the x categories in order of first appearance in the file.
data[[x_axis]] <- factor(data[[x_axis]], levels = unique(data[[x_axis]]))

pdf(file.path(output, 'boxplot.pdf'))
boxes <- ggplot(data, aes(x=data[[x_axis]], y=data[[y_axis]]))
boxes <- boxes + geom_boxplot()
boxes <- boxes + xlab(x_tag)
boxes <- boxes + ylab(y_tag)
boxes <- boxes + theme(axis.text.x = element_text(angle = 45, hjust = 1))
boxes
dev.off()
@@ -0,0 +1,46 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
+
5
# Positional arguments: table file, value column, upper x limit (optional),
# grouping column (optional), lower x limit (optional). Output goes to out.pdf.
args <- commandArgs(trailingOnly = TRUE)
file <- args[1]
values <- args[2]
x_axis_limit <- as.numeric(args[3])
groups <- args[4]
x_axis_limit_min <- as.numeric(args[5])

data <- read.table(file, header = TRUE , sep="\t")

pdf('out.pdf')
# One curve for the whole table, or one translucent curve per group.
if(!is.na(groups)){
  obj <- ggplot(data, aes(x=data[[values]], fill=data[[groups]])) + geom_density(alpha=.3)
}else{
  obj <- ggplot(data, aes(x=data[[values]])) + geom_density()
}
obj <- obj + theme(legend.title=element_blank())

# The x range is only restricted when an upper limit is given; the lower
# bound defaults to 0.
if(!is.na(x_axis_limit)){
  xmin <- if(is.na(x_axis_limit_min)) 0 else x_axis_limit_min
  obj <- obj + xlim(xmin, x_axis_limit)
}
obj
dev.off()
@@ -0,0 +1,25 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
# Positional arguments: table file, output directory, x column, y column,
# point-size column, x axis label, y axis label, size legend title.
args <- commandArgs(trailingOnly = TRUE)
output <- args[2]
x_axis <- args[3]
y_axis <- args[4]
density <- args[5]
x_tag <- args[6]
y_tag <- args[7]
size_tag <- args[8]

data <- read.table(args[1], header=TRUE)

# Keep the x categories in order of first appearance in the file.
data[[x_axis]] <- factor(data[[x_axis]], levels = unique(data[[x_axis]]))

pdf(file.path(output, 'scatterplot.pdf'))
points <- ggplot(data, aes(x=data[[x_axis]], y=data[[y_axis]]))
points <- points + geom_point(aes(size=data[[density]]))
points <- points + xlab(x_tag)
points <- points + ylab(y_tag)
points <- points + labs(size = size_tag)
points <- points + theme(axis.text.x = element_text(angle = 45, hjust = 1))
points
dev.off()
25
+