pets 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

require "bundler/setup"
# The library entry point shipped with this gem is lib/pets.rb; the former
# name "gephepred" is not among the packaged files and would raise LoadError.
require "pets"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
@@ -0,0 +1,72 @@
1
+ #! /usr/bin/env ruby
2
+ #Code for calculating F measure for precision-recall curves
3
+
4
+ require 'optparse'
5
+
6
+ ##########################
7
+ #METHODS
8
+ ##########################
9
+
10
# Loads a tab-separated precision-recall table and groups its rows by method.
#
# The first line of the file is treated as a header and skipped. Blank lines
# are ignored. For every other line the first four tab-separated fields are
# read as cutoff, precision, recall and method name.
#
# @param file [String] path to the precision-recall table
# @return [Hash{String => Array<Array<Float>>}] method name => list of
#   [cutoff, precision, recall] triples, in file order
def load_pr_data(file)
  pr_data = {}
  # File.foreach closes the handle once iteration finishes; the previous
  # File.open(file).each left it open until garbage collection.
  File.foreach(file).with_index do |line, index|
    next if index.zero? # header row
    next if line.strip.empty? # a blank line would otherwise yield a nil-method record

    cutoff, prec, rec, meth = line.chomp.split("\t")
    (pr_data[meth] ||= []) << [cutoff.to_f, prec.to_f, rec.to_f]
  end
  pr_data
end
29
+
30
# Selects, for every method, the cutoff with the highest F-measure,
# computed as 2 * precision * recall / (precision + recall).
#
# Note that, despite the method name, the statistic maximised is the
# F-measure. A cutoff only replaces the current best when its F-measure is
# strictly greater, so the first of several tied cutoffs wins and the
# cutoff stays 0 when no F-measure exceeds 0.
#
# @param pr_data [Hash] method => list of [cutoff, precision, recall]
# @return [Array<Array>] one [method, best_cutoff] pair per method
def calculate_youden(pr_data)
  pr_data.map do |meth, pr_values|
    top_score = 0
    chosen_cutoff = 0
    pr_values.each do |cutoff, prec, rec|
      score = 2 * prec * rec / (prec + rec)
      next unless top_score < score

      top_score = score
      chosen_cutoff = cutoff
    end
    [meth, chosen_cutoff]
  end
end
48
+
49
+ ##########################
50
+ #OPT-PARSE
51
+ ##########################
52
options = { input_file: nil }
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-f", "--input_file PATH", "Input file with precision-recall values") do |input_file|
    options[:input_file] = input_file
  end
end
parser.parse!

# Without -f the script used to fail inside File.open(nil) with a TypeError;
# report the missing option together with the usage text instead.
abort("Missing required option -f/--input_file\n#{parser}") if options[:input_file].nil?

##########################
#MAIN
##########################

# Print one "method<TAB>best cutoff" line per method found in the input table.
pr_data = load_pr_data(options[:input_file])
calculate_youden(pr_data).each do |method_and_cutoff|
  puts method_and_cutoff.join("\t")
end
@@ -0,0 +1,90 @@
1
+ #! /usr/bin/env ruby
2
+
3
# Number of score cutoffs at which precision and recall are evaluated.
N_CUTS = 200

# Command line: SCORES_FILE [PP_LIMIT] [GROUP]
#   SCORES_FILE  tab-separated table with a header line; the first field of a
#                row is its label ('in' / 'out'), the last field its score.
#   PP_LIMIT     optional; when > 0 enables the synthetic curve tail below.
#   GROUP        optional; appended verbatim as an extra 'group' column.
abort("Usage: #{File.basename(__FILE__)} SCORES_FILE [PP_LIMIT] [GROUP]") if ARGV[0].nil?

pp_limit = ARGV[1].to_f unless ARGV[1].nil?
group = ARGV[2]

values = []
# File.foreach closes the handle when done (File.open(...).each did not).
File.foreach(ARGV[0]).with_index do |line, index|
  next if index.zero? # header row
  next if line.strip.empty? # blank lines would become bogus records scored 0.0

  fields = line.chomp.split("\t")
  values << [fields.first, fields.last.to_f]
end
# With no data rows there is no score range to cut; previously this crashed
# with NoMethodError on nil.
abort("No data rows found in #{ARGV[0]}") if values.empty?

min_score, max_score = values.map(&:last).minmax

# N_CUTS evenly spaced cutoffs starting at the minimum score; the maximum
# score itself is not included.
interval = (max_score - min_score).fdiv(N_CUTS)
cuts = []
current = min_score
N_CUTS.times do
  cuts << current
  current += interval
end

header = %w[tp tn fp fn cut pre rec]
header << 'group' unless group.nil?
puts header.join("\t")

last_pre = 1
last_rec = 0
change_data = false
pre_range = 0
rec_range = 0
weights = []
total_weight = 0
# Walk the cutoffs from the highest to the lowest.
cuts.reverse_each.with_index do |cut, i|
  tp = 0
  tn = 0
  fp = 0
  fn = 0
  values.each do |label, score|
    if score >= cut
      if label == 'in'
        tp += 1
      else
        fp += 1
      end
    elsif label == 'out'
      tn += 1
    else
      fn += 1
    end
  end
  pre = tp.fdiv(tp + fp)
  rec = tp.fdiv(tp + fn)
  if pp_limit && pp_limit > 0
    unless change_data
      # Ratio between this cut's precision and the previous cut's precision.
      pre_ratio = pre / last_pre
      if pre_ratio >= pp_limit
        # From this cut onwards the measured values are replaced by a
        # synthetic tail: recall advances in equal steps up to 1, while
        # precision is reduced by shares of its last measured value that
        # are proportional to (n + 1)**8, largest share first.
        change_data = true
        remaining = N_CUTS - i
        pre_range = last_pre
        # fdiv avoids integer division (result 0) when the switch happens on
        # the very first cut, where last_rec is still the Integer 0.
        rec_range = (1 - last_rec).fdiv(remaining)
        remaining.times do |n|
          weight = (n + 1)**8
          weights << weight
          total_weight += weight
        end
      else
        last_pre = pre
        last_rec = rec
      end
    end
    if change_data
      # Synthetic rows carry no confusion-matrix counts.
      tp = tn = fp = fn = 0
      last_pre -= pre_range * weights.pop.fdiv(total_weight)
      last_rec += rec_range
      pre = last_pre
      pre = 0 if pre < 0
      rec = last_rec
    end
  end
  row = [tp, tn, fp, fn, cut, pre, rec]
  row << group unless group.nil?
  puts row.join("\t")
end
90
+
@@ -0,0 +1,18 @@
1
+ #! /usr/bin/env Rscript
2
args <- commandArgs(trailingOnly = TRUE)
# Both positional arguments are required: input matrix and output directory.
if (length(args) < 2) {
  stop("Usage: get_clusters.R <matrix_file> <output_dir>")
}

file <- args[1]
output <- args[2]

# Height at which the dendrogram is cut. The same value must be used for the
# cluster assignment and for the boxes drawn on the plot.
cut_height <- 1.5

# matrix_data <- read.table(file, sep="\t", header=TRUE, quote = '')
matrix_data <- read.table(file, sep="\t", quote = '')
d <- dist(matrix_data, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D2")
fit$height <- round(fit$height, 6)
groups <- cutree(fit, h=cut_height)
write.table(groups, file=file.path(output, 'cluster_asignation'), sep="\t", quote=FALSE, col.names=FALSE)

pdf(file.path(output, 'figures.pdf'))
plot(fit) # display dendrogram
rect.hclust(fit, h=cut_height, border="red") # red boxes around the clusters obtained at cut_height
dev.off()
@@ -0,0 +1,197 @@
1
+ #! /usr/bin/env ruby
2
+ # Rojano E. & Seoane P., March 2019
3
+ # Code to prepare data to get the associations between pathological phenotypes (HPO) and genomic regions (SOR)
4
+
5
+
6
ROOT_PATH = File.dirname(__FILE__)
# generalMethods.rb is packaged under lib/pets; the previous 'gephepred'
# directory is not part of this gem, so the require below could not resolve.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))

##############################
#LIBRARIES
##############################
require 'generalMethods.rb'
require 'optparse'
require "benchmark"
15
+
16
+ ###############################
17
+ #METHODS
18
+ ###############################
19
+
20
# Reads the patient table and collects phenotype and genomic-region records.
#
# Lines containing "#" are skipped. Every other line is split on tabs into at
# most five fields: patient, chr, start, stop and a phenotype profile, the
# latter being a '|'-separated list of HPO names. Lines without a phenotype
# profile are skipped entirely (no region is recorded for them).
#
# @param patient_file [String] path to the patient table
# @param hpo_storage [Hash] passed through to get_all_hpos
# @param hpo_dictionary [Hash] HPO name => HPO code
# @param add_parents [String, nil] passed through to get_all_hpos; 'coh'
#   additionally triggers get_parents_in_patients
# @return [Array] patient2phenotype, hpo_count, not_found (unique HPO names
#   missing from hpo_dictionary, in order of first appearance) and
#   patients_genomic_region_by_chr (filled via add_record with
#   [patient, start, stop] keyed by chr)
def loadPatientFile(patient_file, hpo_storage, hpo_dictionary, add_parents)
  patient2phenotype = {}
  hpo_count = {}
  not_found = []
  patients_genomic_region_by_chr = {}
  # File.foreach closes the handle when iteration ends; File.open(...).each
  # left it open.
  File.foreach(patient_file) do |line|
    line.chomp!
    next if line.include?("#")

    patient, chr, start, stop, phenotype_profile = line.split("\t", 5)
    next if phenotype_profile.nil? # For skipping patients without phenotypes

    phenotype_profile.split('|').each do |hpo_name|
      hpo_code = hpo_dictionary[hpo_name]
      if hpo_code.nil?
        not_found << hpo_name
      else
        get_all_hpos(patient, hpo_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
      end
    end
    add_record(patients_genomic_region_by_chr, chr, [patient, start.to_i, stop.to_i])
  end
  # Deduplicate once, keeping first occurrences, instead of scanning the
  # array on every insertion.
  not_found.uniq!
  if add_parents == 'coh'
    # NOTE(review): the result of this call is not used anywhere, so the
    # 'coh' mode currently has no effect on the returned data.
    get_parents_in_patients(patient2phenotype, hpo_storage)
  end
  return patient2phenotype, hpo_count, not_found, patients_genomic_region_by_chr
end
48
+
49
# Builds the union of the HPO codes of all patients.
#
# NOTE(review): the union is only accumulated in a local variable and is
# never returned; the method's value is the result of `each` (the
# patient2phenotype collection itself) and hpo_storage is not used. This
# looks unfinished - confirm the intended return value before relying on it.
def get_parents_in_patients(patient2phenotype, hpo_storage)
  union_of_codes = []
  patient2phenotype.each do |_patient, codes|
    union_of_codes |= codes
  end
end
55
+
56
# Registers hpo_code for patient in both lookup tables (via add_record) and,
# when add_parents is 'root', repeats the registration recursively for every
# code listed at index 2 of hpo_storage[hpo_code] (presumably the term's
# parent codes - confirm against the loader of hpo_storage).
#
# Returns nil when add_parents is not 'root'.
def get_all_hpos(patient, hpo_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
  add_record(hpo_count, hpo_code, patient)
  add_record(patient2phenotype, patient, hpo_code)
  return unless add_parents == 'root'

  hpo_storage[hpo_code][2].each do |ancestor_code|
    get_all_hpos(patient, ancestor_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
  end
end
66
+
67
# Builds the edge list of the network as [node, patient] pairs.
#
# First every [node_id, patient] pair from patients_by_cluster is added.
# Then, for each patient that is present in patients_by_cluster, one
# [hpo_code, patient] pair is added per HPO code whose last hpo_stats entry
# is greater than or equal to ic_threshold.
#
# @param patients2hpo [Hash] patient => list of HPO codes
# @param hpo_stats [Hash] HPO code => array whose last element is compared
#   with ic_threshold
# @param ic_threshold [Numeric] minimum value for an HPO code to be kept
# @param patients_by_cluster [Hash] patient => list of node ids
# @return [Array<Array>] list of [node, patient] pairs
def build_tripartite_network(patients2hpo, hpo_stats, ic_threshold, patients_by_cluster)
  tripartite_network = []
  patients_by_cluster.each do |patient, node_ids|
    node_ids.each { |node_id| tripartite_network << [node_id, patient] }
  end
  patients2hpo.each do |patient, codes|
    # Hash#key? is a constant-time lookup; the previous keys.include? scan
    # made this loop quadratic in the number of patients.
    next unless patients_by_cluster.key?(patient)

    codes.each do |code|
      tripartite_network << [code, patient] if hpo_stats[code].last >= ic_threshold
    end
  end
  tripartite_network
end
84
+
85
# Computes, for every HPO code, its frequency among patients and the value
# -log10(frequency).
#
# @param hpo_count [Hash] HPO code => list of patient ids carrying it
# @param patient_number [Integer] total number of patients (denominator)
# @return [Array] two values:
#   - Hash: HPO code => [frequency, -log10(frequency)]
#   - Array of [patient_id, hpo_code, -log10(frequency)] records sorted by
#     the integer value (to_i) of the patient id
def compute_hpo_stats(hpo_count, patient_number)
  hpo_stats = {}
  ic_records = []
  hpo_count.each do |hpo_code, patient_ids|
    frequency = patient_ids.length.fdiv(patient_number)
    information_content = -Math.log10(frequency)
    hpo_stats[hpo_code] = [frequency, information_content]
    ic_records.concat(patient_ids.map { |patient_id| [patient_id, hpo_code, information_content] })
  end
  sorted_records = ic_records.sort { |left, right| left.first.to_i <=> right.first.to_i }
  return hpo_stats, sorted_records
end
98
+
99
# Writes a hash to file_path as tab-separated text: one line per entry made
# of the key followed by the tab-joined elements of its array value. When
# header is not empty it is written first, tab-joined, as its own line.
def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |out|
    out.puts(header.join("\t")) unless header.empty?
    hash.each do |key, values|
      out.puts("#{key}\t#{values.join("\t")}")
    end
  end
end
107
+
108
# Writes one line per record to file_path. Records whose class is exactly
# String are written as they are; any other record is tab-joined first.
def write_array(array, file_path)
  File.open(file_path, 'w') do |out|
    array.each do |record|
      out.puts(record.instance_of?(String) ? record : record.join("\t"))
    end
  end
end
120
+
121
+ ##############################
122
+ #OPTPARSE
123
+ ##############################
124
+
125
# Defaults for every command line option; each flag below overwrites its entry.
options = {
  cluster_file: 'cluster_coords.txt',
  excluded_hpo: nil,
  patient_file: nil,
  mutation_type: 'A',
  output_file: 'tripartite_network.txt',
  hpo_file: nil,
  add_parents: nil,
  hpo_stat_file: 'hpo_stats.txt',
  thresold: 0
}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-c", "--cluster_file PATH", "Cluster coords output file that will be used to translate SOR nodes") { |value| options[:cluster_file] = value }

  opts.on("-e", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") { |value| options[:excluded_hpo] = value }

  opts.on("-i", "--input_file PATH", "Input file with patients for parsing phenotypes to HPO codes") { |value| options[:patient_file] = value }

  opts.on("-m", "--mutation_type STRING", "Type of patient mutation, either it is a deletion (d) or duplication (D)") { |value| options[:mutation_type] = value }

  opts.on("-o", "--output_file PATH", "Output file for the tripartite network") { |value| options[:output_file] = value }

  opts.on("-p", "--hpo_file PATH", "Input HPO file for extracting HPO codes") { |value| options[:hpo_file] = value }

  opts.on("-r", "--parents STRING", "'root' to add all parents until the ontology root. 'coh' to add parents until the most general term in the cohort.") { |value| options[:add_parents] = value }

  opts.on("-s", "--hpo_stat_file PATH", "Output file with HPO codes, their frequency and CI") { |value| options[:hpo_stat_file] = value }

  opts.on("-t", "--info_thresold FLOAT", "IC thresold to discard non informative hpo. Default: 0.") { |value| options[:thresold] = value.to_f }

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

###############################
#MAIN
###############################
# Load the HPO data, read the patients, derive per-HPO statistics and the
# cluster regions, then assemble the network edge list.
hpo_black_list = load_hpo_black_list(options[:excluded_hpo])
hpo_storage = load_hpo_file(options[:hpo_file], hpo_black_list)
hpo_dictionary = create_hpo_dictionary(hpo_storage)
patients2hpo, hpo_count, not_found, chr_patients_genomic_region = loadPatientFile(
  options[:patient_file], hpo_storage, hpo_dictionary, options[:add_parents]
)
hpo_stats, patient_hpo_ic = compute_hpo_stats(hpo_count, patients2hpo.length)
patients_by_cluster, sors = generate_cluster_regions(chr_patients_genomic_region, options[:mutation_type])
tripartite_network = build_tripartite_network(patients2hpo, hpo_stats, options[:thresold], patients_by_cluster)

# Only HPO codes whose last statistic is strictly above the threshold are
# reported in the statistics file.
reported_stats = hpo_stats.select { |_hp_code, stats| stats.last > options[:thresold] }

write_array(not_found - hpo_black_list, 'missing_hpo_names')
write_array(sors, options[:cluster_file])
write_hash(reported_stats, options[:hpo_stat_file], %w[HPOcode Frequency IC])
write_array(tripartite_network, options[:output_file])
write_array(patient_hpo_ic, 'filtered_hpo.txt')
197
+
data/bin/lines.R ADDED
@@ -0,0 +1,77 @@
1
+ #! /usr/bin/env Rscript
2
+
3
library(ggplot2)
library(optparse)

#####################
## OPTPARSE
#####################
option_list <- list(
	make_option(c("-d", "--data_file"), type="character",
		help="Tabulated file with information about each sample"),
	make_option(c("-o", "--output"), type="character", default="results",
		help="Output figure file"),
	make_option(c("-x", "--x_column"), type="character",
		help="Name of column to be used for X dimension"),
	make_option(c("-y", "--y_column"), type="character",
		help="Name of column to be used for Y dimension"),
	make_option(c("-s", "--set_column"), type="character", default="",
		help="Name of column to be used on set groups"),
	make_option(c("-L", "--no_legend"), action="store_true", default=FALSE,
		help="Remove legend"),
	make_option(c("-l", "--legend_title"), type="character", default="Association methods",
		help="Title for legend"),
	make_option(c("-c", "--colours"), type="character", default="",
		help="Define which color is asigned to each data series. List colours comma separated."),
	make_option(c("-m", "--set_geom"), type="character", default="line",
		help="Choose the type of graphical representation, using points or lines"),
	make_option(c("-e", "--establish_limits"), action="store_true", default=FALSE,
		help="Allow establishing limits for X and Y axis. If true, please set x_limit and y_limit"),
	# Limits are typed as double: both axes span 0..1, so an integer-only
	# option could never express a fractional limit such as 0.5.
	make_option(c("-X", "--x_limit"), type="double", default=0,
		help="Set x axis limit"),
	make_option(c("-Y", "--y_limit"), type="double", default=1,
		help="Set y axis limit")
)
opt <- parse_args(OptionParser(option_list=option_list))


################################################################
## MAIN
################################################################

data <- read.table(opt$data_file, sep="\t", header=TRUE)

pdf(paste(opt$output, '.pdf', sep=""))
if(opt$set_column != ""){
	obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]], color=data[[opt$set_column]]))
}else{
	obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]]))
}
if(opt$colours != ""){
	colours <- unlist(strsplit(opt$colours, ','))
	obj <- obj + scale_color_manual(values=c(colours))
}

if(opt$set_geom == 'point'){
	obj <- obj + geom_point()
}else if(opt$set_geom == 'line'){
	obj <- obj + geom_line()
}

# Decide the axis limits first and add each scale only once; adding xlim/ylim
# twice makes ggplot2 replace the first scale and emit a message.
x_limits <- c(0, 1)
y_limits <- c(0, 1)
if (opt$establish_limits){
	x_limits <- c(opt$x_limit, 1) # user value is the lower bound of X
	y_limits <- c(0, opt$y_limit) # user value is the upper bound of Y
}
obj <- obj + xlim(x_limits[1], x_limits[2])
obj <- obj + ylim(y_limits[1], y_limits[2])

obj <- obj + xlab(opt$x_column)
obj <- obj + ylab(opt$y_column)
# The series are mapped to the colour aesthetic, so the legend title has to be
# set on colour; setting it on fill had no effect.
obj <- obj + labs(color = opt$legend_title)
if(opt$no_legend){
	# "none" is the documented way to drop a guide; passing FALSE is deprecated.
	obj <- obj + guides(color="none")
}
print(obj)
dev.off()
77
+