pets 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +7 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +6 -0
  9. data/bin/area_under_curve_pr.rb +118 -0
  10. data/bin/association_metrics_average.rb +94 -0
  11. data/bin/coPatReporter.rb +531 -0
  12. data/bin/console +14 -0
  13. data/bin/fmeasure_index.rb +72 -0
  14. data/bin/get_PR_values.rb +90 -0
  15. data/bin/get_clusters.R +18 -0
  16. data/bin/get_network_nodes.rb +197 -0
  17. data/bin/lines.R +77 -0
  18. data/bin/merge_by_cluster.rb +62 -0
  19. data/bin/merge_pairs.rb +138 -0
  20. data/bin/paco_translator.rb +102 -0
  21. data/bin/phen2reg.rb +385 -0
  22. data/bin/phen2reg_predictor_check.rb +297 -0
  23. data/bin/plot_area.R +71 -0
  24. data/bin/plot_boxplot.R +21 -0
  25. data/bin/plot_density.R +46 -0
  26. data/bin/plot_scatterplot.R +25 -0
  27. data/bin/reg2phen.rb +116 -0
  28. data/bin/region_to_patients_generator.rb +84 -0
  29. data/bin/relate_CI_to_association_value.rb +90 -0
  30. data/bin/setup +8 -0
  31. data/bin/standardize_scores.R +40 -0
  32. data/bin/xyplot_graph.R +60 -0
  33. data/external_data/biosystems_gene.gz +0 -0
  34. data/external_data/bsid2info.gz +0 -0
  35. data/external_data/chromosome_sizes_hg19.txt +24 -0
  36. data/external_data/gene_data.gz +0 -0
  37. data/external_data/gene_data_with_pathways.gz +0 -0
  38. data/external_data/gene_location.gz +0 -0
  39. data/external_data/hp.obo +146363 -0
  40. data/external_data/remove +0 -0
  41. data/lib/pets.rb +6 -0
  42. data/lib/pets/coPatReporterMethods.rb +77 -0
  43. data/lib/pets/generalMethods.rb +556 -0
  44. data/lib/pets/phen2reg_methods.rb +432 -0
  45. data/lib/pets/version.rb +3 -0
  46. data/pets.gemspec +47 -0
  47. data/templates/cohort_report.erb +93 -0
  48. data/templates/patient_report.erb +209 -0
  49. metadata +183 -0
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Interactive console for experimenting with the gem.

require "bundler/setup"
# The package ships its library entry point as lib/pets.rb; requiring
# "gephepred" raised LoadError because no such file is part of this gem.
require "pets"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
@@ -0,0 +1,72 @@
1
+ #! /usr/bin/env ruby
2
+ #Code for calculating F measure for precision-recall curves
3
+
4
+ require 'optparse'
5
+
6
+ ##########################
7
+ #METHODS
8
+ ##########################
9
+
10
# Load a tab-separated precision-recall table and group its rows by method.
#
# The first line of the file is treated as a header and skipped. Every other
# non-empty line is split on tabs into cutoff, precision, recall and method
# name (in that order); the three numeric columns are converted with to_f.
#
# @param file [String] path to the precision-recall table
# @return [Hash{String => Array<Array<Float>>}] method name => list of
#   [cutoff, precision, recall] triplets, in file order
def load_pr_data(file)
  pr_data = {}
  # File.foreach closes the handle once iteration ends; the previous
  # File.open(file).each form never closed it.
  File.foreach(file).with_index do |line, index|
    next if index.zero? # header line

    line.chomp!
    # A blank (e.g. trailing) line would otherwise be stored under a nil method.
    next if line.empty?

    cutoff, prec, rec, meth = line.split("\t")
    (pr_data[meth] ||= []) << [cutoff.to_f, prec.to_f, rec.to_f]
  end
  pr_data
end
29
+
30
# Pick, for every method, the cutoff whose F-measure is the highest.
#
# Note: despite the method name, the score computed here is the F-measure
# (2 * precision * recall / (precision + recall)), as in the original code.
#
# Rows where precision + recall is zero have no defined F-measure and are
# skipped explicitly (previously this relied on NaN never comparing greater,
# and raised ZeroDivisionError when the zeros were Integers).
# When several cutoffs share the best F-measure the first one wins; when no
# row has a positive F-measure the reported cutoff is 0.
#
# @param pr_data [Hash{Object => Array<Array<Numeric>>}] method name => list
#   of [cutoff, precision, recall] triplets
# @return [Array<Array>] one [method, best_cutoff] pair per method, in the
#   iteration order of pr_data
def calculate_youden(pr_data)
  pr_data.map do |meth, pr_values|
    max_f_measure = 0
    best_cutoff = 0
    pr_values.each do |cutoff, prec, rec|
      denominator = prec + rec
      next if denominator.zero?

      f_measure = (2 * prec * rec).fdiv(denominator)
      if max_f_measure < f_measure
        max_f_measure = f_measure
        best_cutoff = cutoff
      end
    end
    [meth, best_cutoff]
  end
end
48
+
49
+ ##########################
50
+ #OPT-PARSE
51
+ ##########################
52
options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:input_file] = nil
  opts.on("-f", "--input_file PATH", "Input file with precision-recall values") do |input_file|
    options[:input_file] = input_file
  end

end
option_parser.parse!

##########################
#MAIN
##########################

# Fail with the usage text instead of a TypeError from File.open(nil)
# when the input file option is not given.
abort("Missing required option -f/--input_file\n#{option_parser}") if options[:input_file].nil?

pr_data = load_pr_data(options[:input_file])
best_cutoffs = calculate_youden(pr_data)
# One line per method: method name and its best cutoff, tab separated.
best_cutoffs.each do |cutoffs|
  puts cutoffs.join("\t")
end
@@ -0,0 +1,90 @@
1
+ #! /usr/bin/env ruby
2
+
3
# Number of score cutoffs evaluated between the lowest and the highest score.
N_CUTS = 200

# Usage: get_PR_values.rb scores_file [pp_limit] [group]
#
# scores_file: tab-separated table with a header line; the first column is the
#              label ('in' counts as positive, 'out' as negative) and the last
#              column is the score.
# pp_limit:    optional; when > 0 enables the extrapolated tail described below.
# group:       optional; appended as an extra 'group' column to every row.
#
# Prints one row per cutoff (highest cutoff first) with the confusion-matrix
# counts, the cutoff, precision and recall.
abort("Usage: #{__FILE__} scores_file [pp_limit] [group]") if ARGV[0].nil?

pp_limit = ARGV[1].to_f if !ARGV[1].nil?

values = []
# File.foreach closes the file when iteration ends (File.open(...).each did not).
File.foreach(ARGV[0]).with_index do |line, index|
  next if index.zero? # header line

  fields = line.chomp.split("\t")
  values << [fields.first, fields.last.to_f]
end
# Without data rows there is no min/max score to build cutoffs from.
abort("No data rows found in #{ARGV[0]}") if values.empty?

values.sort! { |v1, v2| v1.last <=> v2.last }
max_score = values.last.last
min_score = values.first.last

# N_CUTS evenly spaced cutoffs starting at min_score (max_score itself excluded).
interval = (max_score - min_score).fdiv(N_CUTS)
cuts = []
current = min_score
while cuts.length < N_CUTS
  cuts << current
  current += interval
end

header = %w[tp tn fp fn cut pre rec]
header << 'group' if !ARGV[2].nil?
puts header.join("\t")
last_pre = 1
last_rec = 0
change_data = false # true once the measured values are replaced by the extrapolated tail
pre_range = 0
rec_range = 0
all_weights = []
total_weight = 0
cuts.reverse.each_with_index do |cut, i|
  tp = 0
  tn = 0
  fp = 0
  fn = 0
  values.each do |label, score|
    if score >= cut
      if label == 'in'
        tp += 1
      else
        fp += 1
      end
    else
      if label == 'out'
        tn += 1
      else
        fn += 1
      end
    end
  end
  pre = tp.fdiv(tp + fp)
  rec = tp.fdiv(tp + fn)
  if !ARGV[1].nil? && pp_limit > 0
    if !change_data
      # Ratio between this precision and the last accepted one
      # (named `pp` before, which shadowed Kernel#pp).
      precision_ratio = pre / last_pre
      if precision_ratio >= pp_limit
        change_data = true
        remaining_cuts = N_CUTS - i
        pre_range = last_pre
        # fdiv: with the initial Integer last_rec (switch at the first cut)
        # plain `/` was an integer division and always gave a step of 0.
        rec_range = (1 - last_rec).fdiv(remaining_cuts)
        # One weight per remaining cut, growing as (n+1)**8; they are consumed
        # from the largest to the smallest below.
        remaining_cuts.times do |n|
          weight = (n + 1)**8
          all_weights << weight
          total_weight += weight
        end
      else
        last_pre = pre
        last_rec = rec
      end
    end
    if change_data
      # Extrapolated row: counts are zeroed, precision decreases by its weighted
      # share of pre_range and recall grows linearly by rec_range.
      tp = tn = fp = fn = 0
      last_pre -= pre_range * all_weights.pop.fdiv(total_weight)
      last_rec += rec_range
      pre = last_pre
      pre = 0 if pre < 0
      rec = last_rec
    end
  end
  row = [tp, tn, fp, fn, cut, pre, rec]
  row << ARGV[2] if !ARGV[2].nil?
  puts row.join("\t")
end
90
+
@@ -0,0 +1,18 @@
1
+ #! /usr/bin/env Rscript
2
# Hierarchical clustering of a tab-separated table.
# Usage: get_clusters.R <matrix_file> <output_dir>
# Writes <output_dir>/cluster_asignation (cluster id per row) and
# <output_dir>/figures.pdf (dendrogram with the clusters outlined).
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("Usage: get_clusters.R <matrix_file> <output_dir>")
}

file <- args[1]
output <- args[2]

# Dendrogram height at which the tree is cut. Shared by cutree() and
# rect.hclust() so the written clusters and the drawn rectangles always agree.
cut_height <- 1.5

# matrix_data <- read.table(file, sep="\t", header=TRUE, quote = '')
matrix_data <- read.table(file, sep="\t", quote = '')
d <- dist(matrix_data, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D2")
fit$height <- round(fit$height, 6)
groups <- cutree(fit, h=cut_height)
write.table(groups, file=file.path(output, 'cluster_asignation'), sep="\t", quote=FALSE, col.names=FALSE)

pdf(file.path(output, 'figures.pdf'))
plot(fit) # display dendrogram
# red borders around the clusters obtained at cut_height (their number depends on the data)
rect.hclust(fit, h=cut_height, border="red")
dev.off()
@@ -0,0 +1,197 @@
1
+ #! /usr/bin/env ruby
2
+ # Rojano E. & Seoane P., March 2019
3
+ # Code to prepare data to get the associations between pathological phenotypes (HPO) and genomic regions (SOR)
4
+
5
+
6
ROOT_PATH = File.dirname(__FILE__)
# The shared methods ship under lib/pets in this package (there is no
# lib/gephepred), so that is the directory that must be on the load path
# for `require 'generalMethods.rb'` to resolve.
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
8
+
9
+ ##############################
10
+ #LIBRARIES
11
+ ##############################
12
+ require 'generalMethods.rb'
13
+ require 'optparse'
14
+ require "benchmark"
15
+
16
+ ###############################
17
+ #METHODS
18
+ ###############################
19
+
20
# Parse the patient table and collect phenotypes and genomic regions.
#
# Each line is split on tabs into patient, chromosome, start, stop and a
# phenotype profile (phenotype names separated by '|'). Lines containing '#'
# and lines without a phenotype profile are skipped entirely.
#
# @param patient_file [String] path to the tab-separated patient table
# @param hpo_storage [Hash] passed through to get_all_hpos
# @param hpo_dictionary [Hash] phenotype name => HPO code
# @param add_parents [String, nil] passed through to get_all_hpos
# @return [Array] four values:
#   - patient2phenotype (filled by get_all_hpos)
#   - hpo_count (filled by get_all_hpos)
#   - not_found: distinct phenotype names missing from hpo_dictionary,
#     in order of first appearance
#   - patients_genomic_region_by_chr: filled through add_record with
#     [patient, start, stop] records keyed by chromosome
def loadPatientFile(patient_file, hpo_storage, hpo_dictionary, add_parents)
  patient2phenotype = {}
  hpo_count = {}
  # Hash used as an ordered set: O(1) duplicate check instead of Array#include?.
  missing_names = {}
  patients_genomic_region_by_chr = {}
  # File.foreach closes the handle when done (File.open(...).each leaked it).
  File.foreach(patient_file) do |line|
    line.chomp!
    next if line.include?("#")

    patient, chr, start, stop, phenotype_profile = line.split("\t", 5)
    next if phenotype_profile.nil? # For skipping patients without phenotypes

    phenotype_profile.split('|').each do |hpo_name|
      hpo_code = hpo_dictionary[hpo_name]
      if hpo_code.nil?
        missing_names[hpo_name] = true
      else
        get_all_hpos(patient, hpo_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
      end
    end
    add_record(patients_genomic_region_by_chr, chr, [patient, start.to_i, stop.to_i])
  end
  # NOTE(review): add_parents == 'coh' used to compute two locals here that were
  # never used or returned, so that mode had no effect on the result. The dead
  # assignments were removed; the 'coh' behaviour itself is still unimplemented.
  return patient2phenotype, hpo_count, missing_names.keys, patients_genomic_region_by_chr
end
48
+
49
# Collect every distinct HPO code carried by at least one patient.
#
# The previous body accumulated this union in a local but ended on the `each`
# call, so the method returned its input hash and the union was lost.
#
# @param patient2phenotype [Hash{Object => Array}] patient => list of HPO codes
# @param hpo_storage [Hash] currently unused; kept so the signature is unchanged
# @return [Array] distinct HPO codes in order of first appearance
def get_parents_in_patients(patient2phenotype, hpo_storage)
  # Array#| is a set union: keeps first-seen order and drops duplicates.
  patient2phenotype.values.reduce([], :|)
end
55
+
56
# Register an HPO code for a patient and, optionally, all of its ancestors.
#
# The code is recorded in both hpo_count (code => patients) and
# patient2phenotype (patient => codes) through add_record. When add_parents
# is 'root', the same is done recursively for every parent code listed at
# index 2 of the code's hpo_storage entry.
#
# @param patient [Object] patient identifier
# @param hpo_code [Object] HPO code to register
# @param patient2phenotype [Hash] updated in place
# @param hpo_storage [Hash] HPO code => record whose element 2 holds the parent codes
# @param hpo_count [Hash] updated in place
# @param add_parents [String, nil] 'root' enables the recursion over parents
def get_all_hpos(patient, hpo_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
  add_record(hpo_count, hpo_code, patient)
  add_record(patient2phenotype, patient, hpo_code)
  return unless add_parents == 'root'

  hpo_entry = hpo_storage[hpo_code]
  # A code can be absent from the storage; indexing nil used to raise
  # NoMethodError here and abort the whole load.
  return if hpo_entry.nil?

  (hpo_entry[2] || []).each do |parent_code|
    get_all_hpos(patient, parent_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
  end
end
66
+
67
# Build the edge list of the tripartite network.
#
# Edges are two-element arrays [node, patient]. First come the edges linking
# every cluster node id to its patient, then the edges linking HPO codes to
# patients. HPO edges are only emitted for patients present in
# patients_by_cluster and for codes whose last hpo_stats value is at least
# ic_threshold.
#
# @param patients2hpo [Hash{Object => Array}] patient => HPO codes
# @param hpo_stats [Hash{Object => Array<Numeric>}] HPO code => stats; the last
#   element is compared against ic_threshold
# @param ic_threshold [Numeric] minimum value (inclusive) for an HPO edge
# @param patients_by_cluster [Hash{Object => Array}] patient => node ids
# @return [Array<Array>] list of [node, patient] pairs
def build_tripartite_network(patients2hpo, hpo_stats, ic_threshold, patients_by_cluster)
  tripartite_network = []
  patients_by_cluster.each do |patient, node_ids|
    node_ids.each { |node_id| tripartite_network << [node_id, patient] }
  end
  patients2hpo.each do |patient, codes|
    # Hash#key? is a constant-time lookup; the previous keys.include? scanned
    # the whole patient list once per patient.
    next unless patients_by_cluster.key?(patient)

    codes.each do |code|
      tripartite_network << [code, patient] if hpo_stats[code].last >= ic_threshold
    end
  end
  tripartite_network
end
84
+
85
# Compute frequency and information content (IC) for every HPO code.
#
# frequency = patients carrying the code / patient_number
# IC        = -log10(frequency)
#
# @param hpo_count [Hash{Object => Array}] HPO code => ids of the patients carrying it
# @param patient_number [Integer] total number of patients
# @return [Array] two values:
#   - hpo_stats: HPO code => [frequency, IC]
#   - a list of [patient_id, hpo_code, IC] records sorted by patient_id.to_i;
#     records with equal keys keep their insertion order
def compute_hpo_stats(hpo_count, patient_number)
  hpo_stats = {}
  patient_hpo_ic = []
  hpo_count.each do |hpo_code, patient_ids|
    hpo_freq = patient_ids.length.fdiv(patient_number) # hpo frequency in patients
    hpo_ic = -Math.log10(hpo_freq)
    hpo_stats[hpo_code] = [hpo_freq, hpo_ic]
    patient_ids.each { |patient_id| patient_hpo_ic << [patient_id, hpo_code, hpo_ic] }
  end
  # Array#sort is not stable, so rows of the same patient (and every row when
  # ids are non-numeric, since to_i gives 0) came out in arbitrary order.
  # The original position is used as a tie-breaker to make the order defined.
  sorted_records = patient_hpo_ic
                   .each_with_index
                   .sort_by { |record, position| [record.first.to_i, position] }
                   .map(&:first)
  return hpo_stats, sorted_records
end
98
+
99
# Write a hash as a tab-separated table, one line per key.
#
# Each line holds the key followed by its values. Values may be an array or a
# single scalar (scalars previously raised NoMethodError on join, while
# write_array already accepted plain strings).
#
# @param hash [Hash] key => array of values (or a single value)
# @param file_path [String] destination file, overwritten if it exists
# @param header [Array] optional column names written as the first line
def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |handler|
    handler.puts header.join("\t") unless header.empty?
    hash.each do |key, values|
      handler.puts "#{key}\t#{Array(values).join("\t")}"
    end
  end
end
107
+
108
# Write one line per record to a file.
#
# Array records are written tab-separated; any other record (a String, a
# String subclass, a number, ...) is written as is. The previous exact-class
# check (`record.class == String`) sent every non-String scalar to `join`,
# which raised NoMethodError.
#
# @param array [Array] records to write
# @param file_path [String] destination file, overwritten if it exists
def write_array(array, file_path)
  File.open(file_path, 'w') do |handler|
    array.each do |record|
      handler.puts Array(record).join("\t")
    end
  end
end
120
+
121
+ ##############################
122
+ #OPTPARSE
123
+ ##############################
124
+
125
options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  options[:cluster_file] = 'cluster_coords.txt'
  opts.on("-c", "--cluster_file PATH", "Cluster coords output file that will be used to translate SOR nodes") do |value|
    options[:cluster_file] = value
  end

  options[:excluded_hpo] = nil
  opts.on("-e", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
    options[:excluded_hpo] = excluded_hpo
  end

  options[:patient_file] = nil
  opts.on("-i", "--input_file PATH", "Input file with patients for parsing phenotypes to HPO codes") do |value|
    options[:patient_file] = value
  end

  options[:mutation_type] = 'A'
  opts.on("-m", "--mutation_type STRING", "Type of patient mutation, either it is a deletion (d) or duplication (D)") do |type|
    options[:mutation_type] = type
  end

  options[:output_file] = 'tripartite_network.txt'
  opts.on("-o", "--output_file PATH", "Output file for the tripartite network") do |value|
    options[:output_file] = value
  end

  options[:hpo_file] = nil
  opts.on("-p", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
    options[:hpo_file] = value
  end

  options[:add_parents] = nil
  opts.on("-r", "--parents STRING", "'root' to add all parents until the ontology root. 'coh' to add parents until the most general term in the cohort.") do |value|
    options[:add_parents] = value
  end

  options[:hpo_stat_file] = 'hpo_stats.txt'
  opts.on("-s", "--hpo_stat_file PATH", "Output file with HPO codes, their frequency and CI") do |value|
    options[:hpo_stat_file] = value
  end

  options[:thresold] = 0
  opts.on("-t", "--info_thresold FLOAT", "IC thresold to discard non informative hpo. Default: 0.") do |thresold|
    options[:thresold] = thresold.to_f
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end

end
option_parser.parse!

###############################
#MAIN
###############################

# The patient table is opened unconditionally by loadPatientFile; stop with
# the usage text instead of a TypeError from File.open(nil) when -i is missing.
abort("Missing required option -i/--input_file\n#{option_parser}") if options[:patient_file].nil?

hpo_black_list = load_hpo_black_list(options[:excluded_hpo])
hpo_storage = load_hpo_file(options[:hpo_file], hpo_black_list)
hpo_dictionary = create_hpo_dictionary(hpo_storage)
patients2hpo, hpo_count, not_found, chr_patients_genomic_region = loadPatientFile(options[:patient_file], hpo_storage, hpo_dictionary, options[:add_parents])
hpo_stats, patient_hpo_ic = compute_hpo_stats(hpo_count, patients2hpo.length)
patients_by_cluster, sors = generate_cluster_regions(chr_patients_genomic_region, options[:mutation_type])
tripartite_network = build_tripartite_network(patients2hpo, hpo_stats, options[:thresold], patients_by_cluster)

# Phenotype names that could not be translated, minus the explicitly excluded ones.
write_array(not_found - hpo_black_list, 'missing_hpo_names')
write_array(sors, options[:cluster_file])
# NOTE(review): the stats file keeps terms with IC strictly above the threshold,
# while build_tripartite_network keeps IC >= threshold. Confirm which is intended.
write_hash(hpo_stats.select{|hp_code, stats| stats.last > options[:thresold]}, options[:hpo_stat_file], %w[HPOcode Frequency IC])
write_array(tripartite_network, options[:output_file])
write_array(patient_hpo_ic, 'filtered_hpo.txt')
197
+
data/bin/lines.R ADDED
@@ -0,0 +1,77 @@
1
+ #! /usr/bin/env Rscript
2
+
3
+ library(ggplot2)
4
+ library(optparse)
5
+
6
+ #####################
7
+ ## OPTPARSE
8
+ #####################
9
option_list <- list(
  make_option(c("-d", "--data_file"), type="character",
    help="Tabulated file with information about each sample"),
  make_option(c("-o", "--output"), type="character", default="results",
    help="Output figure file"),
  make_option(c("-x", "--x_column"), type="character",
    help="Name of column to be used for X dimension"),
  make_option(c("-y", "--y_column"), type="character",
    help="Name of column to be used for Y dimension"),
  make_option(c("-s", "--set_column"), type="character", default="",
    help="Name of column to be used on set groups"),
  make_option(c("-L", "--no_legend"), action="store_true", default=FALSE,
    help="Remove legend"),
  make_option(c("-l", "--legend_title"), type="character", default="Association methods",
    help="Title for legend"),
  make_option(c("-c", "--colours"), type="character", default="",
    help="Define which color is asigned to each data series. List colours comma separated."),
  make_option(c("-m", "--set_geom"), type="character", default="line",
    help="Choose the type of graphical representation, using points or lines"),
  make_option(c("-e", "--establish_limits"), action="store_true", default=FALSE,
    help="Allow establishing limits for X and Y axis. If true, please set x_limit and y_limit"),
  # Limits are typed double: both axes span 0..1, so an integer-only limit
  # could never express a fractional bound such as 0.5.
  make_option(c("-X", "--x_limit"), type="double", default=0,
    help="Set x axis limit"),
  make_option(c("-Y", "--y_limit"), type="double", default=1,
    help="Set y axis limit")

)
opt <- parse_args(OptionParser(option_list=option_list))

# These three options have no default; fail early with a clear message
# instead of an obscure error from read.table() or aes().
if (is.null(opt$data_file) || is.null(opt$x_column) || is.null(opt$y_column)) {
  stop("Options --data_file, --x_column and --y_column are required")
}


################################################################
## MAIN
################################################################

data <- read.table(opt$data_file, sep="\t", header=TRUE)

pdf(paste(opt$output, '.pdf', sep=""))
# Colour the series by set_column only when one was given.
if(opt$set_column != ""){
  obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]], color=data[[opt$set_column]]))
}else{
  obj <- ggplot(data, aes(x=data[[opt$x_column]], y=data[[opt$y_column]]))
}
if(opt$colours != ""){
  colours <- unlist(strsplit(opt$colours, ','))
  obj <- obj + scale_color_manual(values=c(colours))
}

if(opt$set_geom == 'point'){
  obj <- obj + geom_point()
}else if(opt$set_geom == 'line'){
  obj <- obj + geom_line()
}

# Add each axis scale exactly once. The scales used to be added twice when
# --establish_limits was set, the second replacing the first with a message.
if (opt$establish_limits){
  obj <- obj + xlim(opt$x_limit, 1)
  obj <- obj + ylim(0, opt$y_limit)
}else{
  obj <- obj + xlim(0, 1)
  obj <- obj + ylim(0, 1)
}
obj <- obj + xlab(opt$x_column)
obj <- obj + ylab(opt$y_column)
# The series are mapped to the colour aesthetic, so the legend title must be
# set on `color`; setting it on `fill` never reached the legend.
obj <- obj + labs(color = opt$legend_title)
if(opt$no_legend){
  # "none" is the supported way to drop a guide (FALSE is deprecated).
  obj <- obj + guides(color="none")
}
print(obj)
dev.off()
77
+