anncrsnp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +6 -0
- data/anncrsnp.gemspec +35 -0
- data/bin/console +14 -0
- data/bin/grdbfinder.rb +472 -0
- data/bin/grdbmanager.rb +226 -0
- data/bin/masterfeatures.rb +188 -0
- data/bin/setup +7 -0
- data/bin/statistics.rb +193 -0
- data/database/deleteme +0 -0
- data/lib/anncrsnp/dataset.rb +178 -0
- data/lib/anncrsnp/parsers/ucscparser.rb +35 -0
- data/lib/anncrsnp/version.rb +3 -0
- data/lib/anncrsnp.rb +5 -0
- metadata +144 -0
data/bin/grdbmanager.rb
ADDED
@@ -0,0 +1,226 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
ROOT_PATH = File.dirname(__FILE__)
|
4
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'anncrsnp'))
|
5
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'parsers'))
|
6
|
+
|
7
|
+
require 'optparse'
|
8
|
+
require 'ucscparser'
|
9
|
+
require 'dataset'
|
10
|
+
require 'sqlite3'
|
11
|
+
|
12
|
+
# Command-line options. Defaults are declared up front and overridden by the
# flags parsed below.
options = {
  data: nil,                           # -d: directory scanned for source files
  create_sql: false,                   # -s: build/update the SQLite database
  output_path: "genomic_data.sqlite",  # -o: path of the database file
  verbose: nil                         # -v / --no-verbose
}

OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  opts.on("-d", "--data_directory PATH", "Directory used to extract data") do |data|
    options[:data] = data
  end

  # Plain true/false literals are used: the TRUE/FALSE constants are
  # deprecated and are not defined on current Ruby versions.
  opts.on("-s", "--create_sql", "Create SQL DB") do
    options[:create_sql] = true
  end

  opts.on("-o", '--output_path PATH', 'Output path for DB') do |output_path|
    options[:output_path] = output_path
  end

  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end
end.parse!
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
# Load every recognised source file found in the data directory into all_data,
# keyed by the dataset class name later stored in GenomicRange.type.
# Each branch lists the extra columns of the file (header), parses it, drops
# the columns that are not kept as annotations and tags the dataset with its
# classification.
# For parseUCSCformat, pass false as third argument when the file has no
# UCSC "bin" field.
all_data = {}
data_dir = options[:data]
# Guard against a missing -d option: File.exist?(nil) raises TypeError.
if !data_dir.nil? && File.exist?(data_dir)
  Dir.glob(File.join(data_dir, '*.{txt,bed,csv}')).each do |file| # path to each candidate file in the directory
    ### Definitive sources
    case File.basename(file)
    when "wgEncodeAwgDnaseMasterSites.bed"
      header = [:score, :floatScore, :sourceCount, :sourceIds]
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.numeric_filter(:sourceCount, 2)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'DNAseHS')
      all_data['dnaseData'] = current_dataset
    when "wgEncodeHaibMethyl450Ag04449SitesRep1.bed"
      header = [:score, :strand, :thickStart, :thickEnd, :itemRgb]
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'Metilation_sites')
      all_data['metilationData'] = current_dataset
    when "snp144Common.txt"
      header = [:score, :strand, :refNCBI, :refUCSC, :observed, :molType, :class, :valid, :avHet, :avHetSE, :func, :locType, :weight, :exceptions, :submitterCount, :submitters, :alleleFreqCount, :alleles, :alleleNs, :alleleFreqs, :bitfields]
      current_dataset = parseUCSCformat(file, header)
      # Every extra column is dropped except :class and :func.
      current_dataset.drop_columns(header - [:class, :func])
      current_dataset.add_metadata(:classification, 'SNP')
      all_data['snpDbSnp'] = current_dataset
    when "refGene.txt"
      header = [:name, :strand, :cdsStart, :cdsEnd, :exonCount, :exonStarts, :exonEnds, :score, :cdsStartStat, :cdsEndStat, :exonFrames]
      current_dataset = parseUCSCrefseqformat(file, header)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'gene')
      all_data['gene'] = current_dataset
    when "TFBSMasterSites.txt"
      # Must be generated with
      # "masterfeatures.rb tfbs/files.txt antibody import_data/TFBSMasterSites.txt tfbs/"
      header = []
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.add_metadata(:classification, 'TFBS')
      all_data['tfbs'] = current_dataset
    when "HistoneModMasterSites.txt"
      # NOTE(review): the original comment repeated the TFBS command line
      # above; presumably this file is generated with masterfeatures.rb from
      # the histone modification files - confirm the exact arguments.
      header = []
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.add_metadata(:classification, 'HistoneModification')
      all_data['HistoneModification'] = current_dataset
    when "46waycons.txt"
      header = [:span, :count, :offset, :file, :lowerLimit, :dataRange, :validCount, :sumData, :sumSquares]
      current_dataset = parseUCSCformat(file, header)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'ConservedRegions')
      all_data['ConservedRegions'] = current_dataset
    when "enhancer_tss_associations.bed"
      header = [:score, :strand, :enh_start, :enh_stop, :array, :index, :val1, :val2]
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'Enhancers')
      all_data['Enhancers'] = current_dataset
    when "enhancers.csv"
      header = [:cell_line, :index1, :index2, :index3, :index4, :index5, :index6, :index7]
      current_dataset = parseDENdbCSVformat(file, header)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'DENdbEnhancers')
      all_data['DENdbEnhancers'] = current_dataset
    when "all_hg19_bed.bed"
      header = [:counter]
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'SuperEnhancers')
      all_data['SuperEnhancers'] = current_dataset
    end
  end
end
|
108
|
+
|
109
|
+
# Build a new SQLite database, or incrementally update an existing one, from
# the datasets collected in all_data. Nothing is done unless -s/--create_sql
# was given.
if options[:create_sql]
  # The schema is created only when the output file does not exist yet;
  # otherwise the run is an incremental update of the existing database.
  # (File.exist? replaces File.exists?, which was removed in Ruby 3.2.)
  new_database = !File.exist?(options[:output_path])
  commands = []
  if new_database
    commands << "CREATE TABLE GenomicRange(
      bin,
      chr,
      start,
      end,
      type,
      name,
      AnnotationId
    )"
    commands << "CREATE TABLE Annotation(
      value,
      AnnotationTypeId
    )"
    commands << "CREATE TABLE AnnotationType(
      type
    )"
  end

  DB = SQLite3::Database.new(options[:output_path])
  commands.each do |cmd|
    DB.execute(cmd)
  end
  # Import data process speed up configuration
  DB.execute("PRAGMA synchronous = OFF;")
  DB.execute("PRAGMA journal_mode = MEMORY;")

  # In-memory indexes for incremental updates: rows grouped by their first
  # data column (type name / annotation value), each row being [rowid, ...].
  annotation_type_index = DB.execute("SELECT rowid, * FROM AnnotationType").group_by { |r| r[1] }
  annotation_index = DB.execute("SELECT rowid, * FROM Annotation").group_by { |r| r[1] }

  all_data.each do |class_data, dataset|
    puts "#{class_data} import started"

    # Save and create AnnotationType data
    #------------------------------------------------------------------
    header = dataset.get_metadata(:header)
    annotation_type = header.map { |h| h.to_s }
    annotation_type.shift(4) # the first four header fields are not annotation types
    if new_database
      records = annotation_type
    else
      # Only insert the types that are not stored yet. The index holds nil for
      # unknown types, so the lookup itself must be nil-checked.
      records = annotation_type.select { |at| annotation_type_index[at].nil? }
    end

    DB.transaction do |db|
      db.prepare("INSERT INTO AnnotationType(type) VALUES(?)") do |smnt| # Precompile query to speed up the process
        records.each do |rec|
          smnt.execute(rec)
        end
      end
    end
    annotation_type_index = DB.execute("SELECT rowid, * FROM AnnotationType").group_by { |r| r[1] } if !records.empty?

    # Save and create Annotation data
    #------------------------------------------------------------------
    annotations = {}
    annotation_type.each do |at|
      annotations[at] = {}
    end
    if dataset.first.length > 4
      dataset.each_record do |record|
        # NOTE(review): annotations are read from index 5 here, while the
        # GenomicRange import below treats everything after the first four
        # fields as annotations - looks like a possible off-by-one; confirm
        # against the Dataset record layout before changing.
        record[5..record.length - 1].each_with_index do |annotation, i|
          annotations[annotation_type[i]][annotation] = nil
        end
      end
    end

    records = []
    annotations.each do |type_name, values|
      annotation_type_id = annotation_type_index[type_name].first.first
      if new_database
        new_values = values.keys
      else
        new_values = values.keys.select { |v| annotation_index[v].nil? }
      end
      records.concat(new_values.map { |v| [v, annotation_type_id] })
    end

    DB.transaction do |db|
      db.prepare("INSERT INTO Annotation(value, AnnotationTypeId) VALUES(?, ?)") do |smnt|
        records.each do |rec|
          smnt.execute(rec[0], rec[1])
        end
      end
    end
    annotation_index = DB.execute("SELECT rowid, * FROM Annotation").group_by { |r| r[1] } if !records.empty?

    # Save and create GenomicRange data
    #------------------------------------------------------------------
    DB.transaction do |db|
      db.prepare("INSERT INTO GenomicRange(bin, chr, start, end, type, name, AnnotationId) VALUES(?, ?, ?, ?, ?, ?, ?)") do |smnt|
        dataset.each_record do |record|
          region_data = record.shift(4) # chr, start, end, name; the rest are annotation values
          annotation_ids = []
          record.each do |annotation|
            id = annotation_index[annotation]
            annotation_ids << id.first.first if !id.nil?
          end
          smnt.execute(
            region_data[1] / 10000, # bin: start coordinate in 10 kb windows
            region_data[0],
            region_data[1],
            region_data[2],
            class_data,
            region_data[3],
            annotation_ids.join(',')
          )
        end
      end
    end
    puts "#{class_data} import finished"
  end

  # Index creation and close live inside the create_sql branch because DB is
  # only defined here; IF NOT EXISTS keeps incremental runs on an existing
  # database from failing on already-created indexes.
  DB.execute("CREATE INDEX IF NOT EXISTS name_index ON GenomicRange (name)")
  DB.execute("CREATE INDEX IF NOT EXISTS bin_index ON GenomicRange (bin)")
  DB.close
end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'benchmark'
|
4
|
+
|
5
|
+
#Description
|
6
|
+
#--------------
|
7
|
+
#Tool to unify data from ENCODE
|
8
|
+
|
9
|
+
#Methods
|
10
|
+
#--------------
|
11
|
+
|
12
|
+
# Loads a tab-separated metadata file into a hash.
#
# Each line holds a file name in the first column and a "; "-separated list
# of key=value features in the second one.
#
# @param file [String] path to the metadata file
# @return [Hash{String => Hash{String => String}}] file name (with '.gz'
#   removed) => its feature hash
def load_metadata_file(file)
  name_storage = {}
  # File.foreach closes the file even if parsing raises.
  File.foreach(file) do |line|
    fields = line.chomp.split("\t")
    next if fields[1].nil? # blank line or line without a feature column

    features_storage = {} # metadata hash
    fields[1].split("; ").each do |feature|
      met_name, metadata = feature.split("=")
      features_storage[met_name] = metadata
    end
    name_storage[fields[0].gsub('.gz', '')] = features_storage
  end
  name_storage
end
|
29
|
+
|
30
|
+
# Groups file names by the value of one metadata field (for example the
# antibody), so that redundant files can be merged per group.
#
# @param grouping_element [String] metadata key used for grouping
# @param name_storage [Hash] file name => metadata hash
# @return [Hash{String => Array<String>}] field value => file names; files
#   without the field are left out
def element_grouper(grouping_element, name_storage)
  name_storage.each_with_object({}) do |(file_name, metadata), groups|
    group_key = metadata[grouping_element]
    next if group_key.nil? # no value in this field: nothing to group by

    (groups[group_key] ||= []) << file_name
  end
end
|
44
|
+
|
45
|
+
# Loads the genomic regions of a tab-separated file (chr, start, end, ...).
#
# Regions are indexed by chromosome and by bin, where the bin is the start
# coordinate divided by 10000.
#
# @param file_name [String] path to the region file
# @return [Hash{String => Hash{Integer => Array<Array<Integer>>}}]
#   chr => bin => list of [start, end]
def load_files_to_compare(file_name)
  genomic_regions = {}
  # File.foreach closes the file when done (the handle used to be leaked).
  File.foreach(file_name) do |line|
    chr, *coords = line.chomp.split("\t")
    next if coords.length < 2 # blank or malformed line: no start/end pair

    region = coords[0..1].map { |c| c.to_i }
    bin = region.first / 10000
    ((genomic_regions[chr] ||= {})[bin] ||= []) << region
  end
  genomic_regions
end
|
67
|
+
|
68
|
+
# Selects the regions of genomic_regions_to_compare that match no region of
# main_genomic_regions stored under the same chromosome and bin.
#
# A region whose chromosome or bin is absent from the main set is always
# selected. Matching is delegated to compare_genomics_regions_coords and the
# result is filled in through save_reg.
#
# @param main_genomic_regions [Hash] chr => bin => list of regions
# @param genomic_regions_to_compare [Hash] same layout
# @param thresold_overlap [Numeric] threshold passed to the coordinate comparison
# @return [Hash] chr => bin => list of unmatched regions
def compare_genomics_regions(main_genomic_regions, genomic_regions_to_compare, thresold_overlap)
  selected_genomic_regions = {}
  genomic_regions_to_compare.each do |chr_reg, bins_to_compare|
    main_bins = main_genomic_regions[chr_reg]
    bins_to_compare.each do |bin, regs|
      main_regs = main_bins.nil? ? nil : main_bins[bin]
      unmatched =
        if main_regs.nil?
          regs # nothing to compare against in this chromosome/bin
        else
          regs.reject do |reg|
            main_regs.any? { |main| compare_genomics_regions_coords(main, reg, thresold_overlap) }
          end
        end
      unmatched.each { |reg| save_reg(selected_genomic_regions, chr_reg, bin, reg) }
    end
  end
  return selected_genomic_regions
end
|
97
|
+
|
98
|
+
# Appends a single region to selected_genomic_regions[chr_reg][bin], creating
# the chromosome hash and the bin list when they do not exist yet.
# The hash is modified in place.
def save_reg(selected_genomic_regions, chr_reg, bin, reg)
  bins_for_chr = selected_genomic_regions[chr_reg]
  return (selected_genomic_regions[chr_reg] = {bin => [reg]}) if bins_for_chr.nil?

  stored = bins_for_chr[bin]
  stored.nil? ? (bins_for_chr[bin] = [reg]) : (stored << reg)
end
|
111
|
+
|
112
|
+
# Merges a list of regions into selected_genomic_regions[chr_reg][bin].
# When the bin is new the given list itself is stored (not a copy); otherwise
# its elements are appended to the existing list. The hash is modified in
# place.
def save_reg_concat(selected_genomic_regions, chr_reg, bin, reg)
  bins_for_chr = selected_genomic_regions[chr_reg]
  return (selected_genomic_regions[chr_reg] = {bin => reg}) if bins_for_chr.nil?

  stored = bins_for_chr[bin]
  stored.nil? ? (bins_for_chr[bin] = reg) : stored.concat(reg)
end
|
125
|
+
|
126
|
+
# Tells whether two regions overlap enough to be considered the same.
#
# The overlap length is the intersection of both intervals. It is expressed
# relative to the size of each region, and the regions match when either
# relative overlap reaches the threshold.
#
# @param main_genomic_region [Array<Numeric>] [start, end] of the reference region
# @param genomic_region_to_compare [Array<Numeric>] [start, end] of the candidate region
# @param thresold_overlap [Numeric] minimum relative overlap (e.g. 0.8)
# @return [Boolean]
def compare_genomics_regions_coords(main_genomic_region, genomic_region_to_compare, thresold_overlap)
  main_beg, main_end = main_genomic_region
  reg_beg, reg_end = genomic_region_to_compare
  size_main_genomic_region = main_end - main_beg
  size_genomic_region_to_compare = reg_end - reg_beg

  # Intersection of the two intervals; negative when they are disjoint.
  absolute_overlap = [[main_end, reg_end].min - [main_beg, reg_beg].max, 0].max

  # Float division: with integer coordinates, integer division would truncate
  # every partial overlap to 0. Zero-length regions count as no overlap
  # instead of dividing by zero.
  relative_overlap = lambda do |size|
    size > 0 ? absolute_overlap.to_f / size : 0.0
  end
  main_relative_overlap = relative_overlap.call(size_main_genomic_region)
  compare_relative_overlap = relative_overlap.call(size_genomic_region_to_compare)

  main_relative_overlap >= thresold_overlap || compare_relative_overlap >= thresold_overlap
end
|
152
|
+
|
153
|
+
#Main
|
154
|
+
#--------------
|
155
|
+
# Arguments: metadata file, grouping field, output file, input folder.
if ARGV.length < 4
  abort "Usage: #{File.basename(__FILE__)} metadata_file grouping_field output_file input_folder"
end

file_input_folder = ARGV[3]
name_storage = load_metadata_file(ARGV[0])
package_grouping = element_grouper(ARGV[1], name_storage)

# Block form: the output file is closed even if a group fails to process.
File.open(ARGV[2], 'w') do |file_writer|
  package_grouping.each do |grouping_element, file_names|
    # Open the first file of the package; it seeds the reference regions.
    reference_file, *other_files = file_names
    genomic_regions_references = load_files_to_compare(File.join(file_input_folder, reference_file))

    # Add, file by file, the regions that do not match any reference region.
    other_files.each do |f_name|
      file2compare = load_files_to_compare(File.join(file_input_folder, f_name))
      selected_genomic_regions = compare_genomics_regions(genomic_regions_references, file2compare, 0.8)
      selected_genomic_regions.each do |chr, ge_regs|
        ge_regs.each do |bin, reg|
          save_reg_concat(genomic_regions_references, chr, bin, reg)
        end
      end
    end

    # One output line per unified region: chr, start, end, grouping element.
    genomic_regions_references.each do |chr, ge_regs|
      ge_regs.each do |_bin, reg|
        reg.each do |gr|
          file_writer.puts "#{chr}\t#{gr.join("\t")}\t#{grouping_element}"
        end
      end
    end
    puts "Wrote #{grouping_element}"
  end
end
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
|
data/bin/setup
ADDED
data/bin/statistics.rb
ADDED
@@ -0,0 +1,193 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'scbi_plot'
|
3
|
+
#METHODS
|
4
|
+
#----------
|
5
|
+
# Loads a tab-separated SNP annotation table.
#
# The first line is a header. On every line the first fields_length columns
# describe the SNP and the remaining ones hold the genomic factors (histone
# modifications, tfbs, ...), located by header name. A factor cell is a
# comma-separated list, or '-' when empty.
#
# @param input_file [String] path to the table
# @param fields_length [Integer] number of leading SNP columns
# @return [Hash{String => Hash{String => Array<String>}}] first SNP column =>
#   category name => values (empty list when absent)
def load_snp_data(input_file, fields_length)
  category_names = %w[
    HistoneModification tfbs dnaseData metilationData
    ConservedRegions Enhancers DENdbEnhancers SuperEnhancers
  ]
  snp_storage = {}
  index = {} # category name => column position among the factor columns

  # Block form closes the file when done (the handle used to be leaked).
  File.open(input_file) do |input|
    input.each_line.with_index do |line, line_number|
      fields = line.chomp.split("\t")
      snp_fields = fields.shift(fields_length)

      if line_number.zero?
        fields.each_with_index { |category, position| index[category] = position }
        next
      end
      next if snp_fields.empty? # blank line: would otherwise add a nil-named SNP

      categories = {}
      category_names.each do |category_name|
        values = []
        column_position = index[category_name]
        # Rows shorter than the header yield nil cells; treat them as empty.
        cell = column_position.nil? ? nil : fields[column_position]
        values.concat(cell.split(',')) unless cell.nil? || cell == '-'
        categories[category_name] = values
      end
      snp_storage[snp_fields[0]] = categories
    end
  end
  snp_storage
end
|
43
|
+
|
44
|
+
# Computes, for every annotation category, the percentage of SNPs that have
# at least one annotation in it.
#
# @param snp_storage [Hash] SNP name => category name => list of values
# @return [Hash{String => Float}] category name => percentage of SNPs
def snp_calculate_stats(snp_storage)
  annotated_counts = {
    "HistoneModification" => 0,
    "tfbs" => 0,
    "dnaseData" => 0,
    "metilationData" => 0,
    "ConservedRegions" => 0,
    "Enhancers" => 0,
    "DENdbEnhancers" => 0,
    "SuperEnhancers" => 0
  }
  snp_storage.each_value do |annotations|
    annotations.each do |category, values|
      next if values.empty?

      annotated_counts[category] += 1 # category must be one of the keys above
    end
  end

  snp_total = snp_storage.length.to_f
  annotated_counts.keys.each do |category|
    annotated_counts[category] = annotated_counts[category] / snp_total * 100
  end
  return annotated_counts
end
|
69
|
+
|
70
|
+
# Draws the category percentages with ScbiPlot::Histogram and prints the
# plotted keys and values to standard output.
#
# @param snp_percentage [Hash] category name => percentage
# @param name [String] first argument given to ScbiPlot::Histogram.new
def create_histogram(snp_percentage, name)
  histogram = ScbiPlot::Histogram.new(name, 'SNPs genomic region annotations')

  # x axis: category names
  category_names = snp_percentage.keys
  histogram.add_x(category_names)
  puts category_names.inspect

  # y axis: percentages
  percentages = snp_percentage.values
  histogram.add_y(percentages)
  puts percentages.inspect

  # generate graph
  histogram.do_graph
end
|
84
|
+
|
85
|
+
|
86
|
+
# Computes, for every annotation category, the percentage of reference SNPs
# whose annotations agree with the ones in snp_storage, as decided by
# annotation_comparison. Reference SNPs missing from snp_storage never count
# as agreeing but are still part of the total.
#
# @param snp_storage [Hash] SNP name => category name => list of values
# @param snp_storage_reference [Hash] same layout, used as reference
# @return [Hash{String => Float}] category name => percentage of reference SNPs
def snp_calculate_stats_with_reference(snp_storage, snp_storage_reference)
  agreement_counts = {
    "HistoneModification" => 0,
    "tfbs" => 0,
    "dnaseData" => 0,
    "metilationData" => 0,
    "ConservedRegions" => 0,
    "Enhancers" => 0,
    "DENdbEnhancers" => 0,
    "SuperEnhancers" => 0
  }

  snp_storage_reference.each do |snp_name, reference_annotations|
    own_annotations = snp_storage[snp_name]
    next if own_annotations.nil?

    reference_annotations.each do |category, reference_values|
      next unless annotation_comparison(reference_values, own_annotations[category], category)

      agreement_counts[category] += 1
    end
  end

  reference_total = snp_storage_reference.length.to_f
  agreement_counts.keys.each do |category|
    agreement_counts[category] = agreement_counts[category] / reference_total * 100
  end
  return agreement_counts
end
|
117
|
+
|
118
|
+
# Decides whether the annotations obtained for a SNP agree with the reference
# annotations for one category.
#
# Identical value sets (ignoring order and duplicates) always agree.
# Otherwise the rule depends on the category:
# * dnaseData, metilationData, ConservedRegions, Enhancers, DENdbEnhancers,
#   SuperEnhancers: any annotation at all counts as agreement.
# * tfbs: at least one shared value, or five or more distinct values.
# * HistoneModification: same as tfbs, after reducing every mark to its
#   "H<n>K<n>" prefix (e.g. "H3K4me3" -> "H3K4").
# Any other category only agrees on identical sets.
#
# The input arrays are not modified.
#
# @param annotation_value_ref [Array<String>] reference values
# @param annotation_value [Array<String>, nil] obtained values (nil is treated as empty)
# @param annotation_category_ref [String] category name
# @return [Boolean]
def annotation_comparison(annotation_value_ref, annotation_value, annotation_category_ref)
  # Work on deduplicated copies so the caller's arrays are left untouched.
  reference = Array(annotation_value_ref).uniq
  observed = Array(annotation_value).uniq
  return true if reference.sort == observed.sort

  shares_or_many = lambda do |ref_values, obs_values|
    !(ref_values & obs_values).empty? || obs_values.length >= 5
  end

  case annotation_category_ref
  when 'dnaseData', 'metilationData', 'ConservedRegions', 'Enhancers', 'DENdbEnhancers', 'SuperEnhancers'
    !observed.empty?
  when 'tfbs'
    shares_or_many.call(reference, observed)
  when 'HistoneModification'
    # Compare histone marks by residue only, ignoring the modification suffix.
    reduce_marks = lambda do |marks|
      marks.map { |mark| mark =~ /(H\d+K\d+)\w*/ ? Regexp.last_match(1) : mark }.uniq
    end
    shares_or_many.call(reduce_marks.call(reference), reduce_marks.call(observed))
  else
    false
  end
end
|
168
|
+
|
169
|
+
#MAIN
|
170
|
+
#----------
|
171
|
+
|
172
|
+
# REMEMBER: this program computes statistics and compares the results of two
# given files. In our case, the data produced by our program is compared with
# experimentally obtained data.
# our data = ARGV[0], experiment data = ARGV[1]
# When no second argument is given (or it is 'false'), the analysis is run on
# the program's own result only.
# ARGV[2] (optional) is the number of leading SNP columns, 5 by default.
# ARGV[3] is echoed as the last column of every output line.
fields_length = 5
fields_length = ARGV[2].to_i if !ARGV[2].nil?

snp_storage = load_snp_data(ARGV[0], fields_length)
if !ARGV[1].nil? && ARGV[1].downcase != 'false'
  # load_snp_data requires the column count; it used to be omitted here,
  # which raised ArgumentError whenever a reference file was given.
  # Assumes the reference file has the same number of leading SNP columns -
  # TODO confirm.
  snp_storage_reference = load_snp_data(ARGV[1], fields_length)
  snp_percentage = snp_calculate_stats_with_reference(snp_storage, snp_storage_reference)
else
  snp_percentage = snp_calculate_stats(snp_storage)
end
snp_percentage.each do |category_name, percentage|
  puts "#{category_name}\t#{percentage}\t#{ARGV[3]}"
end
|
189
|
+
|
190
|
+
# The plot file is written to the directory where the script is run
|
191
|
+
# file_name = File.basename(ARGV[0], ".txt")
|
192
|
+
# graph_name = file_name + ".png"
|
193
|
+
# create_histogram(snp_percentage, graph_name)
|
data/database/deleteme
ADDED
File without changes
|