anncrsnp 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +6 -0
- data/anncrsnp.gemspec +35 -0
- data/bin/console +14 -0
- data/bin/grdbfinder.rb +472 -0
- data/bin/grdbmanager.rb +226 -0
- data/bin/masterfeatures.rb +188 -0
- data/bin/setup +7 -0
- data/bin/statistics.rb +193 -0
- data/database/deleteme +0 -0
- data/lib/anncrsnp/dataset.rb +178 -0
- data/lib/anncrsnp/parsers/ucscparser.rb +35 -0
- data/lib/anncrsnp/version.rb +3 -0
- data/lib/anncrsnp.rb +5 -0
- metadata +144 -0
data/bin/grdbmanager.rb
ADDED
@@ -0,0 +1,226 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
ROOT_PATH = File.dirname(__FILE__)
|
4
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'anncrsnp'))
|
5
|
+
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'parsers'))
|
6
|
+
|
7
|
+
require 'optparse'
|
8
|
+
require 'ucscparser'
|
9
|
+
require 'dataset'
|
10
|
+
require 'sqlite3'
|
11
|
+
|
12
|
+
# Command-line options. Each default is set right before the flag that can
# override it, then the parser consumes ARGV in place.
options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{__FILE__} [options]"

  # Directory scanned for the source files to import (nil when -d is omitted).
  options[:data] = nil
  opts.on("-d", "--data_directory PATH", "Directory used to extract data") do |data|
    options[:data] = data
  end

  # The TRUE/FALSE constants are deprecated and no longer defined on current
  # Ruby versions, so the boolean literals are used instead.
  options[:create_sql] = false
  opts.on("-s", "--create_sql", "Create SQL DB") do
    options[:create_sql] = true
  end

  options[:output_path] = "genomic_data.sqlite"
  opts.on("-o", '--output_path PATH', 'Output path for DB') do |output_path|
    options[:output_path] = output_path
  end

  options[:verbose] = nil
  opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
    options[:verbose] = v
  end

end.parse!
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
# Load every recognised source file found in the data directory into a
# dataset. The hash key is the class name that the SQL import step later
# writes into GenomicRange.type.
all_data = {}
# -d is optional, so options[:data] may be nil; File.exist?(nil) would raise
# TypeError, hence the explicit guard.
if options[:data] && File.exist?(options[:data])
  Dir.glob(File.join(options[:data],'*.{txt,bed,csv}')).each do |file| # path of each candidate file in the directory
    current_file = File.basename(file)
    ### Definitive sources
    # If the UCSC bin field doesn't exist, pass false as third argument to parseUCSCformat
    case current_file
    when "wgEncodeAwgDnaseMasterSites.bed"
      header = [:score, :floatScore, :sourceCount, :sourceIds]
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.numeric_filter(:sourceCount, 2)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'DNAseHS')
      all_data['dnaseData'] = current_dataset
    when "wgEncodeHaibMethyl450Ag04449SitesRep1.bed"
      header = [:score, :strand, :thickStart, :thickEnd, :itemRgb]
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'Metilation_sites')
      all_data['metilationData'] = current_dataset
    when "snp144Common.txt" # current_file == "test.txt"
      header = [:score, :strand, :refNCBI, :refUCSC, :observed, :molType, :class, :valid, :avHet, :avHetSE, :func, :locType, :weight, :exceptions, :submitterCount, :submitters, :alleleFreqCount, :alleles, :alleleNs, :alleleFreqs, :bitfields]
      current_dataset = parseUCSCformat(file, header)
      # :class and :func are deliberately kept; every other extra column is dropped.
      current_dataset.drop_columns([:score, :strand, :refNCBI, :refUCSC, :observed, :molType, :valid, :avHet, :avHetSE, :locType, :weight, :exceptions, :submitterCount, :submitters, :alleleFreqCount, :alleles, :alleleNs, :alleleFreqs, :bitfields])
      current_dataset.add_metadata(:classification, 'SNP')
      all_data['snpDbSnp'] = current_dataset
    when "refGene.txt"
      header = [:name, :strand, :cdsStart, :cdsEnd, :exonCount, :exonStarts, :exonEnds, :score, :cdsStartStat, :cdsEndStat, :exonFrames]
      current_dataset = parseUCSCrefseqformat(file, header)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'gene')
      all_data['gene'] = current_dataset
    when "TFBSMasterSites.txt" #Must be generated with "masterfeatures.rb tfbs/files.txt antibody import_data/TFBSMasterSites.txt tfbs/"
      header = []
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.add_metadata(:classification, 'TFBS')
      all_data['tfbs'] = current_dataset
    when "HistoneModMasterSites.txt" #Must be generated with "masterfeatures.rb tfbs/files.txt antibody import_data/TFBSMasterSites.txt tfbs/"
      header = []
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.add_metadata(:classification, 'HistoneModification')
      all_data['HistoneModification'] = current_dataset
    when "46waycons.txt"
      header = [:span, :count, :offset, :file, :lowerLimit, :dataRange, :validCount, :sumData, :sumSquares]
      current_dataset = parseUCSCformat(file, header)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'ConservedRegions')
      all_data['ConservedRegions'] = current_dataset
    when "enhancer_tss_associations.bed"
      header = [:score, :strand, :enh_start, :enh_stop, :array, :index, :val1, :val2]
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'Enhancers')
      all_data['Enhancers'] = current_dataset
    when "enhancers.csv"
      header = [:cell_line, :index1, :index2, :index3, :index4, :index5, :index6, :index7]
      current_dataset = parseDENdbCSVformat(file, header)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'DENdbEnhancers')
      all_data['DENdbEnhancers'] = current_dataset
    when "all_hg19_bed.bed"
      header = [:counter]
      current_dataset = parseUCSCformat(file, header, false)
      current_dataset.drop_columns(header)
      current_dataset.add_metadata(:classification, 'SuperEnhancers')
      all_data['SuperEnhancers'] = current_dataset
    end
  end
end
|
108
|
+
|
109
|
+
# Build or incrementally update the SQLite database from the loaded datasets.
# All database work, including index creation and closing the handle, happens
# inside this branch: DB is only defined when -s was given.
if options[:create_sql]
  # Schema statements are queued only when the database file does not exist yet.
  commands = []
  if !File.exist?(options[:output_path]) # File.exists? was removed in Ruby 3.2
    commands << "CREATE TABLE GenomicRange(
      bin,
      chr,
      start,
      end,
      type,
      name,
      AnnotationId
    )"
    commands << "CREATE TABLE Annotation(
      value,
      AnnotationTypeId
    )"
    commands << "CREATE TABLE AnnotationType(
      type
    )"
    #File.delete(options[:output_path])
  end

  DB = SQLite3::Database.new( options[:output_path] )
  commands.each do |cmd|
    DB.execute(cmd)
  end
  # Import data process speed up configuration
  DB.execute("PRAGMA synchronous = OFF;")
  DB.execute("PRAGMA journal_mode = MEMORY;")

  # In-memory indexes (second selected column => rows) for incremental updates
  annotation_type_index = DB.execute("SELECT rowid, * FROM AnnotationType").group_by {|r| r[1]}
  annotation_index = DB.execute("SELECT rowid, * FROM Annotation").group_by {|r| r[1]}

  all_data.each do |class_data, dataset|
    puts "#{class_data} import started"
    # Save and create AnnotationType data
    #------------------------------------------------------------------
    header = dataset.get_metadata(:header)
    annotation_type = header.map{|h| h.to_s}
    annotation_type.shift(4) # the first four header fields are not annotation columns
    # Insert only types not indexed yet. On a fresh database the index starts
    # empty, so the first dataset inserts everything; a type already stored by
    # an earlier dataset or run is not duplicated. A missing key yields nil
    # from the group_by hash, so the lookup must not call .first on it.
    records = annotation_type.reject{|at| annotation_type_index.key?(at)}

    DB.transaction do |db|
      db.prepare("INSERT INTO AnnotationType(type) VALUES(?)") do |smnt| # Precompile query for speed up process
        records.each do |rec|
          smnt.execute(rec)
        end
      end
    end
    annotation_type_index = DB.execute("SELECT rowid, * FROM AnnotationType").group_by {|r| r[1]} if !records.empty?

    # Save and create Annotation data
    #------------------------------------------------------------------
    annotations = {}
    annotation_type.each do |at|
      annotations[at] = {}
    end
    if dataset.first.length > 4
      dataset.each_record do |record|
        # NOTE(review): this slice starts at index 5 while the header and the
        # GenomicRange step below both skip 4 leading fields; confirm against
        # the record layout in dataset.rb. Left unchanged here.
        record[5..record.length - 1].each_with_index do |annotation, i|
          annotations[annotation_type[i]][annotation] = nil # hash used as a set of distinct values
        end
      end
    end

    records = []
    annotations.each do |type_name, values|
      annotation_type_id = annotation_type_index[type_name].first.first # rowid of the first matching type row
      new_values = values.keys.reject{|v| annotation_index.key?(v)}
      records.concat(new_values.map{|v| [v, annotation_type_id]})
    end

    DB.transaction do |db|
      db.prepare("INSERT INTO Annotation(value, AnnotationTypeId) VALUES(?, ?)") do |smnt|
        records.each do |rec|
          smnt.execute(rec[0], rec[1])
        end
      end
    end
    annotation_index = DB.execute("SELECT rowid, * FROM Annotation").group_by {|r| r[1]} if !records.empty?

    # Save and create GenomicRange data
    #------------------------------------------------------------------
    DB.transaction do |db|
      db.prepare("INSERT INTO GenomicRange(bin, chr, start, end, type, name, AnnotationId) VALUES(?, ?, ?, ?, ?, ?, ?)") do |smnt|
        dataset.each_record do |record|
          region_data = record.shift(4) # removes the four leading fields from the record; the rest are annotations
          annotation_ids = []
          record.each do |annotation|
            id = annotation_index[annotation]
            annotation_ids << id.first.first if !id.nil?
          end
          smnt.execute(
            region_data[1]/10000, # bin: second field divided by 10000
            region_data[0],
            region_data[1],
            region_data[2],
            class_data,
            region_data[3],
            annotation_ids.join(',')
          )
        end
      end
    end
    puts "#{class_data} import finished"
  end

  # IF NOT EXISTS keeps a second run against an existing database from
  # failing on an already-created index.
  DB.execute("CREATE INDEX IF NOT EXISTS name_index ON GenomicRange (name)")
  DB.execute("CREATE INDEX IF NOT EXISTS bin_index ON GenomicRange (bin)")
  DB.close
end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'benchmark'
|
4
|
+
|
5
|
+
#Description
|
6
|
+
#--------------
|
7
|
+
#Tool to unify data from ENCODE
|
8
|
+
|
9
|
+
#Methods
|
10
|
+
#--------------
|
11
|
+
|
12
|
+
# Parse a tab-separated metadata index.
#
# Each line is "<file name>\t<key>=<value>; <key>=<value>; ...".
#
# @param file [String] path of the metadata file
# @return [Hash{String => Hash{String => String}}] file name (with every
#   '.gz' occurrence removed) mapped to its key/value metadata
def load_metadata_file(file)
  name_storage = {}
  # File.foreach closes the handle even when parsing raises.
  File.foreach(file) do |line|
    file_name, raw_features = line.chomp.split("\t")
    # Blank lines and lines without a metadata column carry nothing to parse.
    next if file_name.nil? || raw_features.nil?
    features_storage = {} # metadata hash
    raw_features.split("; ").each do |feature|
      met_name, metadata = feature.split("=")
      features_storage[met_name] = metadata
    end
    name_storage[file_name.gsub('.gz', '')] = features_storage
  end
  return name_storage
end
|
29
|
+
|
30
|
+
# Group file names by the value of one metadata field.
#
# @param grouping_element [String] metadata key to group on
# @param name_storage [Hash] file name => metadata hash
# @return [Hash] metadata value => array of file names, in input order;
#   files whose metadata lacks the key are left out
def element_grouper(grouping_element, name_storage)
  name_storage.each_with_object({}) do |(file_name, metadata), groups|
    group_key = metadata[grouping_element]
    next if group_key.nil? # no value for this field: file is not grouped
    (groups[group_key] ||= []) << file_name
  end
end
|
44
|
+
|
45
|
+
# Load a tab-separated region file (chromosome, start, end, ...) into a
# two-level lookup.
#
# @param file_name [String] path of the region file
# @return [Hash] chromosome => { bin => [[start, end], ...] }, where bin is
#   start / 10000 (integer division); columns after the third are ignored
def load_files_to_compare(file_name)
  genomic_regions = {}
  # File.foreach closes the handle when done; the previous File.open(...).each
  # never closed it.
  File.foreach(file_name) do |line|
    chr, *rest = line.chomp.split("\t")
    # Skip blank or truncated lines instead of failing on nil arithmetic.
    next if chr.nil? || rest.length < 2
    coords = rest[0..1].map { |c| c.to_i }
    bin = coords.first / 10000
    ((genomic_regions[chr] ||= {})[bin] ||= []) << coords
  end
  return genomic_regions
end
|
67
|
+
|
68
|
+
# Collect the regions of genomic_regions_to_compare that match no reference
# region.
#
# A region is only compared against reference regions stored under the same
# chromosome and the same bin; when the reference has no entry for that
# chromosome/bin, every region of the bin is collected.
#
# @param main_genomic_regions [Hash] reference: chr => { bin => [[beg, end], ...] }
# @param genomic_regions_to_compare [Hash] candidates, same layout
# @param thresold_overlap [Numeric] threshold handed to compare_genomics_regions_coords
# @return [Hash] unmatched candidate regions, same layout
def compare_genomics_regions(main_genomic_regions, genomic_regions_to_compare, thresold_overlap)
  unmatched = {}
  genomic_regions_to_compare.each do |chr, candidate_bins|
    reference_bins = main_genomic_regions[chr]
    candidate_bins.each do |bin, candidates|
      references = reference_bins.nil? ? nil : reference_bins[bin]
      candidates.each do |candidate|
        known = !references.nil? && references.any? do |reference|
          compare_genomics_regions_coords(reference, candidate, thresold_overlap)
        end
        save_reg(unmatched, chr, bin, candidate) unless known
      end
    end
  end
  unmatched
end
|
97
|
+
|
98
|
+
# Append one region to the chr => { bin => [regions] } structure, creating
# the chromosome and bin levels on demand. Mutates selected_genomic_regions.
#
# @param selected_genomic_regions [Hash] structure to update in place
# @param chr_reg [Object] chromosome key
# @param bin [Object] bin key
# @param reg [Object] region to append
def save_reg(selected_genomic_regions, chr_reg, bin, reg)
  bins = selected_genomic_regions[chr_reg]
  # First region seen for this chromosome.
  return selected_genomic_regions[chr_reg] = { bin => [reg] } if bins.nil?

  stored = bins[bin]
  stored.nil? ? (bins[bin] = [reg]) : (stored << reg)
end
|
111
|
+
|
112
|
+
# Merge a whole list of regions into the chr => { bin => [regions] }
# structure. Unlike save_reg, reg is already an array of regions: it is
# stored as-is (same object) for a new bin, or concatenated onto the existing
# list. Mutates selected_genomic_regions.
#
# @param selected_genomic_regions [Hash] structure to update in place
# @param chr_reg [Object] chromosome key
# @param bin [Object] bin key
# @param reg [Array] regions to merge
def save_reg_concat(selected_genomic_regions, chr_reg, bin, reg)
  bins = selected_genomic_regions[chr_reg]
  # First regions seen for this chromosome.
  return selected_genomic_regions[chr_reg] = { bin => reg } if bins.nil?

  stored = bins[bin]
  stored.nil? ? (bins[bin] = reg) : stored.concat(reg)
end
|
125
|
+
|
126
|
+
# Decide whether two regions overlap enough to be considered the same.
#
# The overlap length is the size of the intersection of both intervals. It is
# expressed as a fraction of each region's own size, and the regions match
# when either fraction reaches the threshold.
#
# Fixes over the previous version: the ratios used integer division (so they
# were truncated to 0 or 1 and partial overlaps never matched), the overlap
# length was over-estimated when one region was contained in the other, and
# a zero-length region raised ZeroDivisionError.
#
# @param main_genomic_region [Array(Integer, Integer)] reference [beg, end]
# @param genomic_region_to_compare [Array(Integer, Integer)] candidate [beg, end]
# @param thresold_overlap [Numeric] minimum overlap fraction (e.g. 0.8)
# @return [Boolean] true when the regions match
def compare_genomics_regions_coords(main_genomic_region, genomic_region_to_compare, thresold_overlap)
  main_beg, main_end = main_genomic_region
  reg_beg, reg_end = genomic_region_to_compare
  size_main_genomic_region = main_end - main_beg
  size_genomic_region_to_compare = reg_end - reg_beg

  # Intersection length; negative means the intervals are disjoint.
  absolute_overlap = [[main_end, reg_end].min - [main_beg, reg_beg].max, 0].max

  # A zero-length region cannot be covered by any fraction: ratio is 0.0.
  main_relative_overlap =
    size_main_genomic_region > 0 ? absolute_overlap.to_f / size_main_genomic_region : 0.0
  compare_relative_overlap =
    size_genomic_region_to_compare > 0 ? absolute_overlap.to_f / size_genomic_region_to_compare : 0.0

  main_relative_overlap >= thresold_overlap || compare_relative_overlap >= thresold_overlap
end
|
152
|
+
|
153
|
+
#Main
|
154
|
+
#--------------
|
155
|
+
# Arguments: metadata file, metadata field to group on, output file, folder
# holding the region files named in the metadata file.
if ARGV.length < 4
  abort("Usage: #{File.basename(__FILE__)} metadata_file grouping_field output_file input_folder")
end

file_input_folder = ARGV[3]
name_storage = load_metadata_file(ARGV[0])
package_grouping = element_grouper(ARGV[1], name_storage)

# Block form guarantees the output file is closed even if a group fails.
File.open(ARGV[2],'w') do |file_writer|
  package_grouping.each do |grouping_element, file_names|
    # Open the first file of the group; it seeds the reference region set.
    genomic_regions_references = load_files_to_compare(File.join(file_input_folder, file_names.shift))
    file_names.each do |f_name|
      file2compare = load_files_to_compare(File.join(file_input_folder, f_name))
      # Regions of this file that match nothing in the reference set so far.
      selected_genomic_regions = compare_genomics_regions(genomic_regions_references, file2compare, 0.8)
      #puts Benchmark.measure{selected_genomic_regions = compare_genomics_regions(genomic_regions_references, file2compare, 0.8)}
      selected_genomic_regions.each do |chr, ge_regs|
        ge_regs.each do |bin, reg|
          save_reg_concat(genomic_regions_references, chr, bin, reg)
        end
      end
    end
    # One output line per accumulated region: chr, start, end, group value.
    genomic_regions_references.each do |chr, ge_regs|
      ge_regs.each do |bin, reg|
        reg.each do |gr|
          file_writer.puts "#{chr}\t#{gr.join("\t")}\t#{grouping_element}"
        end
      end
    end
    puts "Wrote #{grouping_element}"
  end
end
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
|
data/bin/setup
ADDED
data/bin/statistics.rb
ADDED
@@ -0,0 +1,193 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'scbi_plot'
|
3
|
+
#METHODS
|
4
|
+
#----------
|
5
|
+
# Load a tab-separated SNP annotation table.
#
# The first line is a header. On every line the first fields_length columns
# describe the SNP itself; the remaining columns hold genomic factors
# (histone modifications, TFBS, ...). A cell is either '-' (no annotation) or
# a comma-separated list of values.
#
# @param input_file [String] path of the table
# @param fields_length [Integer] number of leading SNP columns (default 5,
#   the value the script uses when no third argument is given)
# @return [Hash{String => Hash{String => Array<String>}}] first SNP column =>
#   { category name => annotation values }; a category that is not in the
#   header, or whose cell is missing or '-', maps to []
def load_snp_data(input_file, fields_length = 5)
  category_names = [
    "HistoneModification",
    "tfbs",
    "dnaseData",
    "metilationData",
    "ConservedRegions",
    "Enhancers",
    "DENdbEnhancers",
    "SuperEnhancers"
  ]
  snp_storage = {}
  index = {} # header name => position among the non-SNP columns
  # Block form closes the file; the previous File.open(...).each never did.
  File.open(input_file) do |handle|
    handle.each_line.with_index do |line, line_number|
      fields = line.chomp.split("\t")
      snp_fields = fields.shift(fields_length) # fields now holds only the genomic factor columns
      if line_number.zero?
        fields.each_with_index { |category, position| index[category] = position }
        next
      end
      next if snp_fields.empty? # blank line

      categories = {}
      category_names.each do |category_name|
        values = []
        column_position = index[category_name]
        cell = column_position.nil? ? nil : fields[column_position]
        # A short row leaves cell nil; treat it like '-' instead of crashing.
        values.concat(cell.split(',')) unless cell.nil? || cell == '-'
        categories[category_name] = values
      end
      snp_storage[snp_fields[0]] = categories
    end
  end
  return snp_storage
end
|
43
|
+
|
44
|
+
# Percentage of SNPs carrying at least one annotation, per category.
#
# @param snp_storage [Hash] SNP name => { category => array of values }
# @return [Hash{String => Float}] category => percentage over all SNPs in
#   snp_storage, for the eight known categories
def snp_calculate_stats(snp_storage)
  annotated = {}
  [
    "HistoneModification", "tfbs", "dnaseData", "metilationData",
    "ConservedRegions", "Enhancers", "DENdbEnhancers", "SuperEnhancers"
  ].each { |category| annotated[category] = 0 }

  # Count, per category, the SNPs with a non-empty value list.
  snp_storage.each_value do |annotations|
    annotations.each do |category, values|
      annotated[category] += 1 unless values.empty?
    end
  end

  snp_count = snp_storage.length.to_f
  annotated.keys.each do |category|
    annotated[category] = annotated[category] / snp_count * 100
  end
  annotated
end
|
69
|
+
|
70
|
+
# Plot the per-category percentages as a ScbiPlot histogram and echo the
# plotted keys and values to stdout.
#
# @param snp_percentage [Hash] category => percentage
# @param name [String] first argument of ScbiPlot::Histogram.new
#   (presumably the output file name — confirm against scbi_plot)
def create_histogram(snp_percentage, name)
  categories = snp_percentage.keys
  percentages = snp_percentage.values

  histogram = ScbiPlot::Histogram.new(name,'SNPs genomic region annotations')

  # x axis: category names
  histogram.add_x(categories)
  puts categories.inspect

  # y axis: percentages
  histogram.add_y(percentages)
  puts percentages.inspect

  # render the graph
  histogram.do_graph
end
|
84
|
+
|
85
|
+
|
86
|
+
# Percentage of reference SNPs whose annotations agree with ours, per category.
#
# Only SNPs present in both inputs are compared (via annotation_comparison),
# but the percentage is taken over every SNP in the reference.
#
# @param snp_storage [Hash] our results: SNP name => { category => values }
# @param snp_storage_reference [Hash] reference data, same layout
# @return [Hash{String => Float}] category => percentage of agreeing SNPs
def snp_calculate_stats_with_reference(snp_storage, snp_storage_reference)
  agreements = {}
  [
    "HistoneModification", "tfbs", "dnaseData", "metilationData",
    "ConservedRegions", "Enhancers", "DENdbEnhancers", "SuperEnhancers"
  ].each { |category| agreements[category] = 0 }

  snp_storage_reference.each do |snp_name, reference_annotations|
    own_annotations = snp_storage[snp_name]
    next if own_annotations.nil? # SNP absent from our results

    reference_annotations.each do |category, reference_values|
      own_values = own_annotations[category]
      agreements[category] += 1 if annotation_comparison(reference_values, own_values, category)
    end
  end

  reference_count = snp_storage_reference.length.to_f
  agreements.keys.each do |category|
    agreements[category] = agreements[category] / reference_count * 100
  end
  agreements
end
|
117
|
+
|
118
|
+
# Decide whether our annotation values agree with the reference values for
# one category. Both input arrays are de-duplicated in place.
#
# Rules, checked in order:
# * identical value sets always agree;
# * dnaseData, metilationData, ConservedRegions, Enhancers, DENdbEnhancers and
#   SuperEnhancers agree as soon as we have any value;
# * tfbs agrees when the sets share a value or we have at least 5 values;
# * HistoneModification first reduces values matching H<n>K<n>... to the
#   H<n>K<n> prefix, then applies the tfbs rule;
# * any other category disagrees.
#
# @param annotation_value_ref [Array<String>] reference values (mutated by uniq!)
# @param annotation_value [Array<String>] our values (mutated by uniq!)
# @param annotation_category_ref [String] category name
# @return [Boolean]
def annotation_comparison(annotation_value_ref, annotation_value, annotation_category_ref)
  annotation_value_ref.uniq!
  annotation_value.uniq!
  #puts "#{annotation_value_ref.inspect} => #{annotation_value}" if annotation_category_ref == 'dnaseData'
  return true if annotation_value_ref.sort == annotation_value.sort

  case annotation_category_ref
  when 'dnaseData', 'metilationData', 'ConservedRegions', 'Enhancers', 'DENdbEnhancers', 'SuperEnhancers'
    !annotation_value.empty?
  when 'tfbs'
    !(annotation_value_ref & annotation_value).empty? || annotation_value.length >= 5
  when 'HistoneModification'
    # Keep only the histone mark prefix when the value has one.
    to_mark = lambda { |an| an[/(H\d+K\d+)\w*/, 1] || an }
    reference_marks = annotation_value_ref.map(&to_mark).uniq
    own_marks = annotation_value.map(&to_mark).uniq
    !(reference_marks & own_marks).empty? || own_marks.length >= 5
  else
    false
  end
end
|
168
|
+
|
169
|
+
#MAIN
|
170
|
+
#----------
|
171
|
+
|
172
|
+
# REMEMBER: this program runs statistical analyses and compares the results of two given files.
# In our case, we compare the data produced by our program with experimentally obtained data.
# our data = ARGV[0], experimental data = ARGV[1]
# If no second argument is given (or it is 'false'), the analysis is run on the program's own result.
# ARGV[2] optionally overrides the number of leading SNP columns; ARGV[3] is echoed as a third output column.
fields_length = 5
fields_length = ARGV[2].to_i if !ARGV[2].nil?

snp_storage = load_snp_data(ARGV[0], fields_length)
if !ARGV[1].nil? && ARGV[1].downcase != 'false'
  # load_snp_data takes fields_length as its second parameter; the reference
  # file was previously loaded without it, which raised ArgumentError.
  snp_storage_reference = load_snp_data(ARGV[1], fields_length)
  snp_percentage = snp_calculate_stats_with_reference(snp_storage, snp_storage_reference)
else
  snp_percentage = snp_calculate_stats(snp_storage)
end
snp_percentage.each do |category_name, percentage|
  puts "#{category_name}\t#{percentage}\t#{ARGV[3]}"
end
|
189
|
+
|
190
|
+
#The plot file will be written to the directory the script is run from
|
191
|
+
# file_name = File.basename(ARGV[0], ".txt")
|
192
|
+
# graph_name = file_name + ".png"
|
193
|
+
# create_histogram(snp_percentage, graph_name)
|
data/database/deleteme
ADDED
File without changes
|