anncrsnp 0.1.6 → 0.1.7
- checksums.yaml +4 -4
- data/bin/grdbfinder.rb +38 -10
- data/bin/retriever.rb +147 -0
- data/lib/anncrsnp/file_parser.rb +53 -0
- data/lib/anncrsnp/file_parsers/wigfix_parser.rb +65 -0
- data/lib/anncrsnp/position_selection_manager.rb +214 -0
- data/lib/anncrsnp/position_selection_worker.rb +140 -0
- data/lib/anncrsnp/preprocessing_manager.rb +87 -0
- data/lib/anncrsnp/preprocessing_worker.rb +139 -0
- data/lib/anncrsnp/version.rb +1 -1
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ee18ae125e8e7b9738d3dd493ee66d57a374f885
+  data.tar.gz: 27e49ea014fbbdfb2aba568847c3214d24ee621e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd254d0ed92720ce4a6bc1909e2bff86a845a3714549460e29f4f290bed7376e6d9b872aea58bc8093724dc0545fa3403213a43d3fe269130477cf6cb8ced556
+  data.tar.gz: 1269b73c8b8f6147940428a27ad5d54ec06b6ea8381fe0894ace30854e14b851cf893778cb4310630db56c21f7fb704ecfc1889ffa9624a315b454e4f8911d0d

data/bin/grdbfinder.rb
CHANGED
@@ -347,16 +347,44 @@ def download_database(database_path)
   out_path = File.dirname(database_path)
   puts "Downloading database in #{out_path}, please be patient..."
   zip_path = File.join(out_path, 'database.zip')
-
-
-
-
-
-
-
-
-
-
+  # Code from https://www.ruby-forum.com/topic/4413829
+  target = "http://bio-267-data.uma.es/database.zip"
+
+  bytes_total = nil
+
+  open(target, "rb",
+    :content_length_proc => lambda{|content_length|
+      bytes_total = content_length},
+    :progress_proc => lambda{|bytes_transferred|
+      if bytes_total
+        # Print progress
+        print("\r#{bytes_transferred}/#{bytes_total}")
+      else
+        # We don't know how much we get, so just print the number
+        # of transferred bytes
+        print("\r#{bytes_transferred} (total size unknown)")
+      end
+    }) do |page|
+    # Now the real operation
+    File.open(zip_path, "wb") do |file|
+      # The file may not fit into RAM entirely, so copy it
+      # chunk by chunk.
+      while chunk = page.read(1024)
+        file.write(chunk)
+      end
+    end
+  end
+
+  if File.exists?(zip_path)
+    puts "\nDecompressing database..."
+    Zip::File.open(zip_path) do |zip_file|
+      zip_file.each do |entry|
+        entry.extract(database_path)
+      end
+    end
+  else
+    puts "ERROR: #{zip_path} was not found"
+    Process.exit
   end
   if File.exists?(database_path)
     File.delete(zip_path)
data/bin/retriever.rb
ADDED
@@ -0,0 +1,147 @@
+#! /usr/bin/env ruby
+ROOT_PATH = File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'anncrsnp'))
+
+
+require 'optparse'
+require 'scbi_mapreduce'
+require 'preprocessing_manager'
+require 'position_selection_manager'
+
+#####################################################################
+### OPTPARSE
+#####################################################################
+
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
+
+  ### PARALLELISATION OPTIONS
+  #####################################################################
+  options[:server_ip] = '0.0.0.0'
+  opts.on( '-s', '--server IP', 'Server IP. You can use a partial IP to select the appropriate interface' ) do |server_ip|
+    options[:server_ip] = server_ip
+  end
+
+  # server port
+  options[:port] = 0 # any free port
+  opts.on( '-p', '--port PORT', 'Server port. If set to 0, an arbitrary free port will be used') do |port|
+    options[:port] = port.to_i
+  end
+
+  # set number of workers. You can also provide an array with worker names.
+  # Those worker names can be read from a file produced by the existing
+  # queue system, if any.
+  options[:workers] = 2
+  opts.on( '-w', '--workers COUNT', 'Number of workers, or file containing machine names to launch workers with ssh' ) do |workers|
+    if File.exists?(workers)
+      # use workers file
+      options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
+    else
+      begin
+        options[:workers] = Integer(workers)
+      rescue
+        STDERR.puts "ERROR: Invalid workers parameter #{options[:workers]}"
+        exit
+      end
+    end
+  end
+
+  # chunk size
+  options[:chunk_size] = 1
+  opts.on( '-g', '--group_size chunk_size', 'Group sequences in chunks of size <chunk_size>' ) do |cs|
+    options[:chunk_size] = cs.to_i
+  end
+
+  ### EXECUTION OPTIONS
+  #####################################################################
+  options[:index_size] = 1000000
+  opts.on( '-x', '--index_size INTEGER', 'Size of genomic features data packs' ) do |is|
+    options[:index_size] = is.to_i
+  end
+
+  options[:file] = nil
+  opts.on("-f", "--file-links PATH", "Input file with links to retrieve data") do |f|
+    options[:file] = f
+  end
+
+  options[:output] = 'data'
+  opts.on("-o", "--output PATH", "Output folder path") do |f|
+    options[:output] = f
+  end
+
+  options[:downloaded_only] = FALSE
+  opts.on("--download_only", "Only download genomic feature files, do not process them") do
+    options[:downloaded_only] = TRUE
+  end
+
+  options[:no_auc] = FALSE
+  opts.on("--no_auc", "Do not calculate the AUC for each genomic feature") do
+    options[:no_auc] = TRUE
+  end
+
+  options[:selected_positions] = nil
+  opts.on("--selected_positions PATH", "Tabular file with chromosome (as chrN) and base-1 coordinates. Optionally, a third field can be added with 0/1 values for positive/negative groups") do |selected|
+    options[:selected_positions] = selected
+  end
+
+end.parse!
+
+#####################################################################
+### MAIN
+#####################################################################
+
+# GENERAL FOLDER
+Dir.mkdir(options[:output]) if !Dir.exist?(options[:output])
+
+# MAPREDUCE LAUNCHING
+##########################################################
+$LOG = Logger.new(STDOUT)
+$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
+# Genomic feature data downloading and preprocessing
+#-----------------------------------------------------------------------------
+if !options[:file].nil?
+  if File.exists?(options[:file])
+    temp = File.join(options[:output], 'temp')
+    options[:temp] = temp
+    Dir.mkdir(temp) if !Dir.exist?(temp)
+    preprocessed_data = File.join(options[:output], 'preprocessed_data')
+    options[:preprocessed_data] = preprocessed_data
+    Dir.mkdir(preprocessed_data) if !Dir.exist?(preprocessed_data)
+
+    $LOG.info 'Starting PREPROCESSING server'
+    custom_worker_file = File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'preprocessing_worker.rb')
+    PreprocessingManager.init_work_manager(options)
+
+    mgr = ScbiMapreduce::Manager.new( options[:server_ip], options[:port], options[:workers], PreprocessingManager, custom_worker_file, STDOUT) # launch processor server
+    mgr.chunk_size = options[:chunk_size]
+    mgr.start_server # start processing
+    $LOG.info 'Closing PREPROCESSING server'
+  else
+    puts "Links file does not exist:\n#{options[:file]}"
+    Process.exit()
+  end
+end
+
+# Genomic feature data position selection
+#-----------------------------------------------------------------------------
+if !options[:selected_positions].nil?
+  if File.exist?(options[:selected_positions])
+    selected_positions_folder = File.join(options[:output], 'selected_positions')
+    options[:selected_positions_folder] = selected_positions_folder
+    Dir.mkdir(selected_positions_folder) if !Dir.exist?(selected_positions_folder)
+
+    $LOG.info 'Starting POSITION_SELECTION server'
+    custom_worker_file = File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'position_selection_worker.rb')
+    PositionSelectionManager.init_work_manager(options)
+
+    mgr = ScbiMapreduce::Manager.new( options[:server_ip], options[:port], options[:workers], PositionSelectionManager, custom_worker_file, STDOUT) # launch processor server
+    mgr.chunk_size = options[:chunk_size]
+    mgr.start_server # start processing
+    $LOG.info 'Closing POSITION_SELECTION server'
+  else
+    puts "File with selected positions does not exist:\n#{options[:selected_positions]}"
+    Process.exit()
+  end
+end
+
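
The script is driven entirely by the options above; a hypothetical invocation (links.txt, positions.txt, and the worker count are made up for illustration):

    # Run from Ruby; equivalent to typing the command in a shell.
    system('ruby', 'bin/retriever.rb',
           '-f', 'links.txt',                        # download links, one per line
           '-o', 'data',                             # output folder
           '-w', '4',                                # four local workers
           '-x', '1000000',                          # pack size in bases
           '--selected_positions', 'positions.txt')  # chr/position/group table
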
data/lib/anncrsnp/file_parser.rb
ADDED
@@ -0,0 +1,53 @@
+require 'yajl'
+class FileParser
+  @@parsers = {}
+  def self.get_descendants
+    return ObjectSpace.each_object(Class).select { |klass| klass < self }
+  end
+
+  def self.load
+    path_parsers = File.join(File.dirname(__FILE__), 'file_parsers')
+    Dir.glob(path_parsers+'/*').each do |parser|
+      require parser
+    end
+    get_descendants.each do |descendant|
+      @@parsers[descendant.format] = descendant if descendant.available?
+    end
+  end
+
+  def self.select(format)
+    return @@parsers[format]
+  end
+
+  ########################################################################################
+  ## PARSER-DEPENDENT METHODS
+  ########################################################################################
+  def self.available?
+    return FALSE
+  end
+
+  def self.format
+    return 'master'
+  end
+
+  def initialize(folder, chunk_size)
+    @folder = folder
+    @chunk_size = chunk_size
+    @chrom = nil
+    @coords = []
+    @packs = 0
+  end
+
+  def parse(line)
+
+  end
+
+  def write_compressed_data
+    p = @packs * @chunk_size
+    gz_path = File.join(@folder, "#{@chrom}_#{p}.gz")
+    Zlib::GzipWriter.open(gz_path) do |writer|
+      Yajl::Encoder.encode(@coords, writer)
+    end
+    @packs += 1
+  end
+end
data/lib/anncrsnp/file_parsers/wigfix_parser.rb
ADDED
@@ -0,0 +1,65 @@
+class WigfixParser < FileParser
+  def initialize(folder, chunk_size)
+    super
+    @start = 1
+    @step = 1
+  end
+
+  def self.available?
+    return TRUE
+  end
+
+  def self.format
+    return 'wigfix'
+  end
+
+  def parse(line)
+    # fixedStep chrom=chr11 start=60001 step=1
+    if line.include?('fixedStep')
+      line =~ /fixedStep chrom=(\S+) start=(\d+) step=(\d+)/
+      if !@chrom.nil? && @chrom != $1 # We changed chromosome, so write the buffered coordinates
+        #puts "=> #{@packs}\t#{@start}\tx"
+        #puts @coords.first.inspect
+        #puts @coords.last.inspect
+        write_compressed_data
+        @coords = []
+      end
+      @chrom = $1
+      last_start = @start
+      @start = $2.to_i
+      diff = @start - last_start # Create dummy files to fill gaps in coordinate scores
+      if diff >= @chunk_size
+        (diff/@chunk_size).times do
+          #puts "=> #{@packs}\t#{@start}\td"
+          #puts @coords.first.inspect
+          #puts @coords.last.inspect
+          write_compressed_data
+          @coords = []
+        end
+      else
+        if @start/@chunk_size != last_start/@chunk_size # The current coordinate belongs to a different pack than the previous one; write the buffered coordinates
+          #puts "=> #{@packs}\t#{@start}\te"
+          #puts @coords.first.inspect
+          #puts @coords.last.inspect
+          write_compressed_data
+          @coords = []
+        end
+      end
+      @step = $3.to_i
+    else
+      if @start % @chunk_size == 0 # We have reached the chunk size, write it to disk
+        #puts "=> #{@packs}\t#{@start}\tl"
+        #puts @coords.first.inspect
+        #puts @coords.last.inspect
+        write_compressed_data
+        @coords = []
+      end
+      @coords << [@start, line.to_f]
+      @start += @step
+    end
+  end
+
+  def get_data
+    return @coords
+  end
+end
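
The parser writes dummy packs across gaps precisely so that pack files stay contiguous and a reader can locate the pack holding a coordinate with integer division by the pack size (index_size). A worked example with the default index size of 1,000,000:

    index_size = 1_000_000
    position = 60_001                          # from the fixedStep example above
    pack = position / index_size * index_size  # => 0, so look in "chr11_0.gz"
    # position 2_345_678 would give pack 2_000_000, i.e. "chr11_2000000.gz"
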
data/lib/anncrsnp/position_selection_manager.rb
ADDED
@@ -0,0 +1,214 @@
+require 'json'
+require 'rroc'
+require 'gchart'
+
+# MyWorkerManager class is used to implement the methods
+# to send and receive the data to or from workers
+class PositionSelectionManager < ScbiMapreduce::WorkManager
+
+  ######################################################################################################
+  ## MANAGER BASIC METHODS
+  ######################################################################################################
+
+  # init_work_manager is executed at the start, prior to any processing.
+  # You can use init_work_manager to initialize global variables, open files, etc...
+  # Note that an instance of MyWorkerManager will be created for each
+  # worker connection, and thus, all global variables here should be
+  # class variables (starting with @@)
+  def self.init_work_manager(options)
+    @@options = options
+    @@positions, @@groups = load_selected_positions(@@options[:selected_positions])
+    @@active_data = File.open(File.join(@@options[:preprocessed_data], 'active_data')).readlines.map {|item| item.chomp}
+    @@used_data = 0
+    @@used_position = 0
+    @@all_data = {}
+  end
+
+  # end_work_manager is executed at the end, when all the process is done.
+  # You can use it to close files opened in init_work_manager
+  def self.end_work_manager
+    positions_ids = []
+    scores = {} # Create genomic features table
+    $LOG.info "Create general scores table"
+    @@all_data.each do |data, positions_info|
+      data_scores = []
+      positions_info.each do |chr, position_info|
+        position_info.each do |position, score|
+          data_scores << ["#{chr}_#{position.to_s}", score]
+        end
+      end
+      data_scores.sort!{|sc1, sc2| sc1.first <=> sc2.first}
+      scores[data] = data_scores.map{|sc| sc.last}
+      positions_ids = data_scores.map{|sc| sc.first} if positions_ids.empty?
+    end
+
+    if !@@groups.empty?
+      tags = positions_ids.map{|id| # Create vector of group tags aligned with the scores table
+        tag = @@groups[id]
+        if tag == 0
+          tag = -1
+        else
+          tag = 1
+        end
+      }
+      if !@@options[:no_auc]
+        $LOG.info "Calculating AUC for each genomic feature"
+        aucs = get_aucs(tags, scores) # Generate area under the curve for each genomic feature
+        File.open(File.join(@@options[:selected_positions_folder], 'AUCs'), 'w'){ |f|
+          aucs.each do |data_type, auc|
+            f.puts "#{data_type}\t#{auc.join("\t")}"
+          end
+        }
+      end
+      $LOG.info "Creating training files for tensorflow"
+      create_positions_sets_for_tensorflow(@@options[:selected_positions_folder], scores, tags)
+    end
+
+    data_types = scores.keys
+    File.open(File.join(@@options[:selected_positions_folder], 'all_data'), 'w'){ |f| # final genomic feature scores table for the gold standard
+      f.puts ['position'].concat(data_types).join("\t")
+      positions_ids.each_with_index do |id, i|
+        record = [id]
+        data_types.each do |dt|
+          record << scores[dt][i]
+        end
+        f.puts record.join("\t")
+      end
+    }
+  end
+
+  # worker_initial_config is used to send initial parameters to workers.
+  # The method is executed once per worker
+  def worker_initial_config
+    return @@options
+  end
+
+  # next_work method is called every time a worker needs a new work
+  # Here you can read data from disk
+  # This method must return the work data or nil if no more data is available
+  def next_work
+    begin
+      if @@used_data >= @@active_data.length
+        e = nil # worker disconnect signal
+      else
+        chr = @@positions.keys[@@used_position]
+        e = [@@active_data[@@used_data], chr, @@positions[chr]]
+        @@used_position += 1
+        if @@used_position >= @@positions.length
+          @@used_data +=1
+          @@used_position = 0
+        end
+      end
+
+    rescue Exception => e
+      puts e.message
+      puts e.backtrace
+
+    end
+    return e
+
+  end
+
+
+  # work_received is executed each time a worker has finished a job.
+  # Here you can write results down to disk, perform some aggregated statistics, etc...
+  def work_received(results)
+    results.each do |data, positions_info|
+      query = @@all_data[data]
+      if query.nil?
+        @@all_data[data] = positions_info
+      else
+        @@all_data[data] = query.merge(positions_info)
+      end
+    end
+  end
+
+  ######################################################################################################
+  ## CUSTOM ADDITIONAL METHODS
+  ######################################################################################################
+
+  def self.load_selected_positions(file_path)
+    selected_positions = {}
+    groups = {}
+    File.open(file_path).each do |line|
+      line.chomp!
+      chr, position, group = line.split("\t")
+      record = position.to_i
+      if !group.nil?
+        group = group.to_i
+        groups["#{chr}_#{position}"] = group
+      end
+      query = selected_positions[chr]
+      if query.nil?
+        selected_positions[chr] = [record]
+      else
+        query << record
+        query.uniq!
+      end
+    end
+    selected_positions.each do |chr, positions|
+      positions.sort!
+    end
+    return selected_positions, groups
+  end
+
+  def self.get_aucs(tags, scores)
+    aucs = {}
+    scores.each do | data_type, scores|
+      matrix = []
+      scores.each_with_index do |score, i|
+        matrix << [score, tags[i]]
+      end
+      pts = ROC.curve_points(matrix)
+      aucs[data_type] = [ROC.auc(matrix), GChart.scatter(:data => [pts.collect { |x| x[0] }, pts.collect { |x| x[1] }]).to_url]
+    end
+    return aucs
+  end
+
+  def self.create_positions_sets_for_tensorflow(path_folder, scores, tags)
+    validation_set_proportion = 0.2
+    positions_number = tags.length
+    validation_set_length = (positions_number * validation_set_proportion).to_i
+    training_set_length = positions_number - validation_set_length
+    validation_set_positions = [] # Set which positions will belong to the validation set
+    while validation_set_positions.length < validation_set_length
+      position = rand(positions_number - 1) # We need random 0-based positions
+      validation_set_positions << position if !validation_set_positions.include?(position)
+    end
+    tags.map!{|t| # tensorflow needs positive integers as tags, so we change the tag used in the AUC operation
+      if t == -1
+        0
+      else
+        t
+      end
+    }
+    genomic_features = scores.keys
+    training_set = []
+    validation_set = []
+    tags.each_with_index do |tag, n|
+      record = [] # Create record position
+      genomic_features.each do |gf|
+        record << scores[gf][n]
+      end
+      record << tag
+      if validation_set_positions.include?(n) # Send record to the corresponding set
+        validation_set << record
+      else
+        training_set << record
+      end
+    end
+    tag_names = tags.uniq # TODO: improve to ensure exact correspondence
+    training_set.unshift([training_set.length, genomic_features.length].concat(tag_names)) # set headers
+    validation_set.unshift([validation_set.length, genomic_features.length].concat(tag_names)) # set headers
+    write_set(training_set, File.join(path_folder, 'training_set.csv'))
+    write_set(validation_set, File.join(path_folder, 'validation_set.csv'))
+  end
+
+  def self.write_set(set, path)
+    File.open(path, 'w'){|f|
+      set.each do |record|
+        f.puts record.join(',')
+      end
+    }
+  end
+end
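
load_selected_positions reads one tab-separated record per line (chromosome, base-1 position, optional 0/1 group); a sketch of a tiny input and the structures it would produce (values invented for illustration):

    # positions.txt (tab-separated):
    #   chr1  12345  1
    #   chr1  67890  0
    #   chr2  111    1
    positions, groups = PositionSelectionManager.load_selected_positions('positions.txt')
    positions # => {"chr1" => [12345, 67890], "chr2" => [111]}
    groups    # => {"chr1_12345" => 1, "chr1_67890" => 0, "chr2_111" => 1}
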
data/lib/anncrsnp/position_selection_worker.rb
ADDED
@@ -0,0 +1,140 @@
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(ROOT_PATH)
+
+require 'benchmark'
+
+# MyWorker defines the behaviour of workers.
+# Here is where the real processing takes place
+class PositionSelectionWorker < ScbiMapreduce::Worker
+
+  ######################################################################################################
+  ## WORKER BASIC METHODS
+  ######################################################################################################
+
+  # starting_worker method is called one time at initialization
+  # and allows you to initialize your variables
+  def starting_worker
+
+    # You can use worker logs at any time in this way:
+    $WORKER_LOG.info "Starting a worker"
+
+  end
+
+
+  # receive_initial_config is called only once just after
+  # the first connection, when initial parameters are
+  # received from manager
+  def receive_initial_config(parameters)
+    @options = parameters
+    # Reads the parameters
+
+    # You can use worker logs at any time in this way:
+    $WORKER_LOG.info "Params received"
+
+    # save received parameters, if any
+    # @params = parameters
+  end
+
+
+  # process_object method is called for each received object.
+  # Be aware that objs is always an array, and you must iterate
+  # over it if you need to process it independently
+  #
+  # The value returned here will be received by the work_received
+  # method at your worker_manager subclass.
+  def process_object(objs)
+    all_data = nil
+    Benchmark.bm do |x|
+      x.report('PosS'){
+
+        packs, datas = get_info_to_search(objs)
+        all_data = {}
+        datas.each do |data|
+          selected_scores = {}
+          packs.each do |chr, ps|
+            scores = []
+            ps.each do |pack, positions|
+              info_path = File.join(@options[:preprocessed_data], data, "#{chr}_#{pack}.gz")
+              #puts info_path
+              if File.exists?(info_path)
+                chr_data = []
+                Zlib::GzipReader.open(info_path) {|gz| chr_data = JSON.parse(gz.read)}
+                scores.concat(get_scores(chr_data, positions))
+              end
+            end
+            selected_scores[chr] = scores
+          end
+          all_data[data] = selected_scores
+        end
+        # return objs back to manager
+
+      }
+    end
+    return all_data
+  end
+
+  # called once, when the worker is about to be closed
+  def closing_worker
+
+  end
+
+  ######################################################################################################
+  ## WORKER CUSTOM METHODS
+  ######################################################################################################
+
+  def get_info_to_search(objs)
+    packs = {}
+    datas = []
+    objs.each do |data, chr, positions| # Analyse which chromosomes and packs must be loaded
+      datas << data if !datas.include?(data)
+      positions.each do |position|
+        pack = position/@options[:index_size]
+        pack = pack * @options[:index_size]
+        #puts "#{position} ==> #{pack}"
+        query_chr = packs[chr]
+        if query_chr.nil?
+          packs[chr] = { pack => [position]}
+        else
+          query_pack = query_chr[pack]
+          if query_pack.nil?
+            query_chr[pack] = [position]
+          else
+            query_pack << position
+          end
+        end
+      end
+    end
+    return packs, datas
+  end
+
+  def get_scores(chr_data, positions)
+    positions_scores = []
+    # Remove positions outside the existing coordinates
+    lower_limit = chr_data.first.first
+    upper_limit = chr_data.last.first
+    positions_scores.concat(positions.select{|pos| pos < lower_limit}.map{|pos| [pos, 0]}) # At the beginning
+    filtered_positions = positions.select{|pos| pos >= lower_limit && pos <= upper_limit }
+    #--------------------------------------------------------------------------------------------------
+    if !filtered_positions.empty?
+      current_position = filtered_positions.shift
+      chr_data.each do |coord, score|
+        if coord == current_position
+          positions_scores << [current_position, score]
+          break if filtered_positions.empty?
+          current_position = filtered_positions.shift
+        elsif coord > current_position # We have encountered a gap and the current position is in it
+          while coord > current_position # drop positions within the gap
+            positions_scores << [current_position, 0]
+            break if filtered_positions.empty?
+            current_position = filtered_positions.shift
+          end
+          break if filtered_positions.empty?
+        end
+      end
+    end
+
+    positions_scores.concat(positions.select{|pos| pos > upper_limit}.map{|pos| [pos, 0]}) # At the end
+
+    return positions_scores
+  end
+end
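
get_scores walks a pack's sorted [position, score] pairs once and emits a score of 0 for any queried position that falls before, after, or in a gap of the data. A worked example with invented values:

    chr_data  = [[100, 0.5], [101, 0.7], [105, 0.9]] # pack contents
    positions = [99, 101, 103, 200]                  # sorted query positions
    # get_scores(chr_data, positions)
    # => [[99, 0], [101, 0.7], [103, 0], [200, 0]]
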
data/lib/anncrsnp/preprocessing_manager.rb
ADDED
@@ -0,0 +1,87 @@
+require 'json'
+
+# MyWorkerManager class is used to implement the methods
+# to send and receive the data to or from workers
+class PreprocessingManager < ScbiMapreduce::WorkManager
+
+  ######################################################################################################
+  ## MANAGER BASIC METHODS
+  ######################################################################################################
+
+  # init_work_manager is executed at the start, prior to any processing.
+  # You can use init_work_manager to initialize global variables, open files, etc...
+  # Note that an instance of MyWorkerManager will be created for each
+  # worker connection, and thus, all global variables here should be
+  # class variables (starting with @@)
+  def self.init_work_manager(options)
+    @@options = options
+    $LOG.info 'Load genomic features links'
+    @@features = load_links(options[:file])
+    $LOG.info "Loaded #{@@features.length} genomic features links"
+
+    # FEATURE DIRECTORIES
+    @@features.each do |feature|
+      ft_folder = File.join(@@options[:preprocessed_data], feature[1]) # feature name
+      ft_temp_folder = File.join(@@options[:temp], feature[1])
+      Dir.mkdir(ft_folder) if !Dir.exist?(ft_folder)
+      Dir.mkdir(ft_temp_folder) if !Dir.exist?(ft_temp_folder)
+    end
+    @@processed_features = 0
+  end
+
+  # end_work_manager is executed at the end, when all the process is done.
+  # You can use it to close files opened in init_work_manager
+  def self.end_work_manager
+    File.open(File.join(@@options[:preprocessed_data], 'active_data'), 'w'){ |f| f.puts @@features.map{|f| f[1]}.uniq.join("\n")}
+  end
+
+  # worker_initial_config is used to send initial parameters to workers.
+  # The method is executed once per worker
+  def worker_initial_config
+    return @@options
+  end
+
+  # next_work method is called every time a worker needs a new work
+  # Here you can read data from disk
+  # This method must return the work data or nil if no more data is available
+  def next_work
+    begin
+      if @@processed_features >= @@features.length
+        e = nil # worker disconnect signal
+      else
+        e = @@features[@@processed_features]
+      end
+
+      @@processed_features += 1
+    rescue Exception => e
+      puts e.message
+      puts e.backtrace
+
+    end
+    return e
+
+  end
+
+
+  # work_received is executed each time a worker has finished a job.
+  # Here you can write results down to disk, perform some aggregated statistics, etc...
+  def work_received(results)
+
+    # write_data_to_disk(results)
+  end
+
+  ######################################################################################################
+  ## CUSTOM ADDITIONAL METHODS
+  ######################################################################################################
+
+  def self.load_links(file_path)
+    features = []
+    File.open(file_path).each do |line|
+      line.chomp!
+      link, feature, cols, header, format = line.split("\t")
+      features << [link, feature, cols.split(',').map{|col| col.to_i}, header.to_i, format]
+    end
+    return features
+  end
+
+end
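
load_links expects one tab-separated record per line: URL, feature name, comma-separated column indices, number of header lines, and format (matching a FileParser subclass). A hypothetical line and its parsed form:

    # links.txt (tab-separated):
    #   http://example.com/phastCons.wigFix.gz  phastcons  0,1  0  wigfix
    features = PreprocessingManager.load_links('links.txt')
    features.first
    # => ["http://example.com/phastCons.wigFix.gz", "phastcons", [0, 1], 0, "wigfix"]
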
data/lib/anncrsnp/preprocessing_worker.rb
ADDED
@@ -0,0 +1,139 @@
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(ROOT_PATH)
+
+require 'yajl'
+require 'open-uri'
+require 'benchmark'
+require 'file_parser'
+
+
+# MyWorker defines the behaviour of workers.
+# Here is where the real processing takes place
+class PreprocessingWorker < ScbiMapreduce::Worker
+
+  ######################################################################################################
+  ## WORKER BASIC METHODS
+  ######################################################################################################
+
+  # starting_worker method is called one time at initialization
+  # and allows you to initialize your variables
+  def starting_worker
+
+    # You can use worker logs at any time in this way:
+    $WORKER_LOG.info "Starting a worker"
+
+  end
+
+
+  # receive_initial_config is called only once just after
+  # the first connection, when initial parameters are
+  # received from manager
+  def receive_initial_config(parameters)
+    @options = parameters
+    # Reads the parameters
+
+    # You can use worker logs at any time in this way:
+    $WORKER_LOG.info "Params received"
+
+    # save received parameters, if any
+    # @params = parameters
+  end
+
+
+  # process_object method is called for each received object.
+  # Be aware that objs is always an array, and you must iterate
+  # over it if you need to process it independently
+  #
+  # The value returned here will be received by the work_received
+  # method at your worker_manager subclass.
+  def process_object(objs)
+    Benchmark.bm do |x|
+      x.report('Prep'){
+
+        FileParser.load
+        objs.each do |link, feature, cols, header, format| # iterate over all objects received
+          $WORKER_LOG.info "Processing link: #{feature}, #{format}, #{link}"
+          ft_folder = File.join(@options[:preprocessed_data], feature)
+          ft_temp_folder = File.join(@options[:temp], feature)
+          temp_file = download_data(link, cols, header, format, ft_temp_folder)
+          if !@options[:downloaded_only]
+            if File.exist?(temp_file)
+              extract_data(format, temp_file, ft_folder)
+            else
+              $WORKER_LOG.info "WARNING: Temporary file #{temp_file} has not been downloaded for feature #{feature}, so it will be skipped"
+            end
+          else
+            $WORKER_LOG.info "Download only mode, skipping processing of temp files"
+          end
+        end
+
+      }
+    end
+    # return objs back to manager
+    return []
+  end
+
+  # called once, when the worker is about to be closed
+  def closing_worker
+
+  end
+
+  ######################################################################################################
+  ## WORKER CUSTOM METHODS
+  ######################################################################################################
+
+  # Download protocols
+  #-----------------------------------------
+  def download_data(link, cols, header, format, temp)
+    protocol, url = link.split('://')
+    temp_file = nil
+    if protocol == 'http'
+      temp_file = File.join(temp, url.split('/').last)
+      if !File.exist?(temp_file)
+        get_http_data(url, temp_file)
+        $WORKER_LOG.info "Downloading #{link}"
+      else
+        $WORKER_LOG.info "Link was downloaded in a previous execution. Skipping download #{link}"
+      end
+    elsif protocol == 'ftp'
+    else
+      $WORKER_LOG.info "WARNING: protocol: #{protocol} in link: #{link} is not supported"
+    end
+    return temp_file
+  end
+
+  def get_http_data(url, temp)
+    File.open(temp, "wb") do |saved_file|
+      open("http://#{url}", "rb") do |read_file|
+        saved_file.write(read_file.read)
+      end
+    end
+  end
+
+  # File decompression methods
+  #-----------------------------------------
+  def extract_data(format, temp, folder)
+    data = {}
+    parser_class = FileParser.select(format)
+    parser = parser_class.new(folder, @options[:index_size])
+    $WORKER_LOG.info "Processing temporary file #{temp}"
+    if temp.include?('.gz')
+      #data = get_gz(temp, parser)
+      get_gz(temp, parser)
+    else
+
+    end
+    parser.write_compressed_data # Write remaining buffered data
+    $WORKER_LOG.info "End processing temporary file #{temp}"
+    return data
+  end
+
+  def get_gz(temp, parser)
+    Zlib::GzipReader.open(temp) {|gz|
+      gz.each do |line|
+        parser.parse(line.chomp)
+      end
+    }
+    #return parser.get_data
+  end
+end
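
Packs are written as gzip-compressed JSON arrays (Yajl on the write side in FileParser#write_compressed_data, plain JSON.parse on the read side in PositionSelectionWorker), so the two sides interoperate. A sketch of the round trip with an invented pack:

    require 'json'
    require 'yajl'
    require 'zlib'

    coords = [[60001, 0.12], [60002, 0.34]] # invented [position, score] pairs
    Zlib::GzipWriter.open('chr11_0.gz') { |gz| Yajl::Encoder.encode(coords, gz) }
    read_back = Zlib::GzipReader.open('chr11_0.gz') { |gz| JSON.parse(gz.read) }
    read_back == coords # => true
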
data/lib/anncrsnp/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anncrsnp
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Elena Rojano
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-09-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -98,6 +98,7 @@ executables:
 - grdbfinder.rb
 - grdbmanager.rb
 - masterfeatures.rb
+- retriever.rb
 - setup
 - statistics.rb
 extensions: []
@@ -115,6 +116,7 @@ files:
 - bin/grdbfinder.rb
 - bin/grdbmanager.rb
 - bin/masterfeatures.rb
+- bin/retriever.rb
 - bin/setup
 - bin/statistics.rb
 - database/.DS_Store
@@ -122,7 +124,13 @@ files:
 - database/deleteme
 - lib/anncrsnp.rb
 - lib/anncrsnp/dataset.rb
+- lib/anncrsnp/file_parser.rb
+- lib/anncrsnp/file_parsers/wigfix_parser.rb
 - lib/anncrsnp/parsers/ucscparser.rb
+- lib/anncrsnp/position_selection_manager.rb
+- lib/anncrsnp/position_selection_worker.rb
+- lib/anncrsnp/preprocessing_manager.rb
+- lib/anncrsnp/preprocessing_worker.rb
 - lib/anncrsnp/version.rb
 homepage: ''
 licenses: