anncrsnp 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a5b68efa127fe9ae6c1b409daa38ee22236068b
4
- data.tar.gz: 4714dfd569a2568ddf055e346747de2bd03a5057
3
+ metadata.gz: ee18ae125e8e7b9738d3dd493ee66d57a374f885
4
+ data.tar.gz: 27e49ea014fbbdfb2aba568847c3214d24ee621e
5
5
  SHA512:
6
- metadata.gz: a73bd75040ddf05079ba4555c9769ae45a41610520ae42827fda4f9e6ba45807b79501b947c1dd3d1dd9addc032107c8ac56ae047edba9ba9040534af906de8d
7
- data.tar.gz: 874fb9ec1a453b33451d7d44710ec114de44bf5ba1c8701d1f17622a4222c96259f75bfb12e51a5b226ff835b6da492c82e67791a3553ed77c8289557d63f770
6
+ metadata.gz: cd254d0ed92720ce4a6bc1909e2bff86a845a3714549460e29f4f290bed7376e6d9b872aea58bc8093724dc0545fa3403213a43d3fe269130477cf6cb8ced556
7
+ data.tar.gz: 1269b73c8b8f6147940428a27ad5d54ec06b6ea8381fe0894ace30854e14b851cf893778cb4310630db56c21f7fb704ecfc1889ffa9624a315b454e4f8911d0d
@@ -347,16 +347,44 @@ def download_database(database_path)
347
347
  out_path = File.dirname(database_path)
348
348
  puts "Downloading database in #{out_path}, please be patient..."
349
349
  zip_path = File.join(out_path, 'database.zip')
350
- File.open(zip_path, "wb") do |saved_file|
351
- open("http://bio-267-data.uma.es/database.zip", "rb") do |read_file|
352
- saved_file.write(read_file.read)
353
- end
354
- end
355
- puts "Decompressing database..."
356
- Zip::File.open(zip_path) do |zip_file|
357
- zip_file.each do |entry|
358
- entry.extract(database_path)
359
- end
350
+ # Code from https://www.ruby-forum.com/topic/4413829
351
+ target = "http://bio-267-data.uma.es/database.zip"
352
+
353
+ bytes_total = nil
354
+
355
+ open(target, "rb",
356
+ :content_length_proc => lambda{|content_length|
357
+ bytes_total = content_length},
358
+ :progress_proc => lambda{|bytes_transferred|
359
+ if bytes_total
360
+ # Print progress
361
+ print("\r#{bytes_transferred}/#{bytes_total}")
362
+ else
363
+ # We don’t know how much we get, so just print number
364
+ # of transferred bytes
365
+ print("\r#{bytes_transferred} (total size unknown)")
366
+ end
367
+ }) do |page|
368
+ # Now the real operation
369
+ File.open(zip_path, "wb") do |file|
370
+ # The file may not fit into RAM entirely, so copy it
371
+ # chunk by chunk.
372
+ while chunk = page.read(1024)
373
+ file.write(chunk)
374
+ end
375
+ end
376
+ end
377
+
378
+ if File.exists?(zip_path)
379
+ puts "\nDecompressing database..."
380
+ Zip::File.open(zip_path) do |zip_file|
381
+ zip_file.each do |entry|
382
+ entry.extract(database_path)
383
+ end
384
+ end
385
+ else
386
+ puts "ERROR: #{zip_path} was not found"
387
+ Process.exit
360
388
  end
361
389
  if File.exists?(database_path)
362
390
  File.delete(zip_path)
@@ -0,0 +1,147 @@
1
#! /usr/bin/env ruby
# Command line launcher for the genomic feature retriever. It can run two
# scbi_mapreduce stages, each one only when its input option is given:
#   * --file-links:         download and preprocess the listed genomic features.
#   * --selected_positions: collect feature scores for the selected positions.
ROOT_PATH = File.dirname(__FILE__)
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'anncrsnp'))


require 'optparse'
require 'logger' # Logger is instantiated below; do not rely on another library loading it
require 'scbi_mapreduce'
require 'preprocessing_manager'
require 'position_selection_manager'

#####################################################################
### OPTPARSE
#####################################################################

options = {}
OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename(__FILE__)} [options]"

  ### PARALELISATION OPTIONS
  #####################################################################
  options[:server_ip] = '0.0.0.0'
  opts.on( '-s', '--server IP', 'Server ip. You can use a partial ip to select the apropriate interface' ) do |server_ip|
    options[:server_ip] = server_ip
  end

  # server port
  options[:port] = 0 # any free port
  opts.on( '-p', '--port PORT', 'Server port. If set to 0, an arbitrary empty port will be used') do |port|
    options[:port] = port.to_i
  end

  # set number of workers. You can also provide an array with worker names.
  # Those workers names can be read from a file produced by the existing
  # queue system, if any.
  options[:workers] = 2
  opts.on( '-w', '--workers COUNT', 'Number of workers, or file containing machine names to launch workers with ssh' ) do |workers|
    if File.exist?(workers)
      # use workers file
      options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
    else
      begin
        options[:workers] = Integer(workers)
      rescue ArgumentError
        # Report the rejected input, not the default still stored in options
        STDERR.puts "ERROR:Invalid workers parameter #{workers}"
        exit(1)
      end
    end
  end

  # chunk size
  options[:chunk_size] = 1
  opts.on( '-g', '--group_size chunk_size', 'Group sequences in chunks of size <chunk_size>' ) do |cs|
    options[:chunk_size] = cs.to_i
  end

  ### EXECUTION OPTIONS
  #####################################################################
  options[:index_size] = 1000000
  opts.on( '-x', '--index_size INTEGER', 'Size of genomic features data packs' ) do |is|
    options[:index_size] = is.to_i
  end

  options[:file] = nil
  opts.on("-f", "--file-links PATH", "Input file with links to retrieve data") do |f|
    options[:file] = f
  end

  options[:output] = 'data'
  opts.on("-o", "--output PATH", "Folder output path") do |f|
    options[:output] = f
  end

  options[:downloaded_only] = false
  opts.on("--download_only", "Only download gemonic features files but not process them") do
    options[:downloaded_only] = true
  end

  options[:no_auc] = false
  opts.on("--no_auc", "No calculate auc by each genomic feature") do
    options[:no_auc] = true
  end

  options[:selected_positions] = nil
  opts.on("--selected_positions PATH", "Tabular file with chromosome (as chrN) and base 1 coordinates. Optionally a third field can be added with 0/1 values for positive/negative groups") do |selected|
    options[:selected_positions] = selected
  end

end.parse!

#####################################################################
### MAIN
#####################################################################

# GENERAL FOLDER
Dir.mkdir(options[:output]) if !Dir.exist?(options[:output])

# Both stages read or write the preprocessed data folder, so its path is always
# defined; otherwise a run with only --selected_positions would hand nil to the
# position selection manager.
options[:preprocessed_data] = File.join(options[:output], 'preprocessed_data')

# MAPREDUCE LAUNCHING
##########################################################
$LOG = Logger.new(STDOUT)
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
# Genomic feature data downloading and preprocessing
#-----------------------------------------------------------------------------
if !options[:file].nil?
  if File.exist?(options[:file])
    temp = File.join(options[:output], 'temp')
    options[:temp] = temp
    Dir.mkdir(temp) if !Dir.exist?(temp)
    preprocessed_data = options[:preprocessed_data]
    Dir.mkdir(preprocessed_data) if !Dir.exist?(preprocessed_data)

    $LOG.info 'Starting PREPROCESSING server'
    custom_worker_file = File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'preprocessing_worker.rb')
    PreprocessingManager.init_work_manager(options)

    mgr = ScbiMapreduce::Manager.new( options[:server_ip], options[:port], options[:workers], PreprocessingManager, custom_worker_file, STDOUT) # launch processor server
    mgr.chunk_size = options[:chunk_size]
    mgr.start_server # start processing
    $LOG.info 'Closing PREPROCESSING server'
  else
    puts "Links file not exists\n#{options[:file]}"
    Process.exit(1) # signal the failure to the calling shell
  end
end

# Genomic feature data position selection
#-----------------------------------------------------------------------------
if !options[:selected_positions].nil?
  if File.exist?(options[:selected_positions])
    selected_positions_folder = File.join(options[:output], 'selected_positions')
    options[:selected_positions_folder] = selected_positions_folder
    Dir.mkdir(selected_positions_folder) if !Dir.exist?(selected_positions_folder)

    $LOG.info 'Starting POSITION_SELECTION server'
    custom_worker_file = File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'position_selection_worker.rb')
    PositionSelectionManager.init_work_manager(options)

    mgr = ScbiMapreduce::Manager.new( options[:server_ip], options[:port], options[:workers], PositionSelectionManager, custom_worker_file, STDOUT) # launch processor server
    mgr.chunk_size = options[:chunk_size]
    mgr.start_server # start processing
    $LOG.info 'Closing POSITION_SELECTION server'
  else
    puts "File with selected positions not exists:\n#{options[:selected_positions]}"
    Process.exit(1) # signal the failure to the calling shell
  end
end
147
+
@@ -0,0 +1,53 @@
1
+ require 'yajl'
2
require 'zlib' # Zlib::GzipWriter is used by write_compressed_data

# Base class and registry of genomic feature file parsers.
#
# Concrete parsers inherit from this class, override +available?+, +format+ and
# +parse+, and live in the 'file_parsers' folder next to this file. After
# FileParser.load, FileParser.select(format) returns the parser class
# registered for a format name.
class FileParser
	# Parser classes indexed by the format name they declare.
	@@parsers = {}

	# Returns every loaded class that inherits from the receiver.
	def self.get_descendants
		return ObjectSpace.each_object(Class).select { |klass| klass < self }
	end

	# Requires the Ruby files found in the 'file_parsers' folder and registers
	# every descendant that reports itself as available.
	def self.load
		path_parsers = File.expand_path('file_parsers', File.dirname(__FILE__))
		# Only Ruby files are required, in a fixed order, so stray entries in the
		# folder (subfolders, editor backups...) cannot break the load.
		Dir.glob(File.join(path_parsers, '*.rb')).sort.each do |parser|
			require parser
		end
		get_descendants.each do |descendant|
			@@parsers[descendant.format] = descendant if descendant.available?
		end
	end

	# Returns the parser class registered for +format+, or nil when there is none.
	def self.select(format)
		return @@parsers[format]
	end

	########################################################################################
	## PARSER DEPENDANT METHODS
	########################################################################################

	# Whether this parser can be registered. The base class is never available.
	def self.available?
		return false
	end

	# Name of the file format handled by the parser.
	def self.format
		return 'master'
	end

	# folder:     directory where the compressed packs are written.
	# chunk_size: number of coordinates covered by each pack.
	def initialize(folder, chunk_size)
		@folder = folder
		@chunk_size = chunk_size
		@chrom = nil   # chromosome of the buffered coordinates
		@coords = []   # buffered records waiting to be written
		@packs = 0     # number of packs already written
	end

	# Consumes one line of the input file. Concrete parsers must override it.
	def parse(line)

	end

	# Writes the buffered records as JSON into '<chrom>_<pack start>.gz' and
	# advances the pack counter. The buffer itself is not cleared here.
	def write_compressed_data
		p = @packs * @chunk_size
		gz_path = File.join(@folder, "#{@chrom}_#{p}.gz")
		Zlib::GzipWriter.open(gz_path) do |writer|
			Yajl::Encoder.encode(@coords, writer)
		end
		@packs += 1
	end
end
@@ -0,0 +1,65 @@
1
# Parser for fixedStep wiggle (wigFix) files.
#
# A header line such as
#   fixedStep chrom=chr11 start=60001 step=1
# sets the chromosome, the coordinate of the next score and the distance
# between consecutive scores; every other line is read as one score.
# Records are buffered as [coordinate, score] pairs and written in packs, where
# a coordinate belongs to the pack numbered coordinate / chunk_size.
class WigfixParser < FileParser
	def initialize(folder, chunk_size)
		super
		@start = 1 # coordinate assigned to the next score line
		@step = 1  # distance between consecutive score lines
	end

	def self.available?
		return true
	end

	def self.format
		return 'wigfix'
	end

	# Consumes one line of the wigFix file, either a header or a score.
	# Raises ArgumentError when a fixedStep header cannot be understood.
	def parse(line)
		if line.include?('fixedStep')
			parse_header(line)
		else
			# Close every pack that lies before the one holding this coordinate.
			# Packs skipped by a gap are written empty (dummy files).
			flush_packs_before(@start / @chunk_size)
			@coords << [@start, line.to_f]
			@start += @step
		end
	end

	# Returns the records buffered for the pack that is still open.
	def get_data
		return @coords
	end

	private

	# Reads chromosome, start and step from a fixedStep header line.
	def parse_header(line)
		header = line.match(/fixedStep chrom=(\S+) start=(\d+) step=(\d+)/)
		raise ArgumentError, "Malformed fixedStep header: #{line}" if header.nil?
		chrom = header[1]
		if !@chrom.nil? && @chrom != chrom # We change of chromosome, we write the buffered coordinates
			write_compressed_data # still named after the previous chromosome
			@coords = []
			@packs = 0 # pack numbering restarts on every chromosome
		end
		@chrom = chrom
		@start = header[2].to_i
		@step = header[3].to_i
	end

	# Writes the open pack, and then empty packs, until pack number +pack_index+
	# is the open one, so file names always match the coordinates they contain.
	def flush_packs_before(pack_index)
		while @packs < pack_index
			write_compressed_data
			@coords = []
		end
	end
end
@@ -0,0 +1,214 @@
1
+ require 'json'
2
+ require 'rroc'
3
+ require 'gchart'
4
+
5
# Work manager of the position selection stage. It hands out
# [data name, chromosome, positions] work units, gathers the scores returned by
# the workers and finally writes the score table, the optional AUC report and
# the training/validation sets.
class PositionSelectionManager < ScbiMapreduce::WorkManager

	######################################################################################################
	## MANAGER BASIC METHODS
	######################################################################################################

	# init_work_manager is executed at the start, prior to any processing.
	# Note that an instance of the manager will be created for each
	# worker connection, and thus, all shared state is kept in
	# class variables (starting with @@)
	#
	# options must provide :selected_positions (path of the positions file) and
	# :preprocessed_data (folder holding the 'active_data' list).
	def self.init_work_manager(options)
		@@options = options
		@@positions, @@groups = load_selected_positions(@@options[:selected_positions])
		active_data_path = File.join(@@options[:preprocessed_data], 'active_data')
		# File.readlines closes the file, unlike File.open(...).readlines
		@@active_data = File.readlines(active_data_path).map {|item| item.chomp}
		@@used_data = 0      # index of the data set being distributed
		@@used_position = 0  # index of the chromosome being distributed
		@@all_data = {}      # data name => { chromosome => [[position, score], ...] }
	end

	# end_work_manager is executed at the end, when all the process is done.
	# It writes 'all_data' and, when the positions file defined groups, the
	# 'AUCs' report (unless :no_auc is set) and the tensorflow sets.
	def self.end_work_manager
		positions_ids = []
		scores = {} # Create genomic features table
		$LOG.info "Create general scores table"
		@@all_data.each do |data, positions_info|
			data_scores = []
			positions_info.each do |chr, position_info|
				position_info.each do |position, score|
					data_scores << ["#{chr}_#{position}", score]
				end
			end
			data_scores.sort!{|sc1, sc2| sc1.first <=> sc2.first}
			scores[data] = data_scores.map{|sc| sc.last}
			# Row ids are taken from the first data set only
			# NOTE(review): this assumes every data set yields the same ids - confirm
			positions_ids = data_scores.map{|sc| sc.first} if positions_ids.empty?
		end

		if !@@groups.empty?
			# Create vector tag group related to scores table: group 0 is tagged
			# as -1 and anything else (also positions without group) as 1
			tags = positions_ids.map{|id| @@groups[id] == 0 ? -1 : 1}
			if !@@options[:no_auc]
				$LOG.info "Calculating AUC for each genomic feature"
				aucs = get_aucs(tags, scores) # Generate area under curve by each genomic feature
				File.open(File.join(@@options[:selected_positions_folder], 'AUCs'), 'w'){ |f|
					aucs.each do |data_type, auc|
						f.puts "#{data_type}\t#{auc.join("\t")}"
					end
				}
			end
			$LOG.info "Creating training files for tensorflow"
			create_positions_sets_for_tensorflow(@@options[:selected_positions_folder], scores, tags)
		end

		data_types = scores.keys
		File.open(File.join(@@options[:selected_positions_folder], 'all_data'), 'w'){ |f| #final genomic feature scores table for goldstandard
			f.puts ['position'].concat(data_types).join("\t")
			positions_ids.each_with_index do |id, i|
				record = [id]
				data_types.each do |dt|
					record << scores[dt][i]
				end
				f.puts record.join("\t")
			end
		}
	end

	# worker_initial_config is used to send initial parameters to workers.
	# The method is executed once per each worker
	def worker_initial_config
		return @@options
	end

	# next_work method is called every time a worker needs a new work.
	# Returns [data name, chromosome, positions], or nil when every data set
	# has been distributed for every chromosome (or there are no positions).
	def next_work
		return nil if @@positions.empty? || @@used_data >= @@active_data.length # worker signal disconect
		chr = @@positions.keys[@@used_position]
		work = [@@active_data[@@used_data], chr, @@positions[chr]]
		@@used_position += 1
		if @@used_position >= @@positions.length # all chromosomes served, go to next data set
			@@used_data += 1
			@@used_position = 0
		end
		return work
	rescue StandardError => error
		# Report the problem and hand out no work; the exception object itself
		# must never be returned as if it were a work unit.
		puts error.message
		puts error.backtrace
		return nil
	end


	# work_received is executed each time a worker has finished a job.
	# results maps each data name to { chromosome => scores }; it is merged
	# into the accumulated table.
	def work_received(results)
		results.each do |data, positions_info|
			query = @@all_data[data]
			if query.nil?
				@@all_data[data] = positions_info
			else
				@@all_data[data] = query.merge(positions_info)
			end
		end
	end

	######################################################################################################
	## CUSTOM ADDITIONAL METHODS
	######################################################################################################

	# Reads a tabular file with chromosome, position and an optional group.
	# Returns two hashes: chromosome => sorted unique positions, and
	# "chromosome_position" => group for the lines that define a group.
	def self.load_selected_positions(file_path)
		selected_positions = {}
		groups = {}
		File.foreach(file_path) do |line| # the block form closes the file
			line.chomp!
			next if line.strip.empty? # blank lines would create a bogus nil chromosome
			chr, position, group = line.split("\t")
			record = position.to_i
			# The key is built from the integer position so it matches the ids
			# built from integer positions when the score table is assembled
			groups["#{chr}_#{record}"] = group.to_i if !group.nil?
			(selected_positions[chr] ||= []) << record
		end
		selected_positions.each_value do |positions|
			positions.uniq!
			positions.sort!
		end
		return selected_positions, groups
	end

	# Returns data type => [AUC, URL of the ROC scatter chart] using the given
	# tags as the class of each score.
	def self.get_aucs(tags, scores)
		aucs = {}
		scores.each do |data_type, data_scores|
			matrix = []
			data_scores.each_with_index do |score, i|
				matrix << [score, tags[i]]
			end
			pts = ROC.curve_points(matrix)
			aucs[data_type] = [ROC.auc(matrix), GChart.scatter(:data => [pts.map { |x| x[0] }, pts.map { |x| x[1] }]).to_url]
		end
		return aucs
	end

	# Splits the positions at random into a validation set (20%) and a training
	# set and writes both as CSV files in path_folder. Each record holds the
	# score of every genomic feature followed by the tag.
	def self.create_positions_sets_for_tensorflow(path_folder, scores, tags)
		validation_set_proportion = 0.2
		positions_number = tags.length
		validation_set_length = (positions_number * validation_set_proportion).to_i
		# Draw distinct 0 based indexes over the whole range, last one included
		validation_set_positions = {}
		(0...positions_number).to_a.sample(validation_set_length).each do |position|
			validation_set_positions[position] = true
		end
		# tensorflow needs non negative integers as tags, so the -1 used in the
		# AUC operation becomes 0. The received array is left untouched.
		labels = tags.map{|t| t == -1 ? 0 : t}
		genomic_features = scores.keys
		training_set = []
		validation_set = []
		labels.each_with_index do |tag, n|
			record = genomic_features.map{|gf| scores[gf][n]} # Create record position
			record << tag
			if validation_set_positions.key?(n) # Send record to correspondant set
				validation_set << record
			else
				training_set << record
			end
		end
		tag_names = labels.uniq #TODO: improve to ensure exact correspondance
		training_set.unshift([training_set.length, genomic_features.length].concat(tag_names)) # set headers
		validation_set.unshift([validation_set.length, genomic_features.length].concat(tag_names)) # set headers
		write_set(training_set, File.join(path_folder, 'training_set.csv'))
		write_set(validation_set, File.join(path_folder, 'validation_set.csv'))
	end

	# Writes every record of the set as one comma separated line.
	def self.write_set(set, path)
		File.open(path, 'w'){|f|
			set.each do |record|
				f.puts record.join(',')
			end
		}
	end
end
@@ -0,0 +1,140 @@
1
+ ROOT_PATH=File.dirname(__FILE__)
2
+ $: << File.expand_path(ROOT_PATH)
3
+
4
+ require 'benchmark'
5
+
6
require 'zlib' # Zlib::GzipReader is used to read the preprocessed packs
require 'json' # JSON.parse is used to decode them

# Worker of the position selection stage. For every work unit it opens the
# preprocessed packs that cover the requested positions and returns the score
# found for each position (0 when the position has no score).
class PositionSelectionWorker < ScbiMapreduce::Worker

	######################################################################################################
	## WORKER BASIC METHODS
	######################################################################################################

	# starting_worker method is called one time at initialization
	def starting_worker
		$WORKER_LOG.info "Starting a worker"
	end


	# receive_initial_config is called only once just after
	# the first connection, when initial parameters are
	# received from manager. The options hash is kept for later use.
	def receive_initial_config(parameters)
		@options = parameters
		$WORKER_LOG.info "Params received"
	end


	# process_object method is called for each received object.
	# objs is an array of [data name, chromosome, positions] work units.
	#
	# Returns { data name => { chromosome => [[position, score], ...] } },
	# which is received by the work_received method of the manager.
	def process_object(objs)
		all_data = nil
		Benchmark.bm do |x|
			x.report('PosS'){
				packs, datas = get_info_to_search(objs)
				all_data = {}
				datas.each do |data|
					selected_scores = {}
					packs.each do |chr, ps|
						scores = []
						ps.each do |pack, positions|
							info_path = File.join(@options[:preprocessed_data], data, "#{chr}_#{pack}.gz")
							next if !File.exist?(info_path) # no pack, no scores for these positions
							chr_data = []
							Zlib::GzipReader.open(info_path) {|gz| chr_data = JSON.parse(gz.read)}
							scores.concat(get_scores(chr_data, positions))
						end
						selected_scores[chr] = scores
					end
					all_data[data] = selected_scores
				end
			}
		end
		return all_data
	end

	# called once, when the worker is about to be closed
	def closing_worker

	end

	######################################################################################################
	## WORKER CUSTOM METHODS
	######################################################################################################

	# Analyses which chromosomes and packs must be loaded.
	# Returns { chromosome => { pack start => positions } } and the list of
	# data names present in objs.
	def get_info_to_search(objs)
		packs = {}
		datas = []
		objs.each do |data, chr, positions|
			datas << data if !datas.include?(data)
			positions.each do |position|
				# Start coordinate of the pack that holds the position
				pack = (position / @options[:index_size]) * @options[:index_size]
				chr_packs = (packs[chr] ||= {})
				(chr_packs[pack] ||= []) << position
			end
		end
		# A chromosome can arrive once per data set within the same chunk;
		# each position must be searched only once.
		packs.each_value do |chr_packs|
			chr_packs.each_value {|pack_positions| pack_positions.uniq!}
		end
		return packs, datas
	end

	# chr_data:  [coordinate, score] pairs of one pack (it may be empty).
	# positions: coordinates to look for.
	#
	# Returns one [position, score] pair per requested position, in the given
	# order, with score 0 for the positions absent from chr_data.
	def get_scores(chr_data, positions)
		wanted = {}
		positions.each {|pos| wanted[pos] = true}
		found = {}
		# Single pass over the pack keeping the first score seen for each
		# requested coordinate.
		chr_data.each do |coord, score|
			found[coord] = score if wanted.key?(coord) && !found.key?(coord)
		end
		return positions.map {|pos| [pos, found.fetch(pos, 0)]}
	end
end
@@ -0,0 +1,87 @@
1
+ require 'json'
2
+
3
# Work manager of the preprocessing stage. It reads the genomic feature links
# file, creates the folders of every feature and hands out one feature per
# work request.
class PreprocessingManager < ScbiMapreduce::WorkManager

	######################################################################################################
	## MANAGER BASIC METHODS
	######################################################################################################

	# init_work_manager is executed at the start, prior to any processing.
	# Note that an instance of the manager will be created for each
	# worker connection, and thus, all shared state is kept in
	# class variables (starting with @@)
	#
	# options must provide :file (links file), :preprocessed_data and :temp
	# (existing folders where one subfolder per feature is created).
	def self.init_work_manager(options)
		@@options = options
		$LOG.info 'Load genomic features links'
		@@features = load_links(options[:file])
		$LOG.info "Loaded #{@@features.length} genomic features links"

		# FEATURE DIRECTORIES
		@@features.each do |feature|
			feature_name = feature[1]
			ft_folder = File.join(@@options[:preprocessed_data], feature_name)
			ft_temp_folder = File.join(@@options[:temp], feature_name)
			Dir.mkdir(ft_folder) if !Dir.exist?(ft_folder)
			Dir.mkdir(ft_temp_folder) if !Dir.exist?(ft_temp_folder)
		end
		@@processed_features = 0 # index of the next feature to hand out
	end

	# end_work_manager is executed at the end, when all the process is done.
	# It writes the names of the loaded features, without repetitions, to the
	# 'active_data' file of the preprocessed data folder.
	def self.end_work_manager
		feature_names = @@features.map{|feature| feature[1]}.uniq
		File.open(File.join(@@options[:preprocessed_data], 'active_data'), 'w'){ |out| out.puts feature_names.join("\n")}
	end

	# worker_initial_config is used to send initial parameters to workers.
	# The method is executed once per each worker
	def worker_initial_config
		return @@options
	end

	# next_work method is called every time a worker needs a new work.
	# Returns the next feature record or nil if no more data is available.
	def next_work
		return nil if @@processed_features >= @@features.length # worker signal disconect
		feature = @@features[@@processed_features]
		@@processed_features += 1
		return feature
	rescue StandardError => error
		# Report the problem and hand out no work; the exception object itself
		# must never be returned as if it were a work unit.
		puts error.message
		puts error.backtrace
		return nil
	end


	# work_received is executed each time a worker has finished a job.
	# Workers write their own output, so nothing is collected here.
	def work_received(results)

		# write_data_to_disk(results)
	end

	######################################################################################################
	## CUSTOM ADDITIONAL METHODS
	######################################################################################################

	# Reads the tabular links file, one feature per line with the fields
	# link, feature name, comma separated columns, header and format.
	# Returns an array of [link, feature, [column, ...], header, format] with
	# columns and header converted to integers. Blank lines are ignored and
	# lines without the five fields are reported and skipped.
	def self.load_links(file_path)
		features = []
		File.foreach(file_path) do |line| # the block form closes the file
			line.chomp!
			next if line.strip.empty?
			link, feature, cols, header, format = line.split("\t")
			if format.nil?
				$LOG.warn "Skipping malformed line in links file: #{line}"
				next
			end
			features << [link, feature, cols.split(',').map{|col| col.to_i}, header.to_i, format]
		end
		return features
	end

end
@@ -0,0 +1,139 @@
1
+ ROOT_PATH=File.dirname(__FILE__)
2
+ $: << File.expand_path(ROOT_PATH)
3
+
4
+ require 'yajl'
5
+ require 'open-uri'
6
+ require 'benchmark'
7
+ require 'file_parser'
8
+
9
+
10
require 'zlib' # Zlib::GzipReader is used by get_gz

# Worker of the preprocessing stage. For every received feature it downloads
# the linked file into the temporal folder of the feature and, unless the
# download only mode is active, parses it into compressed packs.
class PreprocessingWorker < ScbiMapreduce::Worker

	######################################################################################################
	## WORKER BASIC METHODS
	######################################################################################################

	# starting_worker method is called one time at initialization
	def starting_worker
		$WORKER_LOG.info "Starting a worker"
	end


	# receive_initial_config is called only once just after
	# the first connection, when initial parameters are
	# received from manager. The options hash is kept for later use.
	def receive_initial_config(parameters)
		@options = parameters
		$WORKER_LOG.info "Params received"
	end


	# process_object method is called for each received object.
	# objs is an array of [link, feature, cols, header, format] records.
	#
	# Always returns an empty array: the output is written to disk and nothing
	# has to be sent back to the manager.
	def process_object(objs)
		Benchmark.bm do |x|
			x.report('Prep'){
				FileParser.load
				objs.each do |link, feature, cols, header, format| # iterate over all objects received
					$WORKER_LOG.info "Processing link: #{feature}, #{format}, #{link}"
					ft_folder = File.join(@options[:preprocessed_data], feature)
					ft_temp_folder = File.join(@options[:temp], feature)
					temp_file = download_data(link, cols, header, format, ft_temp_folder)
					if @options[:downloaded_only]
						$WORKER_LOG.info "Download only mode, skipping processing temp files"
					elsif !temp_file.nil? && File.exist?(temp_file) # temp_file is nil for unsupported protocols
						extract_data(format, temp_file, ft_folder)
					else
						$WORKER_LOG.info "WARNING: Temporal file #{temp_file} have not been downloaded for feature #{feature} so it will be skipped"
					end
				end
			}
		end
		return []
	end

	# called once, when the worker is about to be closed
	def closing_worker

	end

	######################################################################################################
	## WORKER CUSTOM METHODS
	######################################################################################################

	# Download protocols
	#-----------------------------------------

	# Downloads link into the folder temp unless a previous execution already
	# did it. Only http links are downloaded.
	#
	# Returns the path of the local file, or nil when the protocol is not
	# handled. A failed download is logged and leaves no file at that path.
	def download_data(link, cols, header, format, temp)
		protocol, url = link.split('://', 2)
		temp_file = nil
		if protocol == 'http'
			temp_file = File.join(temp, url.split('/').last)
			if File.exist?(temp_file)
				$WORKER_LOG.info "Link was downloaded in a previous execution. Skipping download #{link}"
			else
				$WORKER_LOG.info "Downloading #{link}"
				begin
					get_http_data(url, temp_file)
				rescue StandardError => error
					$WORKER_LOG.info "WARNING: download of #{link} failed: #{error.message}"
				end
			end
		elsif protocol == 'ftp'
			$WORKER_LOG.info "WARNING: ftp downloads are not implemented, skipping link: #{link}"
		else
			$WORKER_LOG.info "WARNING: protocol: #{protocol} in link: #{link} is not supported"
		end
		return temp_file
	end

	# Saves the content of "http://<url>" into the file temp.
	# The data is streamed to '<temp>.part' and renamed once complete, so an
	# interrupted download is never mistaken for a finished one.
	def get_http_data(url, temp)
		partial = "#{temp}.part"
		# URI#open comes from open-uri; Kernel#open no longer accepts URLs on Ruby 3
		URI.parse("http://#{url}").open("rb") do |read_file|
			File.open(partial, "wb") do |saved_file|
				IO.copy_stream(read_file, saved_file) # copy without loading the whole file in memory
			end
		end
		File.rename(partial, temp)
	ensure
		File.delete(partial) if File.exist?(partial) # leftovers of a failed download
	end

	# File decompression methods
	#-----------------------------------------

	# Parses the downloaded file temp with the parser registered for format and
	# writes the resulting packs into folder. Files whose name contains '.gz'
	# are read as gzip and any other file as plain text.
	# Returns an empty hash (kept for compatibility).
	def extract_data(format, temp, folder)
		data = {}
		parser_class = FileParser.select(format)
		if parser_class.nil?
			$WORKER_LOG.info "WARNING: format: #{format} is not supported, skipping file #{temp}"
			return data
		end
		parser = parser_class.new(folder, @options[:index_size])
		$WORKER_LOG.info "Processing temporal file #{temp}"
		if temp.include?('.gz')
			get_gz(temp, parser)
		else
			File.foreach(temp) do |line|
				parser.parse(line.chomp)
			end
		end
		parser.write_compressed_data # Write remaining buffered data
		$WORKER_LOG.info "End processing temporal file #{temp}"
		return data
	end

	# Feeds the parser with every line of the gzip compressed file temp.
	def get_gz(temp, parser)
		Zlib::GzipReader.open(temp) {|gz|
			gz.each do |line|
				parser.parse(line.chomp)
			end
		}
	end
end
@@ -1,3 +1,3 @@
1
1
  module Anncrsnp
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.7"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anncrsnp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elena Rojano
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-07-25 00:00:00.000000000 Z
12
+ date: 2016-09-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -98,6 +98,7 @@ executables:
98
98
  - grdbfinder.rb
99
99
  - grdbmanager.rb
100
100
  - masterfeatures.rb
101
+ - retriever.rb
101
102
  - setup
102
103
  - statistics.rb
103
104
  extensions: []
@@ -115,6 +116,7 @@ files:
115
116
  - bin/grdbfinder.rb
116
117
  - bin/grdbmanager.rb
117
118
  - bin/masterfeatures.rb
119
+ - bin/retriever.rb
118
120
  - bin/setup
119
121
  - bin/statistics.rb
120
122
  - database/.DS_Store
@@ -122,7 +124,13 @@ files:
122
124
  - database/deleteme
123
125
  - lib/anncrsnp.rb
124
126
  - lib/anncrsnp/dataset.rb
127
+ - lib/anncrsnp/file_parser.rb
128
+ - lib/anncrsnp/file_parsers/wigfix_parser.rb
125
129
  - lib/anncrsnp/parsers/ucscparser.rb
130
+ - lib/anncrsnp/position_selection_manager.rb
131
+ - lib/anncrsnp/position_selection_worker.rb
132
+ - lib/anncrsnp/preprocessing_manager.rb
133
+ - lib/anncrsnp/preprocessing_worker.rb
126
134
  - lib/anncrsnp/version.rb
127
135
  homepage: ''
128
136
  licenses: