anncrsnp 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 9a5b68efa127fe9ae6c1b409daa38ee22236068b
- data.tar.gz: 4714dfd569a2568ddf055e346747de2bd03a5057
+ metadata.gz: ee18ae125e8e7b9738d3dd493ee66d57a374f885
+ data.tar.gz: 27e49ea014fbbdfb2aba568847c3214d24ee621e
  SHA512:
- metadata.gz: a73bd75040ddf05079ba4555c9769ae45a41610520ae42827fda4f9e6ba45807b79501b947c1dd3d1dd9addc032107c8ac56ae047edba9ba9040534af906de8d
- data.tar.gz: 874fb9ec1a453b33451d7d44710ec114de44bf5ba1c8701d1f17622a4222c96259f75bfb12e51a5b226ff835b6da492c82e67791a3553ed77c8289557d63f770
+ metadata.gz: cd254d0ed92720ce4a6bc1909e2bff86a845a3714549460e29f4f290bed7376e6d9b872aea58bc8093724dc0545fa3403213a43d3fe269130477cf6cb8ced556
+ data.tar.gz: 1269b73c8b8f6147940428a27ad5d54ec06b6ea8381fe0894ace30854e14b851cf893778cb4310630db56c21f7fb704ecfc1889ffa9624a315b454e4f8911d0d
@@ -347,16 +347,44 @@ def download_database(database_path)
  out_path = File.dirname(database_path)
  puts "Downloading database in #{out_path}, please be patient..."
  zip_path = File.join(out_path, 'database.zip')
- File.open(zip_path, "wb") do |saved_file|
-   open("http://bio-267-data.uma.es/database.zip", "rb") do |read_file|
-     saved_file.write(read_file.read)
-   end
- end
- puts "Decompressing database..."
- Zip::File.open(zip_path) do |zip_file|
-   zip_file.each do |entry|
-     entry.extract(database_path)
-   end
+ # Code from https://www.ruby-forum.com/topic/4413829
+ target = "http://bio-267-data.uma.es/database.zip"
+
+ bytes_total = nil
+
+ open(target, "rb",
+   :content_length_proc => lambda{|content_length|
+     bytes_total = content_length},
+   :progress_proc => lambda{|bytes_transferred|
+     if bytes_total
+       # Print progress
+       print("\r#{bytes_transferred}/#{bytes_total}")
+     else
+       # We don't know how much we will get, so just print the
+       # number of transferred bytes
+       print("\r#{bytes_transferred} (total size unknown)")
+     end
+   }) do |page|
+   # Now the real operation
+   File.open(zip_path, "wb") do |file|
+     # The file may not fit into RAM entirely, so copy it
+     # chunk by chunk.
+     while chunk = page.read(1024)
+       file.write(chunk)
+     end
+   end
+ end
+
+ if File.exist?(zip_path)
+   puts "\nDecompressing database..."
+   Zip::File.open(zip_path) do |zip_file|
+     zip_file.each do |entry|
+       entry.extract(database_path)
+     end
+   end
+ else
+   puts "ERROR: #{zip_path} was not found"
+   Process.exit
  end
  if File.exists?(database_path)
    File.delete(zip_path)
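
The rewritten download_database above streams the archive to disk in 1 KB chunks instead of reading the whole response into memory, and reports progress through open-uri's :content_length_proc and :progress_proc hooks. A minimal standalone sketch of the same pattern (the URL and output name are illustrative; on Ruby 2.5+ use URI.open instead of the bare Kernel#open shown in the diff):

    require 'open-uri'

    url = 'http://example.org/archive.zip' # hypothetical URL
    bytes_total = nil
    URI.open(url, 'rb',
             :content_length_proc => lambda{|length| bytes_total = length},
             :progress_proc => lambda{|done| print("\r#{done}/#{bytes_total || '?'}")}) do |remote|
      File.open('archive.zip', 'wb') do |file|
        # copy in fixed-size chunks so the file never has to fit in RAM
        while chunk = remote.read(1024)
          file.write(chunk)
        end
      end
    end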
@@ -0,0 +1,147 @@
+ #! /usr/bin/env ruby
+ ROOT_PATH = File.dirname(__FILE__)
+ $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'anncrsnp'))
+
+
+ require 'optparse'
+ require 'scbi_mapreduce'
+ require 'preprocessing_manager'
+ require 'position_selection_manager'
+
+ #####################################################################
+ ### OPTPARSE
+ #####################################################################
+
+ options = {}
+ OptionParser.new do |opts|
+   opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
+
+   ### PARALLELISATION OPTIONS
+   #####################################################################
+   options[:server_ip] = '0.0.0.0'
+   opts.on( '-s', '--server IP', 'Server IP. You can use a partial IP to select the appropriate interface' ) do |server_ip|
+     options[:server_ip] = server_ip
+   end
+
+   # server port
+   options[:port] = 0 # any free port
+   opts.on( '-p', '--port PORT', 'Server port. If set to 0, an arbitrary free port will be used') do |port|
+     options[:port] = port.to_i
+   end
+
+   # set number of workers. You can also provide an array with worker names.
+   # Those worker names can be read from a file produced by the existing
+   # queue system, if any.
+   options[:workers] = 2
+   opts.on( '-w', '--workers COUNT', 'Number of workers, or file containing machine names to launch workers with ssh' ) do |workers|
+     if File.exist?(workers)
+       # use workers file
+       options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
+     else
+       begin
+         options[:workers] = Integer(workers)
+       rescue
+         STDERR.puts "ERROR: Invalid workers parameter #{workers}"
+         exit
+       end
+     end
+   end
+
+   # chunk size
+   options[:chunk_size] = 1
+   opts.on( '-g', '--group_size chunk_size', 'Group sequences in chunks of size <chunk_size>' ) do |cs|
+     options[:chunk_size] = cs.to_i
+   end
+
+   ### EXECUTION OPTIONS
+   #####################################################################
+   options[:index_size] = 1000000
+   opts.on( '-x', '--index_size INTEGER', 'Size of genomic feature data packs' ) do |is|
+     options[:index_size] = is.to_i
+   end
+
+   options[:file] = nil
+   opts.on("-f", "--file-links PATH", "Input file with links to retrieve data") do |f|
+     options[:file] = f
+   end
+
+   options[:output] = 'data'
+   opts.on("-o", "--output PATH", "Output folder path") do |f|
+     options[:output] = f
+   end
+
+   options[:downloaded_only] = false
+   opts.on("--download_only", "Only download genomic feature files, do not process them") do
+     options[:downloaded_only] = true
+   end
+
+   options[:no_auc] = false
+   opts.on("--no_auc", "Do not calculate the AUC for each genomic feature") do
+     options[:no_auc] = true
+   end
+
+   options[:selected_positions] = nil
+   opts.on("--selected_positions PATH", "Tabular file with chromosome (as chrN) and 1-based coordinates. Optionally, a third field can be added with 0/1 values for negative/positive groups") do |selected|
+     options[:selected_positions] = selected
+   end
+
+ end.parse!
+
+ #####################################################################
+ ### MAIN
+ #####################################################################
+
+ # GENERAL FOLDER
+ Dir.mkdir(options[:output]) if !Dir.exist?(options[:output])
+
+ # MAPREDUCE LAUNCHING
+ ##########################################################
+ $LOG = Logger.new(STDOUT)
+ $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
+ # Genomic feature data downloading and preprocessing
+ #-----------------------------------------------------------------------------
+ if !options[:file].nil?
+   if File.exist?(options[:file])
+     temp = File.join(options[:output], 'temp')
+     options[:temp] = temp
+     Dir.mkdir(temp) if !Dir.exist?(temp)
+     preprocessed_data = File.join(options[:output], 'preprocessed_data')
+     options[:preprocessed_data] = preprocessed_data
+     Dir.mkdir(preprocessed_data) if !Dir.exist?(preprocessed_data)
+
+     $LOG.info 'Starting PREPROCESSING server'
+     custom_worker_file = File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'preprocessing_worker.rb')
+     PreprocessingManager.init_work_manager(options)
+
+     mgr = ScbiMapreduce::Manager.new( options[:server_ip], options[:port], options[:workers], PreprocessingManager, custom_worker_file, STDOUT) # launch processor server
+     mgr.chunk_size = options[:chunk_size]
+     mgr.start_server # start processing
+     $LOG.info 'Closing PREPROCESSING server'
+   else
+     puts "Links file does not exist:\n#{options[:file]}"
+     Process.exit()
+   end
+ end
+
+ # Genomic feature data position selection
+ #-----------------------------------------------------------------------------
+ if !options[:selected_positions].nil?
+   if File.exist?(options[:selected_positions])
+     selected_positions_folder = File.join(options[:output], 'selected_positions')
+     options[:selected_positions_folder] = selected_positions_folder
+     Dir.mkdir(selected_positions_folder) if !Dir.exist?(selected_positions_folder)
+
+     $LOG.info 'Starting POSITION_SELECTION server'
+     custom_worker_file = File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'position_selection_worker.rb')
+     PositionSelectionManager.init_work_manager(options)
+
+     mgr = ScbiMapreduce::Manager.new( options[:server_ip], options[:port], options[:workers], PositionSelectionManager, custom_worker_file, STDOUT) # launch processor server
+     mgr.chunk_size = options[:chunk_size]
+     mgr.start_server # start processing
+     $LOG.info 'Closing POSITION_SELECTION server'
+   else
+     puts "File with selected positions does not exist:\n#{options[:selected_positions]}"
+     Process.exit()
+   end
+ end
+
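
The hunk above adds bin/retriever.rb, which drives two optional scbi_mapreduce phases: downloading and preprocessing genomic feature files (-f) and scoring selected positions against the preprocessed packs (--selected_positions). A hypothetical invocation covering both phases (the file names are illustrative):

    ruby bin/retriever.rb -f links.txt -o data -w 4 --selected_positions positions.txt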
@@ -0,0 +1,53 @@
+ require 'yajl'
+ class FileParser
+   @@parsers = {}
+   def self.get_descendants
+     return ObjectSpace.each_object(Class).select { |klass| klass < self }
+   end
+
+   def self.load
+     path_parsers = File.join(File.dirname(__FILE__), 'file_parsers')
+     Dir.glob(path_parsers+'/*').each do |parser|
+       require parser
+     end
+     get_descendants.each do |descendant|
+       @@parsers[descendant.format] = descendant if descendant.available?
+     end
+   end
+
+   def self.select(format)
+     return @@parsers[format]
+   end
+
+   ########################################################################################
+   ## PARSER-DEPENDENT METHODS
+   ########################################################################################
+   def self.available?
+     return false
+   end
+
+   def self.format
+     return 'master'
+   end
+
+   def initialize(folder, chunk_size)
+     @folder = folder
+     @chunk_size = chunk_size
+     @chrom = nil
+     @coords = []
+     @packs = 0
+   end
+
+   def parse(line)
+
+   end
+
+   def write_compressed_data
+     p = @packs * @chunk_size
+     gz_path = File.join(@folder, "#{@chrom}_#{p}.gz")
+     Zlib::GzipWriter.open(gz_path) do |writer|
+       Yajl::Encoder.encode(@coords, writer)
+     end
+     @packs += 1
+   end
+ end
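
FileParser acts as a small plugin registry: FileParser.load requires every file under lib/anncrsnp/file_parsers/ and registers each subclass whose available? returns true under its format key, so FileParser.select(format) can pick a parser at runtime. A hypothetical sketch of how a new format would plug in (BedgraphParser is illustrative, not part of the gem):

    class BedgraphParser < FileParser
      def self.available?
        return true
      end

      def self.format
        return 'bedgraph' # key used by FileParser.select
      end

      def parse(line)
        # per-line parsing for the format goes here
      end
    end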
@@ -0,0 +1,65 @@
+ class WigfixParser < FileParser
+   def initialize(folder, chunk_size)
+     super
+     @start = 1
+     @step = 1
+   end
+
+   def self.available?
+     return true
+   end
+
+   def self.format
+     return 'wigfix'
+   end
+
+   def parse(line)
+     #fixedStep chrom=chr11 start=60001 step=1
+     if line.include?('fixedStep')
+       line =~ /fixedStep chrom=(\S+) start=(\d+) step=(\d+)/
+       if !@chrom.nil? && @chrom != $1 # Chromosome changed; write the buffered coordinates
+         #puts "=> #{@packs}\t#{@start}\tx"
+         #puts @coords.first.inspect
+         #puts @coords.last.inspect
+         write_compressed_data
+         @coords = []
+       end
+       @chrom = $1
+       last_start = @start
+       @start = $2.to_i
+       diff = @start - last_start # Create dummy packs to fill gaps in coordinate scores
+       if diff >= @chunk_size
+         (diff/@chunk_size).times do
+           #puts "=> #{@packs}\t#{@start}\td"
+           #puts @coords.first.inspect
+           #puts @coords.last.inspect
+           write_compressed_data
+           @coords = []
+         end
+       else
+         if @start/@chunk_size != last_start/@chunk_size # Current coordinate belongs to a different pack than the previous one; write the buffered coordinates
+           #puts "=> #{@packs}\t#{@start}\te"
+           #puts @coords.first.inspect
+           #puts @coords.last.inspect
+           write_compressed_data
+           @coords = []
+         end
+       end
+       @step = $3.to_i
+     else
+       if @start % @chunk_size == 0 # We have reached the chunk size, write it to disk
+         #puts "=> #{@packs}\t#{@start}\tl"
+         #puts @coords.first.inspect
+         #puts @coords.last.inspect
+         write_compressed_data
+         @coords = []
+       end
+       @coords << [@start, line.to_f]
+       @start += @step
+     end
+   end
+
+   def get_data
+     return @coords
+   end
+ end
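
For reference, WigfixParser consumes UCSC fixed-step wiggle data: a fixedStep declaration line followed by one score per line, with the coordinate advancing by the declared step. An illustrative fragment (the scores are made up):

    fixedStep chrom=chr11 start=60001 step=1
    0.064
    0.071
    0.059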
@@ -0,0 +1,214 @@
+ require 'json'
+ require 'rroc'
+ require 'gchart'
+
+ # MyWorkerManager class is used to implement the methods
+ # to send and receive the data to or from workers
+ class PositionSelectionManager < ScbiMapreduce::WorkManager
+
+   ######################################################################################################
+   ## MANAGER BASIC METHODS
+   ######################################################################################################
+
+   # init_work_manager is executed at the start, prior to any processing.
+   # You can use init_work_manager to initialize global variables, open files, etc...
+   # Note that an instance of MyWorkerManager will be created for each
+   # worker connection, and thus, all global variables here should be
+   # class variables (starting with @@)
+   def self.init_work_manager(options)
+     @@options = options
+     @@positions, @@groups = load_selected_positions(@@options[:selected_positions])
+     @@active_data = File.open(File.join(@@options[:preprocessed_data], 'active_data')).readlines.map {|item| item.chomp}
+     @@used_data = 0
+     @@used_position = 0
+     @@all_data = {}
+   end
+
+   # end_work_manager is executed at the end, when all the processing is done.
+   # You can use it to close files opened in init_work_manager
+   def self.end_work_manager
+     positions_ids = []
+     scores = {} # Create genomic features table
+     $LOG.info "Create general scores table"
+     @@all_data.each do |data, positions_info|
+       data_scores = []
+       positions_info.each do |chr, position_info|
+         position_info.each do |position, score|
+           data_scores << ["#{chr}_#{position.to_s}", score]
+         end
+       end
+       data_scores.sort!{|sc1, sc2| sc1.first <=> sc2.first}
+       scores[data] = data_scores.map{|sc| sc.last}
+       positions_ids = data_scores.map{|sc| sc.first} if positions_ids.empty?
+     end
+
+     if !@@groups.empty?
+       tags = positions_ids.map{|id| # Create group tag vector aligned with the scores table
+         tag = @@groups[id]
+         if tag == 0
+           tag = -1
+         else
+           tag = 1
+         end
+       }
+       if !@@options[:no_auc]
+         $LOG.info "Calculating AUC for each genomic feature"
+         aucs = get_aucs(tags, scores) # Generate the area under the curve for each genomic feature
+         File.open(File.join(@@options[:selected_positions_folder], 'AUCs'), 'w'){ |f|
+           aucs.each do |data_type, auc|
+             f.puts "#{data_type}\t#{auc.join("\t")}"
+           end
+         }
+       end
+       $LOG.info "Creating training files for TensorFlow"
+       create_positions_sets_for_tensorflow(@@options[:selected_positions_folder], scores, tags)
+     end
+
+     data_types = scores.keys
+     File.open(File.join(@@options[:selected_positions_folder], 'all_data'), 'w'){ |f| # final genomic feature scores table for the gold standard
+       f.puts ['position'].concat(data_types).join("\t")
+       positions_ids.each_with_index do |id, i|
+         record = [id]
+         data_types.each do |dt|
+           record << scores[dt][i]
+         end
+         f.puts record.join("\t")
+       end
+     }
+   end
+
+   # worker_initial_config is used to send initial parameters to workers.
+   # The method is executed once per worker
+   def worker_initial_config
+     return @@options
+   end
+
+   # next_work method is called every time a worker needs new work
+   # Here you can read data from disk
+   # This method must return the work data, or nil if no more data is available
+   def next_work
+     begin
+       if @@used_data >= @@active_data.length
+         e = nil # signal worker disconnect
+       else
+         chr = @@positions.keys[@@used_position]
+         e = [@@active_data[@@used_data], chr, @@positions[chr]]
+         @@used_position += 1
+         if @@used_position >= @@positions.length
+           @@used_data +=1
+           @@used_position = 0
+         end
+       end
+
+     rescue Exception => e
+       puts e.message
+       puts e.backtrace
+
+     end
+     return e
+
+   end
+
+
+   # work_received is executed each time a worker has finished a job.
+   # Here you can write results down to disk, perform some aggregated statistics, etc...
+   def work_received(results)
+     results.each do |data, positions_info|
+       query = @@all_data[data]
+       if query.nil?
+         @@all_data[data] = positions_info
+       else
+         @@all_data[data] = query.merge(positions_info)
+       end
+     end
+   end
+
+   ######################################################################################################
+   ## CUSTOM ADDITIONAL METHODS
+   ######################################################################################################
+
+   def self.load_selected_positions(file_path)
+     selected_positions = {}
+     groups = {}
+     File.open(file_path).each do |line|
+       line.chomp!
+       chr, position, group = line.split("\t")
+       record = position.to_i
+       if !group.nil?
+         group = group.to_i
+         groups["#{chr}_#{position}"] = group
+       end
+       query = selected_positions[chr]
+       if query.nil?
+         selected_positions[chr] = [record]
+       else
+         query << record
+         query.uniq!
+       end
+     end
+     selected_positions.each do |chr, positions|
+       positions.sort!
+     end
+     return selected_positions, groups
+   end
+
+   def self.get_aucs(tags, scores)
+     aucs = {}
+     scores.each do |data_type, data_scores|
+       matrix = []
+       data_scores.each_with_index do |score, i|
+         matrix << [score, tags[i]]
+       end
+       pts = ROC.curve_points(matrix)
+       aucs[data_type] = [ROC.auc(matrix), GChart.scatter(:data => [pts.collect { |x| x[0] }, pts.collect { |x| x[1] }]).to_url]
+     end
+     return aucs
+   end
+
+   def self.create_positions_sets_for_tensorflow(path_folder, scores, tags)
+     validation_set_proportion = 0.2
+     positions_number = tags.length
+     validation_set_length = (positions_number * validation_set_proportion).to_i
+     training_set_length = positions_number - validation_set_length
+     validation_set_positions = [] # Decide which positions will belong to the validation set
+     while validation_set_positions.length < validation_set_length
+       position = rand(positions_number) # We need random 0-based indexes
+       validation_set_positions << position if !validation_set_positions.include?(position)
+     end
+     tags.map!{|t| # TensorFlow needs non-negative integer tags, so remap the -1/1 tags used for the AUC computation
+       if t == -1
+         0
+       else
+         t
+       end
+     }
+     genomic_features = scores.keys
+     training_set = []
+     validation_set = []
+     tags.each_with_index do |tag, n|
+       record = [] # Create position record
+       genomic_features.each do |gf|
+         record << scores[gf][n]
+       end
+       record << tag
+       if validation_set_positions.include?(n) # Send record to the corresponding set
+         validation_set << record
+       else
+         training_set << record
+       end
+     end
+     tag_names = tags.uniq #TODO: improve to ensure exact correspondence
+     training_set.unshift([training_set.length, genomic_features.length].concat(tag_names)) # set headers
+     validation_set.unshift([validation_set.length, genomic_features.length].concat(tag_names)) # set headers
+     write_set(training_set, File.join(path_folder, 'training_set.csv'))
+     write_set(validation_set, File.join(path_folder, 'validation_set.csv'))
+   end
+
+   def self.write_set(set, path)
+     File.open(path, 'w'){|f|
+       set.each do |record|
+         f.puts record.join(',')
+       end
+     }
+   end
+ end
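
load_selected_positions above expects the --selected_positions file to be tab-separated, with a chromosome (as chrN), a 1-based coordinate, and an optional 0/1 group flag per line. An illustrative example (the coordinates are made up):

    chr1	10583	1
    chr1	13302	0
    chr2	45895	1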
@@ -0,0 +1,140 @@
+ ROOT_PATH=File.dirname(__FILE__)
+ $: << File.expand_path(ROOT_PATH)
+
+ require 'benchmark'
+
+ # MyWorker defines the behaviour of workers.
+ # Here is where the real processing takes place
+ class PositionSelectionWorker < ScbiMapreduce::Worker
+
+   ######################################################################################################
+   ## WORKER BASIC METHODS
+   ######################################################################################################
+
+   # starting_worker method is called once at initialization
+   # and allows you to initialize your variables
+   def starting_worker
+
+     # You can use worker logs at any time in this way:
+     $WORKER_LOG.info "Starting a worker"
+
+   end
+
+
+   # receive_initial_config is called only once, just after
+   # the first connection, when the initial parameters are
+   # received from the manager
+   def receive_initial_config(parameters)
+     @options = parameters
+     # Reads the parameters
+
+     # You can use worker logs at any time in this way:
+     $WORKER_LOG.info "Params received"
+
+     # save received parameters, if any
+     # @params = parameters
+   end
+
+
+   # process_object method is called for each received object.
+   # Be aware that objs is always an array, and you must iterate
+   # over it if you need to process each item independently
+   #
+   # The value returned here will be received by the work_received
+   # method of your worker_manager subclass.
+   def process_object(objs)
+     all_data = nil
+     Benchmark.bm do |x|
+       x.report('PosS'){
+
+         packs, datas = get_info_to_search(objs)
+         all_data = {}
+         datas.each do |data|
+           selected_scores = {}
+           packs.each do |chr, ps|
+             scores = []
+             ps.each do |pack, positions|
+               info_path = File.join(@options[:preprocessed_data], data, "#{chr}_#{pack}.gz")
+               #puts info_path
+               if File.exist?(info_path)
+                 chr_data = []
+                 Zlib::GzipReader.open(info_path) {|gz| chr_data = JSON.parse(gz.read)}
+                 scores.concat(get_scores(chr_data, positions))
+               end
+             end
+             selected_scores[chr] = scores
+           end
+           all_data[data] = selected_scores
+         end
+         # return objs back to manager
+
+       }
+     end
+     return all_data
+   end
+
+   # called once, when the worker is about to be closed
+   def closing_worker
+
+   end
+
+   ######################################################################################################
+   ## WORKER CUSTOM METHODS
+   ######################################################################################################
+
+   def get_info_to_search(objs)
+     packs = {}
+     datas = []
+     objs.each do |data, chr, positions| # Analyse which chromosomes and packs must be loaded
+       datas << data if !datas.include?(data)
+       positions.each do |position|
+         pack = position/@options[:index_size]
+         pack = pack * @options[:index_size]
+         #puts "#{position} ==> #{pack}"
+         query_chr = packs[chr]
+         if query_chr.nil?
+           packs[chr] = { pack => [position]}
+         else
+           query_pack = query_chr[pack]
+           if query_pack.nil?
+             query_chr[pack] = [position]
+           else
+             query_pack << position
+           end
+         end
+       end
+     end
+     return packs, datas
+   end
+
+   def get_scores(chr_data, positions)
+     positions_scores = []
+     # Handle positions outside the existing coordinates
+     lower_limit = chr_data.first.first
+     upper_limit = chr_data.last.first
+     positions_scores.concat(positions.select{|pos| pos < lower_limit}.map{|pos| [pos, 0]}) # At the beginning
+     filtered_positions = positions.select{|pos| pos >= lower_limit && pos <= upper_limit }
+     #--------------------------------------------------------------------------------------------------
+     if !filtered_positions.empty?
+       current_position = filtered_positions.shift
+       chr_data.each do |coord, score|
+         if coord == current_position
+           positions_scores << [current_position, score]
+           break if filtered_positions.empty?
+           current_position = filtered_positions.shift
+         elsif coord > current_position # We have encountered a gap and the current position is in it
+           while coord > current_position # score positions within the gap as 0
+             positions_scores << [current_position, 0]
+             break if filtered_positions.empty?
+             current_position = filtered_positions.shift
+           end
+           break if filtered_positions.empty?
+         end
+       end
+     end
+
+     positions_scores.concat(positions.select{|pos| pos > upper_limit}.map{|pos| [pos, 0]}) # At the end
+
+     return positions_scores
+   end
+ end
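
In get_info_to_search above, the integer division followed by multiplication floors each coordinate to the start of its pack, which matches the pack offsets FileParser#write_compressed_data encodes in its "#{@chrom}_#{p}.gz" file names. A worked example, assuming the default index size of 1,000,000:

    index_size = 1_000_000
    position   = 2_345_678
    pack = (position / index_size) * index_size # => 2000000
    # the worker then reads, e.g., chr1_2000000.gz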
@@ -0,0 +1,87 @@
+ require 'json'
+
+ # MyWorkerManager class is used to implement the methods
+ # to send and receive the data to or from workers
+ class PreprocessingManager < ScbiMapreduce::WorkManager
+
+   ######################################################################################################
+   ## MANAGER BASIC METHODS
+   ######################################################################################################
+
+   # init_work_manager is executed at the start, prior to any processing.
+   # You can use init_work_manager to initialize global variables, open files, etc...
+   # Note that an instance of MyWorkerManager will be created for each
+   # worker connection, and thus, all global variables here should be
+   # class variables (starting with @@)
+   def self.init_work_manager(options)
+     @@options = options
+     $LOG.info 'Load genomic feature links'
+     @@features = load_links(options[:file])
+     $LOG.info "Loaded #{@@features.length} genomic feature links"
+
+     # FEATURE DIRECTORIES
+     @@features.each do |feature|
+       ft_folder = File.join(@@options[:preprocessed_data], feature[1]) # feature name
+       ft_temp_folder = File.join(@@options[:temp], feature[1])
+       Dir.mkdir(ft_folder) if !Dir.exist?(ft_folder)
+       Dir.mkdir(ft_temp_folder) if !Dir.exist?(ft_temp_folder)
+     end
+     @@processed_features = 0
+   end
+
+   # end_work_manager is executed at the end, when all the processing is done.
+   # You can use it to close files opened in init_work_manager
+   def self.end_work_manager
+     File.open(File.join(@@options[:preprocessed_data], 'active_data'), 'w'){ |f| f.puts @@features.map{|feat| feat[1]}.uniq.join("\n")}
+   end
+
+   # worker_initial_config is used to send initial parameters to workers.
+   # The method is executed once per worker
+   def worker_initial_config
+     return @@options
+   end
+
+   # next_work method is called every time a worker needs new work
+   # Here you can read data from disk
+   # This method must return the work data, or nil if no more data is available
+   def next_work
+     begin
+       if @@processed_features >= @@features.length
+         e = nil # signal worker disconnect
+       else
+         e = @@features[@@processed_features]
+       end
+
+       @@processed_features += 1
+     rescue Exception => e
+       puts e.message
+       puts e.backtrace
+
+     end
+     return e
+
+   end
+
+
+   # work_received is executed each time a worker has finished a job.
+   # Here you can write results down to disk, perform some aggregated statistics, etc...
+   def work_received(results)
+
+     # write_data_to_disk(results)
+   end
+
+   ######################################################################################################
+   ## CUSTOM ADDITIONAL METHODS
+   ######################################################################################################
+
+   def self.load_links(file_path)
+     features = []
+     File.open(file_path).each do |line|
+       line.chomp!
+       link, feature, cols, header, format = line.split("\t")
+       features << [link, feature, cols.split(',').map{|col| col.to_i}, header.to_i, format]
+     end
+     return features
+   end
+
+ end
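
load_links above reads one tab-separated record per line: the download link, the feature name, a comma-separated list of column numbers, the number of header lines, and the format name. A hypothetical record (the URL is illustrative):

    http://hgdownload.example.org/phyloP/chr1.wigFix.gz	phyloP	1,2	0	wigfix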
@@ -0,0 +1,139 @@
+ ROOT_PATH=File.dirname(__FILE__)
+ $: << File.expand_path(ROOT_PATH)
+
+ require 'yajl'
+ require 'open-uri'
+ require 'benchmark'
+ require 'file_parser'
+
+
+ # MyWorker defines the behaviour of workers.
+ # Here is where the real processing takes place
+ class PreprocessingWorker < ScbiMapreduce::Worker
+
+   ######################################################################################################
+   ## WORKER BASIC METHODS
+   ######################################################################################################
+
+   # starting_worker method is called once at initialization
+   # and allows you to initialize your variables
+   def starting_worker
+
+     # You can use worker logs at any time in this way:
+     $WORKER_LOG.info "Starting a worker"
+
+   end
+
+
+   # receive_initial_config is called only once, just after
+   # the first connection, when the initial parameters are
+   # received from the manager
+   def receive_initial_config(parameters)
+     @options = parameters
+     # Reads the parameters
+
+     # You can use worker logs at any time in this way:
+     $WORKER_LOG.info "Params received"
+
+     # save received parameters, if any
+     # @params = parameters
+   end
+
+
+   # process_object method is called for each received object.
+   # Be aware that objs is always an array, and you must iterate
+   # over it if you need to process each item independently
+   #
+   # The value returned here will be received by the work_received
+   # method of your worker_manager subclass.
+   def process_object(objs)
+     Benchmark.bm do |x|
+       x.report('Prep'){
+
+         FileParser.load
+         objs.each do |link, feature, cols, header, format| # iterate over all objects received
+           $WORKER_LOG.info "Processing link: #{feature}, #{format}, #{link}"
+           ft_folder = File.join(@options[:preprocessed_data], feature)
+           ft_temp_folder = File.join(@options[:temp], feature)
+           temp_file = download_data(link, cols, header, format, ft_temp_folder)
+           if !@options[:downloaded_only]
+             if File.exist?(temp_file)
+               extract_data(format, temp_file, ft_folder)
+             else
+               $WORKER_LOG.info "WARNING: Temporary file #{temp_file} has not been downloaded for feature #{feature}, so it will be skipped"
+             end
+           else
+             $WORKER_LOG.info "Download-only mode, skipping processing of temp files"
+           end
+         end
+
+       }
+     end
+     # return objs back to manager
+     return []
+   end
+
+   # called once, when the worker is about to be closed
+   def closing_worker
+
+   end
+
+   ######################################################################################################
+   ## WORKER CUSTOM METHODS
+   ######################################################################################################
+
+   # Download protocols
+   #-----------------------------------------
+   def download_data(link, cols, header, format, temp)
+     protocol, url = link.split('://')
+     temp_file = nil
+     if protocol == 'http'
+       temp_file = File.join(temp, url.split('/').last)
+       if !File.exist?(temp_file)
+         $WORKER_LOG.info "Downloading #{link}"
+         get_http_data(url, temp_file)
+       else
+         $WORKER_LOG.info "Link was downloaded in a previous execution. Skipping download of #{link}"
+       end
+     elsif protocol == 'ftp'
+     else
+       $WORKER_LOG.info "WARNING: protocol #{protocol} in link #{link} is not supported"
+     end
+     return temp_file
+   end
+
+   def get_http_data(url, temp)
+     File.open(temp, "wb") do |saved_file|
+       open("http://#{url}", "rb") do |read_file|
+         saved_file.write(read_file.read)
+       end
+     end
+   end
+
+   # File decompression methods
+   #-----------------------------------------
+   def extract_data(format, temp, folder)
+     data = {}
+     parser_class = FileParser.select(format)
+     parser = parser_class.new(folder, @options[:index_size])
+     $WORKER_LOG.info "Processing temporary file #{temp}"
+     if temp.include?('.gz')
+       #data = get_gz(temp, parser)
+       get_gz(temp, parser)
+     else
+       # other formats are not handled yet
+     end
+     parser.write_compressed_data # Write remaining buffered data
+     $WORKER_LOG.info "Finished processing temporary file #{temp}"
+     return data
+   end
+
+   def get_gz(temp, parser)
+     Zlib::GzipReader.open(temp) {|gz|
+       gz.each do |line|
+         parser.parse(line.chomp)
+       end
+     }
+     #return parser.get_data
+   end
+ end
@@ -1,3 +1,3 @@
  module Anncrsnp
-   VERSION = "0.1.6"
+   VERSION = "0.1.7"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: anncrsnp
  version: !ruby/object:Gem::Version
-   version: 0.1.6
+   version: 0.1.7
  platform: ruby
  authors:
  - Elena Rojano
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-07-25 00:00:00.000000000 Z
+ date: 2016-09-28 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -98,6 +98,7 @@ executables:
  - grdbfinder.rb
  - grdbmanager.rb
  - masterfeatures.rb
+ - retriever.rb
  - setup
  - statistics.rb
  extensions: []
@@ -115,6 +116,7 @@ files:
  - bin/grdbfinder.rb
  - bin/grdbmanager.rb
  - bin/masterfeatures.rb
+ - bin/retriever.rb
  - bin/setup
  - bin/statistics.rb
  - database/.DS_Store
@@ -122,7 +124,13 @@ files:
  - database/deleteme
  - lib/anncrsnp.rb
  - lib/anncrsnp/dataset.rb
+ - lib/anncrsnp/file_parser.rb
+ - lib/anncrsnp/file_parsers/wigfix_parser.rb
  - lib/anncrsnp/parsers/ucscparser.rb
+ - lib/anncrsnp/position_selection_manager.rb
+ - lib/anncrsnp/position_selection_worker.rb
+ - lib/anncrsnp/preprocessing_manager.rb
+ - lib/anncrsnp/preprocessing_worker.rb
  - lib/anncrsnp/version.rb
  homepage: ''
  licenses: