anncrsnp 0.1.6 → 0.1.7
- checksums.yaml +4 -4
- data/bin/grdbfinder.rb +38 -10
- data/bin/retriever.rb +147 -0
- data/lib/anncrsnp/file_parser.rb +53 -0
- data/lib/anncrsnp/file_parsers/wigfix_parser.rb +65 -0
- data/lib/anncrsnp/position_selection_manager.rb +214 -0
- data/lib/anncrsnp/position_selection_worker.rb +140 -0
- data/lib/anncrsnp/preprocessing_manager.rb +87 -0
- data/lib/anncrsnp/preprocessing_worker.rb +139 -0
- data/lib/anncrsnp/version.rb +1 -1
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ee18ae125e8e7b9738d3dd493ee66d57a374f885
+  data.tar.gz: 27e49ea014fbbdfb2aba568847c3214d24ee621e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd254d0ed92720ce4a6bc1909e2bff86a845a3714549460e29f4f290bed7376e6d9b872aea58bc8093724dc0545fa3403213a43d3fe269130477cf6cb8ced556
+  data.tar.gz: 1269b73c8b8f6147940428a27ad5d54ec06b6ea8381fe0894ace30854e14b851cf893778cb4310630db56c21f7fb704ecfc1889ffa9624a315b454e4f8911d0d

data/bin/grdbfinder.rb
CHANGED
@@ -347,16 +347,44 @@ def download_database(database_path)
   out_path = File.dirname(database_path)
   puts "Downloading database in #{out_path}, please be patient..."
   zip_path = File.join(out_path, 'database.zip')
-
-
-
-
-
-
-
-
-
-
+  # Code from https://www.ruby-forum.com/topic/4413829
+  target = "http://bio-267-data.uma.es/database.zip"
+
+  bytes_total = nil
+
+  open(target, "rb",
+    :content_length_proc => lambda{|content_length|
+      bytes_total = content_length},
+    :progress_proc => lambda{|bytes_transferred|
+      if bytes_total
+        # Print progress
+        print("\r#{bytes_transferred}/#{bytes_total}")
+      else
+        # We don't know how much we get, so just print the number
+        # of transferred bytes
+        print("\r#{bytes_transferred} (total size unknown)")
+      end
+    }) do |page|
+    # Now the real operation
+    File.open(zip_path, "wb") do |file|
+      # The file may not fit into RAM entirely, so copy it
+      # chunk by chunk.
+      while chunk = page.read(1024)
+        file.write(chunk)
+      end
+    end
+  end
+
+  if File.exists?(zip_path)
+    puts "\nDecompressing database..."
+    Zip::File.open(zip_path) do |zip_file|
+      zip_file.each do |entry|
+        entry.extract(database_path)
+      end
+    end
+  else
+    puts "ERROR: #{zip_path} was not found"
+    Process.exit
   end
   if File.exists?(database_path)
     File.delete(zip_path)
data/bin/retriever.rb
ADDED
@@ -0,0 +1,147 @@
+#! /usr/bin/env ruby
+ROOT_PATH = File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'anncrsnp'))
+
+
+require 'optparse'
+require 'scbi_mapreduce'
+require 'preprocessing_manager'
+require 'position_selection_manager'
+
+#####################################################################
+### OPTPARSE
+#####################################################################
+
+options = {}
+OptionParser.new do |opts|
+  opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
+
+  ### PARALLELISATION OPTIONS
+  #####################################################################
+  options[:server_ip] = '0.0.0.0'
+  opts.on( '-s', '--server IP', 'Server IP. You can use a partial IP to select the appropriate interface' ) do |server_ip|
+    options[:server_ip] = server_ip
+  end
+
+  # server port
+  options[:port] = 0 # any free port
+  opts.on( '-p', '--port PORT', 'Server port. If set to 0, an arbitrary free port will be used') do |port|
+    options[:port] = port.to_i
+  end
+
+  # set number of workers. You can also provide an array with worker names.
+  # Those worker names can be read from a file produced by the existing
+  # queue system, if any.
+  options[:workers] = 2
+  opts.on( '-w', '--workers COUNT', 'Number of workers, or file containing machine names to launch workers with ssh' ) do |workers|
+    if File.exists?(workers)
+      # use workers file
+      options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
+    else
+      begin
+        options[:workers] = Integer(workers)
+      rescue
+        STDERR.puts "ERROR: Invalid workers parameter #{options[:workers]}"
+        exit
+      end
+    end
+  end
+
+  # chunk size
+  options[:chunk_size] = 1
+  opts.on( '-g', '--group_size chunk_size', 'Group sequences in chunks of size <chunk_size>' ) do |cs|
+    options[:chunk_size] = cs.to_i
+  end
+
+  ### EXECUTION OPTIONS
+  #####################################################################
+  options[:index_size] = 1000000
+  opts.on( '-x', '--index_size INTEGER', 'Size of genomic features data packs' ) do |is|
+    options[:index_size] = is.to_i
+  end
+
+  options[:file] = nil
+  opts.on("-f", "--file-links PATH", "Input file with links to retrieve data") do |f|
+    options[:file] = f
+  end
+
+  options[:output] = 'data'
+  opts.on("-o", "--output PATH", "Output folder path") do |f|
+    options[:output] = f
+  end
+
+  options[:downloaded_only] = FALSE
+  opts.on("--download_only", "Only download genomic feature files, do not process them") do
+    options[:downloaded_only] = TRUE
+  end
+
+  options[:no_auc] = FALSE
+  opts.on("--no_auc", "Do not calculate the AUC for each genomic feature") do
+    options[:no_auc] = TRUE
+  end
+
+  options[:selected_positions] = nil
+  opts.on("--selected_positions PATH", "Tabular file with chromosome (as chrN) and base-1 coordinates. Optionally, a third field can be added with 0/1 values for positive/negative groups") do |selected|
+    options[:selected_positions] = selected
+  end
+
+end.parse!
+
+#####################################################################
+### MAIN
+#####################################################################
+
+# GENERAL FOLDER
+Dir.mkdir(options[:output]) if !Dir.exist?(options[:output])
+
+# MAPREDUCE LAUNCHING
+##########################################################
+$LOG = Logger.new(STDOUT)
+$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
+# Genomic feature data downloading and preprocessing
+#-----------------------------------------------------------------------------
+if !options[:file].nil?
+  if File.exists?(options[:file])
+    temp = File.join(options[:output], 'temp')
+    options[:temp] = temp
+    Dir.mkdir(temp) if !Dir.exist?(temp)
+    preprocessed_data = File.join(options[:output], 'preprocessed_data')
+    options[:preprocessed_data] = preprocessed_data
+    Dir.mkdir(preprocessed_data) if !Dir.exist?(preprocessed_data)
+
+    $LOG.info 'Starting PREPROCESSING server'
+    custom_worker_file = File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'preprocessing_worker.rb')
+    PreprocessingManager.init_work_manager(options)
+
+    mgr = ScbiMapreduce::Manager.new( options[:server_ip], options[:port], options[:workers], PreprocessingManager, custom_worker_file, STDOUT) # launch processor server
+    mgr.chunk_size = options[:chunk_size]
+    mgr.start_server # start processing
+    $LOG.info 'Closing PREPROCESSING server'
+  else
+    puts "Links file does not exist:\n#{options[:file]}"
+    Process.exit()
+  end
+end
+
+# Genomic feature data position selection
+#-----------------------------------------------------------------------------
+if !options[:selected_positions].nil?
+  if File.exist?(options[:selected_positions])
+    selected_positions_folder = File.join(options[:output], 'selected_positions')
+    options[:selected_positions_folder] = selected_positions_folder
+    Dir.mkdir(selected_positions_folder) if !Dir.exist?(selected_positions_folder)
+
+    $LOG.info 'Starting POSITION_SELECTION server'
+    custom_worker_file = File.join(ROOT_PATH, '..', 'lib', 'anncrsnp', 'position_selection_worker.rb')
+    PositionSelectionManager.init_work_manager(options)
+
+    mgr = ScbiMapreduce::Manager.new( options[:server_ip], options[:port], options[:workers], PositionSelectionManager, custom_worker_file, STDOUT) # launch processor server
+    mgr.chunk_size = options[:chunk_size]
+    mgr.start_server # start processing
+    $LOG.info 'Closing POSITION_SELECTION server'
+  else
+    puts "File with selected positions does not exist:\n#{options[:selected_positions]}"
+    Process.exit()
+  end
+end
+
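
The script is driven entirely by the options above; a hypothetical invocation (links.txt, positions.txt, and the worker count are made up for illustration):

    # Run from Ruby; equivalent to typing the command in a shell.
    system('ruby', 'bin/retriever.rb',
           '-f', 'links.txt',                        # download links, one per line
           '-o', 'data',                             # output folder
           '-w', '4',                                # four local workers
           '-x', '1000000',                          # pack size in bases
           '--selected_positions', 'positions.txt')  # chr/position/group table
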
data/lib/anncrsnp/file_parser.rb
ADDED
@@ -0,0 +1,53 @@
+require 'yajl'
+class FileParser
+  @@parsers = {}
+  def self.get_descendants
+    return ObjectSpace.each_object(Class).select { |klass| klass < self }
+  end
+
+  def self.load
+    path_parsers = File.join(File.dirname(__FILE__), 'file_parsers')
+    Dir.glob(path_parsers+'/*').each do |parser|
+      require parser
+    end
+    get_descendants.each do |descendant|
+      @@parsers[descendant.format] = descendant if descendant.available?
+    end
+  end
+
+  def self.select(format)
+    return @@parsers[format]
+  end
+
+  ########################################################################################
+  ## PARSER-DEPENDENT METHODS
+  ########################################################################################
+  def self.available?
+    return FALSE
+  end
+
+  def self.format
+    return 'master'
+  end
+
+  def initialize(folder, chunk_size)
+    @folder = folder
+    @chunk_size = chunk_size
+    @chrom = nil
+    @coords = []
+    @packs = 0
+  end
+
+  def parse(line)
+
+  end
+
+  def write_compressed_data
+    p = @packs * @chunk_size
+    gz_path = File.join(@folder, "#{@chrom}_#{p}.gz")
+    Zlib::GzipWriter.open(gz_path) do |writer|
+      Yajl::Encoder.encode(@coords, writer)
+    end
+    @packs += 1
+  end
+end
data/lib/anncrsnp/file_parsers/wigfix_parser.rb
ADDED
@@ -0,0 +1,65 @@
+class WigfixParser < FileParser
+  def initialize(folder, chunk_size)
+    super
+    @start = 1
+    @step = 1
+  end
+
+  def self.available?
+    return TRUE
+  end
+
+  def self.format
+    return 'wigfix'
+  end
+
+  def parse(line)
+    # fixedStep chrom=chr11 start=60001 step=1
+    if line.include?('fixedStep')
+      line =~ /fixedStep chrom=(\S+) start=(\d+) step=(\d+)/
+      if !@chrom.nil? && @chrom != $1 # We changed chromosome, so write the buffered coordinates
+        #puts "=> #{@packs}\t#{@start}\tx"
+        #puts @coords.first.inspect
+        #puts @coords.last.inspect
+        write_compressed_data
+        @coords = []
+      end
+      @chrom = $1
+      last_start = @start
+      @start = $2.to_i
+      diff = @start - last_start # Create dummy files to fill gaps in coordinate scores
+      if diff >= @chunk_size
+        (diff/@chunk_size).times do
+          #puts "=> #{@packs}\t#{@start}\td"
+          #puts @coords.first.inspect
+          #puts @coords.last.inspect
+          write_compressed_data
+          @coords = []
+        end
+      else
+        if @start/@chunk_size != last_start/@chunk_size # The current coordinate belongs to a different pack than the previous one; write the buffered coordinates
+          #puts "=> #{@packs}\t#{@start}\te"
+          #puts @coords.first.inspect
+          #puts @coords.last.inspect
+          write_compressed_data
+          @coords = []
+        end
+      end
+      @step = $3.to_i
+    else
+      if @start % @chunk_size == 0 # We have reached the chunk size, write it to disk
+        #puts "=> #{@packs}\t#{@start}\tl"
+        #puts @coords.first.inspect
+        #puts @coords.last.inspect
+        write_compressed_data
+        @coords = []
+      end
+      @coords << [@start, line.to_f]
+      @start += @step
+    end
+  end
+
+  def get_data
+    return @coords
+  end
+end
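
The parser writes dummy packs across gaps precisely so that pack files stay contiguous and a reader can locate the pack holding a coordinate with integer division by the pack size (index_size). A worked example with the default index size of 1,000,000:

    index_size = 1_000_000
    position = 60_001                          # from the fixedStep example above
    pack = position / index_size * index_size  # => 0, so look in "chr11_0.gz"
    # position 2_345_678 would give pack 2_000_000, i.e. "chr11_2000000.gz"
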
data/lib/anncrsnp/position_selection_manager.rb
ADDED
@@ -0,0 +1,214 @@
+require 'json'
+require 'rroc'
+require 'gchart'
+
+# MyWorkerManager class is used to implement the methods
+# to send and receive the data to or from workers
+class PositionSelectionManager < ScbiMapreduce::WorkManager
+
+  ######################################################################################################
+  ## MANAGER BASIC METHODS
+  ######################################################################################################
+
+  # init_work_manager is executed at the start, prior to any processing.
+  # You can use init_work_manager to initialize global variables, open files, etc...
+  # Note that an instance of MyWorkerManager will be created for each
+  # worker connection, and thus, all global variables here should be
+  # class variables (starting with @@)
+  def self.init_work_manager(options)
+    @@options = options
+    @@positions, @@groups = load_selected_positions(@@options[:selected_positions])
+    @@active_data = File.open(File.join(@@options[:preprocessed_data], 'active_data')).readlines.map {|item| item.chomp}
+    @@used_data = 0
+    @@used_position = 0
+    @@all_data = {}
+  end
+
+  # end_work_manager is executed at the end, when all the process is done.
+  # You can use it to close files opened in init_work_manager
+  def self.end_work_manager
+    positions_ids = []
+    scores = {} # Create genomic features table
+    $LOG.info "Create general scores table"
+    @@all_data.each do |data, positions_info|
+      data_scores = []
+      positions_info.each do |chr, position_info|
+        position_info.each do |position, score|
+          data_scores << ["#{chr}_#{position.to_s}", score]
+        end
+      end
+      data_scores.sort!{|sc1, sc2| sc1.first <=> sc2.first}
+      scores[data] = data_scores.map{|sc| sc.last}
+      positions_ids = data_scores.map{|sc| sc.first} if positions_ids.empty?
+    end
+
+    if !@@groups.empty?
+      tags = positions_ids.map{|id| # Create vector of group tags aligned with the scores table
+        tag = @@groups[id]
+        if tag == 0
+          tag = -1
+        else
+          tag = 1
+        end
+      }
+      if !@@options[:no_auc]
+        $LOG.info "Calculating AUC for each genomic feature"
+        aucs = get_aucs(tags, scores) # Generate area under the curve for each genomic feature
+        File.open(File.join(@@options[:selected_positions_folder], 'AUCs'), 'w'){ |f|
+          aucs.each do |data_type, auc|
+            f.puts "#{data_type}\t#{auc.join("\t")}"
+          end
+        }
+      end
+      $LOG.info "Creating training files for tensorflow"
+      create_positions_sets_for_tensorflow(@@options[:selected_positions_folder], scores, tags)
+    end
+
+    data_types = scores.keys
+    File.open(File.join(@@options[:selected_positions_folder], 'all_data'), 'w'){ |f| # final genomic feature scores table for the gold standard
+      f.puts ['position'].concat(data_types).join("\t")
+      positions_ids.each_with_index do |id, i|
+        record = [id]
+        data_types.each do |dt|
+          record << scores[dt][i]
+        end
+        f.puts record.join("\t")
+      end
+    }
+  end
+
+  # worker_initial_config is used to send initial parameters to workers.
+  # The method is executed once per worker
+  def worker_initial_config
+    return @@options
+  end
+
+  # next_work method is called every time a worker needs a new work
+  # Here you can read data from disk
+  # This method must return the work data or nil if no more data is available
+  def next_work
+    begin
+      if @@used_data >= @@active_data.length
+        e = nil # worker disconnect signal
+      else
+        chr = @@positions.keys[@@used_position]
+        e = [@@active_data[@@used_data], chr, @@positions[chr]]
+        @@used_position += 1
+        if @@used_position >= @@positions.length
+          @@used_data +=1
+          @@used_position = 0
+        end
+      end
+
+    rescue Exception => e
+      puts e.message
+      puts e.backtrace
+
+    end
+    return e
+
+  end
+
+
+  # work_received is executed each time a worker has finished a job.
+  # Here you can write results down to disk, perform some aggregated statistics, etc...
+  def work_received(results)
+    results.each do |data, positions_info|
+      query = @@all_data[data]
+      if query.nil?
+        @@all_data[data] = positions_info
+      else
+        @@all_data[data] = query.merge(positions_info)
+      end
+    end
+  end
+
+  ######################################################################################################
+  ## CUSTOM ADDITIONAL METHODS
+  ######################################################################################################
+
+  def self.load_selected_positions(file_path)
+    selected_positions = {}
+    groups = {}
+    File.open(file_path).each do |line|
+      line.chomp!
+      chr, position, group = line.split("\t")
+      record = position.to_i
+      if !group.nil?
+        group = group.to_i
+        groups["#{chr}_#{position}"] = group
+      end
+      query = selected_positions[chr]
+      if query.nil?
+        selected_positions[chr] = [record]
+      else
+        query << record
+        query.uniq!
+      end
+    end
+    selected_positions.each do |chr, positions|
+      positions.sort!
+    end
+    return selected_positions, groups
+  end
+
+  def self.get_aucs(tags, scores)
+    aucs = {}
+    scores.each do | data_type, scores|
+      matrix = []
+      scores.each_with_index do |score, i|
+        matrix << [score, tags[i]]
+      end
+      pts = ROC.curve_points(matrix)
+      aucs[data_type] = [ROC.auc(matrix), GChart.scatter(:data => [pts.collect { |x| x[0] }, pts.collect { |x| x[1] }]).to_url]
+    end
+    return aucs
+  end
+
+  def self.create_positions_sets_for_tensorflow(path_folder, scores, tags)
+    validation_set_proportion = 0.2
+    positions_number = tags.length
+    validation_set_length = (positions_number * validation_set_proportion).to_i
+    training_set_length = positions_number - validation_set_length
+    validation_set_positions = [] # Set which positions will belong to the validation set
+    while validation_set_positions.length < validation_set_length
+      position = rand(positions_number - 1) # We need random 0-based positions
+      validation_set_positions << position if !validation_set_positions.include?(position)
+    end
+    tags.map!{|t| # tensorflow needs positive integers as tags, so we change the tag used in the AUC operation
+      if t == -1
+        0
+      else
+        t
+      end
+    }
+    genomic_features = scores.keys
+    training_set = []
+    validation_set = []
+    tags.each_with_index do |tag, n|
+      record = [] # Create record position
+      genomic_features.each do |gf|
+        record << scores[gf][n]
+      end
+      record << tag
+      if validation_set_positions.include?(n) # Send record to the corresponding set
+        validation_set << record
+      else
+        training_set << record
+      end
+    end
+    tag_names = tags.uniq # TODO: improve to ensure exact correspondence
+    training_set.unshift([training_set.length, genomic_features.length].concat(tag_names)) # set headers
+    validation_set.unshift([validation_set.length, genomic_features.length].concat(tag_names)) # set headers
+    write_set(training_set, File.join(path_folder, 'training_set.csv'))
+    write_set(validation_set, File.join(path_folder, 'validation_set.csv'))
+  end
+
+  def self.write_set(set, path)
+    File.open(path, 'w'){|f|
+      set.each do |record|
+        f.puts record.join(',')
+      end
+    }
+  end
+end
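
load_selected_positions reads one tab-separated record per line (chromosome, base-1 position, optional 0/1 group); a sketch of a tiny input and the structures it would produce (values invented for illustration):

    # positions.txt (tab-separated):
    #   chr1  12345  1
    #   chr1  67890  0
    #   chr2  111    1
    positions, groups = PositionSelectionManager.load_selected_positions('positions.txt')
    positions # => {"chr1" => [12345, 67890], "chr2" => [111]}
    groups    # => {"chr1_12345" => 1, "chr1_67890" => 0, "chr2_111" => 1}
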
data/lib/anncrsnp/position_selection_worker.rb
ADDED
@@ -0,0 +1,140 @@
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(ROOT_PATH)
+
+require 'benchmark'
+
+# MyWorker defines the behaviour of workers.
+# Here is where the real processing takes place
+class PositionSelectionWorker < ScbiMapreduce::Worker
+
+  ######################################################################################################
+  ## WORKER BASIC METHODS
+  ######################################################################################################
+
+  # starting_worker method is called one time at initialization
+  # and allows you to initialize your variables
+  def starting_worker
+
+    # You can use worker logs at any time in this way:
+    $WORKER_LOG.info "Starting a worker"
+
+  end
+
+
+  # receive_initial_config is called only once just after
+  # the first connection, when initial parameters are
+  # received from manager
+  def receive_initial_config(parameters)
+    @options = parameters
+    # Reads the parameters
+
+    # You can use worker logs at any time in this way:
+    $WORKER_LOG.info "Params received"
+
+    # save received parameters, if any
+    # @params = parameters
+  end
+
+
+  # process_object method is called for each received object.
+  # Be aware that objs is always an array, and you must iterate
+  # over it if you need to process it independently
+  #
+  # The value returned here will be received by the work_received
+  # method at your worker_manager subclass.
+  def process_object(objs)
+    all_data = nil
+    Benchmark.bm do |x|
+      x.report('PosS'){
+
+        packs, datas = get_info_to_search(objs)
+        all_data = {}
+        datas.each do |data|
+          selected_scores = {}
+          packs.each do |chr, ps|
+            scores = []
+            ps.each do |pack, positions|
+              info_path = File.join(@options[:preprocessed_data], data, "#{chr}_#{pack}.gz")
+              #puts info_path
+              if File.exists?(info_path)
+                chr_data = []
+                Zlib::GzipReader.open(info_path) {|gz| chr_data = JSON.parse(gz.read)}
+                scores.concat(get_scores(chr_data, positions))
+              end
+            end
+            selected_scores[chr] = scores
+          end
+          all_data[data] = selected_scores
+        end
+        # return objs back to manager
+
+      }
+    end
+    return all_data
+  end
+
+  # called once, when the worker is about to be closed
+  def closing_worker
+
+  end
+
+  ######################################################################################################
+  ## WORKER CUSTOM METHODS
+  ######################################################################################################
+
+  def get_info_to_search(objs)
+    packs = {}
+    datas = []
+    objs.each do |data, chr, positions| # Analyse which chromosomes and packs must be loaded
+      datas << data if !datas.include?(data)
+      positions.each do |position|
+        pack = position/@options[:index_size]
+        pack = pack * @options[:index_size]
+        #puts "#{position} ==> #{pack}"
+        query_chr = packs[chr]
+        if query_chr.nil?
+          packs[chr] = { pack => [position]}
+        else
+          query_pack = query_chr[pack]
+          if query_pack.nil?
+            query_chr[pack] = [position]
+          else
+            query_pack << position
+          end
+        end
+      end
+    end
+    return packs, datas
+  end
+
+  def get_scores(chr_data, positions)
+    positions_scores = []
+    # Remove positions outside the existing coordinates
+    lower_limit = chr_data.first.first
+    upper_limit = chr_data.last.first
+    positions_scores.concat(positions.select{|pos| pos < lower_limit}.map{|pos| [pos, 0]}) # At the beginning
+    filtered_positions = positions.select{|pos| pos >= lower_limit && pos <= upper_limit }
+    #--------------------------------------------------------------------------------------------------
+    if !filtered_positions.empty?
+      current_position = filtered_positions.shift
+      chr_data.each do |coord, score|
+        if coord == current_position
+          positions_scores << [current_position, score]
+          break if filtered_positions.empty?
+          current_position = filtered_positions.shift
+        elsif coord > current_position # We have encountered a gap and the current position is in it
+          while coord > current_position # drop positions within the gap
+            positions_scores << [current_position, 0]
+            break if filtered_positions.empty?
+            current_position = filtered_positions.shift
+          end
+          break if filtered_positions.empty?
+        end
+      end
+    end
+
+    positions_scores.concat(positions.select{|pos| pos > upper_limit}.map{|pos| [pos, 0]}) # At the end
+
+    return positions_scores
+  end
+end
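
get_scores walks a pack's sorted [position, score] pairs once and emits a score of 0 for any queried position that falls before, after, or in a gap of the data. A worked example with invented values:

    chr_data  = [[100, 0.5], [101, 0.7], [105, 0.9]] # pack contents
    positions = [99, 101, 103, 200]                  # sorted query positions
    # get_scores(chr_data, positions)
    # => [[99, 0], [101, 0.7], [103, 0], [200, 0]]
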
data/lib/anncrsnp/preprocessing_manager.rb
ADDED
@@ -0,0 +1,87 @@
+require 'json'
+
+# MyWorkerManager class is used to implement the methods
+# to send and receive the data to or from workers
+class PreprocessingManager < ScbiMapreduce::WorkManager
+
+  ######################################################################################################
+  ## MANAGER BASIC METHODS
+  ######################################################################################################
+
+  # init_work_manager is executed at the start, prior to any processing.
+  # You can use init_work_manager to initialize global variables, open files, etc...
+  # Note that an instance of MyWorkerManager will be created for each
+  # worker connection, and thus, all global variables here should be
+  # class variables (starting with @@)
+  def self.init_work_manager(options)
+    @@options = options
+    $LOG.info 'Load genomic features links'
+    @@features = load_links(options[:file])
+    $LOG.info "Loaded #{@@features.length} genomic features links"
+
+    # FEATURE DIRECTORIES
+    @@features.each do |feature|
+      ft_folder = File.join(@@options[:preprocessed_data], feature[1]) # feature name
+      ft_temp_folder = File.join(@@options[:temp], feature[1])
+      Dir.mkdir(ft_folder) if !Dir.exist?(ft_folder)
+      Dir.mkdir(ft_temp_folder) if !Dir.exist?(ft_temp_folder)
+    end
+    @@processed_features = 0
+  end
+
+  # end_work_manager is executed at the end, when all the process is done.
+  # You can use it to close files opened in init_work_manager
+  def self.end_work_manager
+    File.open(File.join(@@options[:preprocessed_data], 'active_data'), 'w'){ |f| f.puts @@features.map{|f| f[1]}.uniq.join("\n")}
+  end
+
+  # worker_initial_config is used to send initial parameters to workers.
+  # The method is executed once per worker
+  def worker_initial_config
+    return @@options
+  end
+
+  # next_work method is called every time a worker needs a new work
+  # Here you can read data from disk
+  # This method must return the work data or nil if no more data is available
+  def next_work
+    begin
+      if @@processed_features >= @@features.length
+        e = nil # worker disconnect signal
+      else
+        e = @@features[@@processed_features]
+      end
+
+      @@processed_features += 1
+    rescue Exception => e
+      puts e.message
+      puts e.backtrace
+
+    end
+    return e
+
+  end
+
+
+  # work_received is executed each time a worker has finished a job.
+  # Here you can write results down to disk, perform some aggregated statistics, etc...
+  def work_received(results)
+
+    # write_data_to_disk(results)
+  end
+
+  ######################################################################################################
+  ## CUSTOM ADDITIONAL METHODS
+  ######################################################################################################
+
+  def self.load_links(file_path)
+    features = []
+    File.open(file_path).each do |line|
+      line.chomp!
+      link, feature, cols, header, format = line.split("\t")
+      features << [link, feature, cols.split(',').map{|col| col.to_i}, header.to_i, format]
+    end
+    return features
+  end
+
+end
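
load_links expects one tab-separated record per line: URL, feature name, comma-separated column indices, number of header lines, and format (matching a FileParser subclass). A hypothetical line and its parsed form:

    # links.txt (tab-separated):
    #   http://example.com/phastCons.wigFix.gz  phastcons  0,1  0  wigfix
    features = PreprocessingManager.load_links('links.txt')
    features.first
    # => ["http://example.com/phastCons.wigFix.gz", "phastcons", [0, 1], 0, "wigfix"]
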
data/lib/anncrsnp/preprocessing_worker.rb
ADDED
@@ -0,0 +1,139 @@
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(ROOT_PATH)
+
+require 'yajl'
+require 'open-uri'
+require 'benchmark'
+require 'file_parser'
+
+
+# MyWorker defines the behaviour of workers.
+# Here is where the real processing takes place
+class PreprocessingWorker < ScbiMapreduce::Worker
+
+  ######################################################################################################
+  ## WORKER BASIC METHODS
+  ######################################################################################################
+
+  # starting_worker method is called one time at initialization
+  # and allows you to initialize your variables
+  def starting_worker
+
+    # You can use worker logs at any time in this way:
+    $WORKER_LOG.info "Starting a worker"
+
+  end
+
+
+  # receive_initial_config is called only once just after
+  # the first connection, when initial parameters are
+  # received from manager
+  def receive_initial_config(parameters)
+    @options = parameters
+    # Reads the parameters
+
+    # You can use worker logs at any time in this way:
+    $WORKER_LOG.info "Params received"
+
+    # save received parameters, if any
+    # @params = parameters
+  end
+
+
+  # process_object method is called for each received object.
+  # Be aware that objs is always an array, and you must iterate
+  # over it if you need to process it independently
+  #
+  # The value returned here will be received by the work_received
+  # method at your worker_manager subclass.
+  def process_object(objs)
+    Benchmark.bm do |x|
+      x.report('Prep'){
+
+        FileParser.load
+        objs.each do |link, feature, cols, header, format| # iterate over all objects received
+          $WORKER_LOG.info "Processing link: #{feature}, #{format}, #{link}"
+          ft_folder = File.join(@options[:preprocessed_data], feature)
+          ft_temp_folder = File.join(@options[:temp], feature)
+          temp_file = download_data(link, cols, header, format, ft_temp_folder)
+          if !@options[:downloaded_only]
+            if File.exist?(temp_file)
+              extract_data(format, temp_file, ft_folder)
+            else
+              $WORKER_LOG.info "WARNING: Temporary file #{temp_file} has not been downloaded for feature #{feature}, so it will be skipped"
+            end
+          else
+            $WORKER_LOG.info "Download only mode, skipping processing of temp files"
+          end
+        end
+
+      }
+    end
+    # return objs back to manager
+    return []
+  end
+
+  # called once, when the worker is about to be closed
+  def closing_worker
+
+  end
+
+  ######################################################################################################
+  ## WORKER CUSTOM METHODS
+  ######################################################################################################
+
+  # Download protocols
+  #-----------------------------------------
+  def download_data(link, cols, header, format, temp)
+    protocol, url = link.split('://')
+    temp_file = nil
+    if protocol == 'http'
+      temp_file = File.join(temp, url.split('/').last)
+      if !File.exist?(temp_file)
+        get_http_data(url, temp_file)
+        $WORKER_LOG.info "Downloading #{link}"
+      else
+        $WORKER_LOG.info "Link was downloaded in a previous execution. Skipping download #{link}"
+      end
+    elsif protocol == 'ftp'
+    else
+      $WORKER_LOG.info "WARNING: protocol: #{protocol} in link: #{link} is not supported"
+    end
+    return temp_file
+  end
+
+  def get_http_data(url, temp)
+    File.open(temp, "wb") do |saved_file|
+      open("http://#{url}", "rb") do |read_file|
+        saved_file.write(read_file.read)
+      end
+    end
+  end
+
+  # File decompression methods
+  #-----------------------------------------
+  def extract_data(format, temp, folder)
+    data = {}
+    parser_class = FileParser.select(format)
+    parser = parser_class.new(folder, @options[:index_size])
+    $WORKER_LOG.info "Processing temporary file #{temp}"
+    if temp.include?('.gz')
+      #data = get_gz(temp, parser)
+      get_gz(temp, parser)
+    else
+
+    end
+    parser.write_compressed_data # Write remaining buffered data
+    $WORKER_LOG.info "End processing temporary file #{temp}"
+    return data
+  end
+
+  def get_gz(temp, parser)
+    Zlib::GzipReader.open(temp) {|gz|
+      gz.each do |line|
+        parser.parse(line.chomp)
+      end
+    }
+    #return parser.get_data
+  end
+end
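
Packs are written as gzip-compressed JSON arrays (Yajl on the write side in FileParser#write_compressed_data, plain JSON.parse on the read side in PositionSelectionWorker), so the two sides interoperate. A sketch of the round trip with an invented pack:

    require 'json'
    require 'yajl'
    require 'zlib'

    coords = [[60001, 0.12], [60002, 0.34]] # invented [position, score] pairs
    Zlib::GzipWriter.open('chr11_0.gz') { |gz| Yajl::Encoder.encode(coords, gz) }
    read_back = Zlib::GzipReader.open('chr11_0.gz') { |gz| JSON.parse(gz.read) }
    read_back == coords # => true
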
data/lib/anncrsnp/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anncrsnp
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Elena Rojano
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-09-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -98,6 +98,7 @@ executables:
 - grdbfinder.rb
 - grdbmanager.rb
 - masterfeatures.rb
+- retriever.rb
 - setup
 - statistics.rb
 extensions: []
@@ -115,6 +116,7 @@ files:
 - bin/grdbfinder.rb
 - bin/grdbmanager.rb
 - bin/masterfeatures.rb
+- bin/retriever.rb
 - bin/setup
 - bin/statistics.rb
 - database/.DS_Store
@@ -122,7 +124,13 @@ files:
 - database/deleteme
 - lib/anncrsnp.rb
 - lib/anncrsnp/dataset.rb
+- lib/anncrsnp/file_parser.rb
+- lib/anncrsnp/file_parsers/wigfix_parser.rb
 - lib/anncrsnp/parsers/ucscparser.rb
+- lib/anncrsnp/position_selection_manager.rb
+- lib/anncrsnp/position_selection_worker.rb
+- lib/anncrsnp/preprocessing_manager.rb
+- lib/anncrsnp/preprocessing_worker.rb
 - lib/anncrsnp/version.rb
 homepage: ''
 licenses: