mobilize-hdfs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
+ require "bundler/gem_tasks"
+
+ require 'mobilize-base/tasks'
+ require 'mobilize-ssh/tasks'
+ require 'mobilize-hdfs/tasks'
+
+ #
+ # Tests
+ #
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |test|
+   test.verbose = true
+   test.libs << "test"
+   test.libs << "lib"
+   test.test_files = FileList['test/**/*_test.rb']
+ end
+ task :default => :test
@@ -0,0 +1,9 @@
+ require "mobilize-hdfs/version"
+ require "mobilize-ssh"
+
+ module Mobilize
+   module Hdfs
+   end
+ end
+ require "mobilize-hdfs/handlers/hadoop"
+ require "mobilize-hdfs/handlers/hdfs"
@@ -0,0 +1,67 @@
+ module Mobilize
+   module Hadoop
+     def Hadoop.config
+       Base.config('hadoop')
+     end
+
+     def Hadoop.exec_path(cluster)
+       Hadoop.config['clusters'][cluster]['exec_path']
+     end
+
+     def Hadoop.gateway_node(cluster)
+       Hadoop.clusters[cluster]['gateway_node']
+     end
+
+     def Hadoop.clusters
+       Hadoop.config['clusters']
+     end
+
+     def Hadoop.output_cluster
+       Hadoop.config['output_cluster']
+     end
+
+     def Hadoop.output_dir
+       Hadoop.config['output_dir']
+     end
+
+     def Hadoop.read_limit
+       Hadoop.config['read_limit']
+     end
+
+     def Hadoop.job(command,cluster,user,file_hash={})
+       command = ["-",command].join unless command.starts_with?("-")
+       Hadoop.run("job -fs #{Hdfs.root(cluster)} #{command}",cluster,user,file_hash).ie do |r|
+         r.class==Array ? r.first : r
+       end
+     end
+
+     def Hadoop.job_list(cluster)
+       raw_list = Hadoop.job("list",{},cluster)
+       raw_list.split("\n")[1..-1].join("\n").tsv_to_hash_array
+     end
+
+     def Hadoop.job_status(hdfs_job_id,cluster)
+       raw_status = Hadoop.job("status #{hdfs_job_id}",{},cluster)
+       dhash_status = raw_status.strip.split("\n").map do |sline|
+         delim_index = [sline.index("="),sline.index(":")].compact.min
+         if delim_index
+           key,value = [sline[0..delim_index-1],sline[(delim_index+1)..-1]]
+           {key.strip => value.strip}
+         end
+       end.compact
+       hash_status = {}
+       dhash_status.each{|h| hash_status.merge!(h)}
+       hash_status
+     end
+
+     def Hadoop.run(command,cluster,user,file_hash={})
+       h_command = if command.starts_with?("hadoop")
+         command.sub("hadoop",Hadoop.exec_path(cluster))
+       else
+         "#{Hadoop.exec_path(cluster)} #{command}"
+       end
+       gateway_node = Hadoop.gateway_node(cluster)
+       Ssh.run(gateway_node,h_command,user,file_hash)
+     end
+   end
+ end
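
For orientation, here is a minimal usage sketch of the Hadoop handler above; it is not part of the package. It assumes mobilize-base has loaded a hadoop.yml like the sample further down and that the cluster's gateway node is reachable over SSH; the cluster and user names are placeholders taken from the sample config.

# Hypothetical usage -- 'test_cluster' and 'mobilize' are placeholders, not real values.
require 'mobilize-hdfs'

cluster = 'test_cluster'   # must be a key under clusters: in hadoop.yml
user    = 'mobilize'       # a user Ssh.run is allowed to act as on the gateway node

# Hadoop.run prefixes the command with the cluster's exec_path and executes it
# on the cluster's gateway_node over SSH, e.g. "/path/to/hadoop version".
puts Mobilize::Hadoop.run("version", cluster, user)

# Hadoop.job points the job subcommand at the cluster's namenode, e.g.
# "/path/to/hadoop job -fs hdfs://test_namenode.host.com:50070 -list".
puts Mobilize::Hadoop.job("list", cluster, user)
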
@@ -0,0 +1,187 @@
+ module Mobilize
+   module Hdfs
+     def Hdfs.root(cluster)
+       namenode = Hadoop.clusters[cluster]['namenode']
+       "hdfs://#{namenode['name']}:#{namenode['port']}"
+     end
+
+     def Hdfs.run(command,cluster,user)
+       command = ["-",command].join unless command.starts_with?("-")
+       command = "dfs -fs #{Hdfs.root(cluster)}/ #{command}"
+       Hadoop.run(command,cluster,user)
+     end
+
+     def Hdfs.rm(target_path,user)
+       #ignore errors due to missing file
+       cluster,cluster_path = Hdfs.resolve_path(target_path)
+       begin
+         Hdfs.run("rm '#{cluster_path}'",cluster,user)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def Hdfs.rmr(target_dir,user)
+       #ignore errors due to missing dir
+       cluster,cluster_dir = Hdfs.resolve_path(target_dir)
+       begin
+         Hdfs.run("rmr '#{cluster_dir}'",cluster,user)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def Hdfs.read(path,user)
+       cluster, cluster_path = Hdfs.resolve_path(path)
+       gateway_node = Hadoop.gateway_node(cluster)
+       #need to direct stderr to dev null since hdfs throws errors at being headed off
+       command = "((#{Hadoop.exec_path(cluster)} fs -fs '#{Hdfs.namenode_path(path)}' -cat #{cluster_path}"
+       command += " | head -c #{Hadoop.read_limit}) > out.txt 2> /dev/null) && cat out.txt"
+       response = Ssh.run(gateway_node,command,user)
+       if response.length==Hadoop.read_limit
+         raise "Hadoop read limit reached -- please reduce query size"
+       end
+       response
+     end
+
+     def Hdfs.resolve_path(path)
+       if path.starts_with?("/")
+         return [Hadoop.output_cluster,path]
+       #determine if first term in path is a cluster name
+       elsif Hadoop.clusters.keys.include?(path.split("/").first)
+         return path.split("/").ie{|p| [p.first,"/#{p[1..-1].join("/")}"]}
+       else
+         return [nil,nil]
+       end
+     end
+
+     def Hdfs.namenode_path(path)
+       cluster, cluster_path = Hdfs.resolve_path(path)
+       "#{Hdfs.root(cluster)}#{cluster_path}"
+     end
+
+     def Hdfs.write(path,string,user)
+       file_hash = {'file.txt'=>string}
+       cluster = Hdfs.resolve_path(path).first
+       Hdfs.rm(path,user) #remove old one if any
+       write_command = "dfs -copyFromLocal file.txt '#{Hdfs.namenode_path(path)}'"
+       Hadoop.run(write_command,cluster,user,file_hash)
+       return Hdfs.namenode_path(path)
+     end
+
+     def Hdfs.copy(source_path,target_path,user)
+       Hdfs.rm(target_path,user) #remove to_path
+       source_cluster = Hdfs.resolve_path(source_path).first
+       command = "dfs -cp '#{Hdfs.namenode_path(source_path)}' '#{Hdfs.namenode_path(target_path)}'"
+       #copy operation implies access to target_url from source_cluster
+       Hadoop.run(command,source_cluster,user)
+       return Hdfs.namenode_path(target_path)
+     end
+
+     def Hdfs.read_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       raise "unable to resolve source path" if source_cluster.nil?
+
+       node = Hadoop.gateway_node(source_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       source_path = "#{source_cluster}#{source_cluster_path}"
+       out_string = Hdfs.read(source_path,user).to_s
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.write_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       target_path = params['target']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       if source_cluster.nil?
+         #not hdfs
+         gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+         #return blank response if there are no slots available
+         return nil unless gdrive_slot
+         source_dst = s.source_dsts(gdrive_slot).first
+         Gdrive.unslot_worker_by_path(stage_path)
+       else
+         source_path = "#{source_cluster}#{source_cluster_path}"
+         source_dst = Dataset.find_or_create_by_handler_and_path("hdfs",source_path)
+       end
+
+       #determine cluster for target
+       target_cluster, target_cluster_path = Hdfs.resolve_path(target_path)
+       raise "unable to resolve target path" if target_cluster.nil?
+
+       node = Hadoop.gateway_node(target_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       target_path = "#{target_cluster}#{target_cluster_path}"
+       in_string = source_dst.read(user)
+       out_string = Hdfs.write(target_path,in_string,user)
+
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.copy_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       target_path = params['target']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       raise "unable to resolve source path" if source_cluster.nil?
+
+       #determine cluster for target
+       target_cluster, target_cluster_path = Hdfs.resolve_path(target_path)
+       raise "unable to resolve target path" if target_cluster.nil?
+
+       node = Hadoop.gateway_node(source_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       source_path = "#{source_cluster}#{source_cluster_path}"
+       target_path = "#{target_cluster}#{target_cluster_path}"
+       out_string = Hdfs.copy(source_path,target_path,user)
+
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.read_by_dataset_path(dst_path,user)
+       Hdfs.read(dst_path,user)
+     end
+
+     def Hdfs.write_by_dataset_path(dst_path,string,user)
+       Hdfs.write(dst_path,string,user)
+     end
+   end
+ end
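
A hedged round-trip sketch of the Hdfs handler above, again illustrative rather than part of the package. Paths starting with "/" resolve to the configured output_cluster, while paths starting with a cluster name resolve to that cluster; reads are truncated at read_limit and raise if the limit is hit. The path and user below are placeholders, and the return value shown assumes the test environment of the sample config.

# Hypothetical round trip -- path, user, and cluster values are placeholders.
require 'mobilize-hdfs'

user = 'mobilize'
path = '/user/mobilize/test/example.txt'   # leading "/" resolves to Hadoop.output_cluster;
                                           # "test_cluster_2/..." would target that cluster instead

Mobilize::Hdfs.write(path, "hello\tworld\n", user)
# => "hdfs://test_namenode.host.com:50070/user/mobilize/test/example.txt" (sample test config)

puts Mobilize::Hdfs.read(path, user)   # cats the file via the gateway, capped at read_limit
Mobilize::Hdfs.rm(path, user)          # returns false rather than raising if the file is absent
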
@@ -0,0 +1,38 @@
+ namespace :mobilize_hdfs do
+   desc "Set up config and log folders and files"
+   task :setup do
+     sample_dir = File.dirname(__FILE__) + '/../samples/'
+     sample_files = Dir.entries(sample_dir)
+     config_dir = (ENV['MOBILIZE_CONFIG_DIR'] ||= "config/mobilize/")
+     log_dir = (ENV['MOBILIZE_LOG_DIR'] ||= "log/")
+     full_config_dir = "#{ENV['PWD']}/#{config_dir}"
+     full_log_dir = "#{ENV['PWD']}/#{log_dir}"
+     unless File.exists?(full_config_dir)
+       puts "creating #{config_dir}"
+       `mkdir -p #{full_config_dir}`
+     end
+     unless File.exists?(full_log_dir)
+       puts "creating #{log_dir}"
+       `mkdir -p #{full_log_dir}`
+     end
+     sample_files.each do |fname|
+       unless File.exists?("#{full_config_dir}#{fname}")
+         puts "creating #{config_dir}#{fname}"
+         `cp #{sample_dir}#{fname} #{full_config_dir}#{fname}`
+       end
+     end
+     #make sure that jobtracker.yml is updated to include the
+     #mobilize-hdfs extension
+     jt_config_file = "#{config_dir}jobtracker.yml"
+     if File.exists?(jt_config_file)
+       yml_hash = YAML.load_file(jt_config_file)
+       yml_hash.keys.each do |k|
+         if yml_hash[k]['extensions'] and !yml_hash[k]['extensions'].include?('mobilize-hdfs')
+           puts "adding mobilize-hdfs to jobtracker.yml/#{k}/extensions"
+           yml_hash[k]['extensions'] = yml_hash[k]['extensions'].to_a + ['mobilize-hdfs']
+         end
+       end
+       File.open(jt_config_file,"w") {|f| f.print(yml_hash.to_yaml)}
+     end
+   end
+ end
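
The setup task above is meant to be run once from a host project whose Rakefile requires the bundled task files, as the Rakefile at the top of this diff does. A hedged sketch of that flow, with the shell invocation shown as a comment:

# In the host project's Rakefile (hypothetical host project):
require 'mobilize-hdfs/tasks'

# Then from the shell:
#   rake mobilize_hdfs:setup
# which copies the bundled samples (including hadoop.yml below) into
# config/mobilize/ (or $MOBILIZE_CONFIG_DIR) and appends 'mobilize-hdfs'
# to each environment's extensions list in jobtracker.yml.
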
@@ -0,0 +1,5 @@
+ module Mobilize
+   module Hdfs
+     VERSION = "1.0.0"
+   end
+ end
@@ -0,0 +1,52 @@
+ ---
+ development:
+   output_cluster: dev_cluster
+   output_dir: /user/mobilize/development/
+   read_limit: 1000000000
+   clusters:
+     dev_cluster:
+       namenode:
+         name: dev_namenode.host.com
+         port: 50070
+       gateway_node: dev_hadoop_host
+       exec_path: /path/to/hadoop
+     dev_cluster_2:
+       namenode:
+         name: dev_namenode_2.host.com
+         port: 50070
+       gateway_node: dev_hadoop_host
+       exec_path: /path/to/hadoop
+ test:
+   output_cluster: test_cluster
+   output_dir: /user/mobilize/test/
+   read_limit: 1000000000
+   clusters:
+     test_cluster:
+       namenode:
+         name: test_namenode.host.com
+         port: 50070
+       gateway_node: test_hadoop_host
+       exec_path: /path/to/hadoop
+     test_cluster_2:
+       namenode:
+         name: test_namenode_2.host.com
+         port: 50070
+       gateway_node: test_hadoop_host
+       exec_path: /path/to/hadoop
+ production:
+   output_cluster: prod_cluster
+   output_dir: /user/mobilize/production/
+   read_limit: 1000000000
+   clusters:
+     prod_cluster:
+       namenode:
+         name: prod_namenode.host.com
+         port: 50070
+       gateway_node: prod_hadoop_host
+       exec_path: /path/to/hadoop
+     prod_cluster_2:
+       namenode:
+         name: prod_namenode_2.host.com
+         port: 50070
+       gateway_node: prod_hadoop_host
+       exec_path: /path/to/hadoop
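
To show how the handlers consume this sample, here is a hedged sketch of the lookups, assuming Base.config('hadoop') resolves to the test environment above; the hosts and paths are the sample placeholders, not real endpoints.

# Hypothetical lookups against the sample test environment.
Mobilize::Hadoop.output_cluster                 # => "test_cluster"
Mobilize::Hadoop.read_limit                     # => 1000000000
Mobilize::Hadoop.gateway_node('test_cluster')   # => "test_hadoop_host"
Mobilize::Hadoop.exec_path('test_cluster')      # => "/path/to/hadoop"
Mobilize::Hdfs.root('test_cluster')             # => "hdfs://test_namenode.host.com:50070"
Mobilize::Hdfs.resolve_path('/tmp/data.tsv')    # => ["test_cluster", "/tmp/data.tsv"]
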
@@ -0,0 +1,20 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'mobilize-hdfs/version'
+
+ Gem::Specification.new do |gem|
+   gem.name = "mobilize-hdfs"
+   gem.version = Mobilize::Hdfs::VERSION
+   gem.authors = ["Cassio Paes-Leme"]
+   gem.email = ["cpaesleme@ngmoco.com"]
+   gem.description = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
+   gem.summary = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
+   gem.homepage = "http://github.com/ngmoco/mobilize-hdfs"
+
+   gem.files = `git ls-files`.split($/)
+   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+   gem.add_runtime_dependency "mobilize-ssh","1.1.0"
+ end
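
A minimal Gemfile sketch for depending on this release; the source line is an assumption, and mobilize-ssh 1.1.0 comes in transitively via the pin in the gemspec above.

# Hypothetical Gemfile entry for a host project.
source 'https://rubygems.org'
gem 'mobilize-hdfs', '1.0.0'   # pulls mobilize-ssh 1.1.0 per the gemspec pin
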
@@ -0,0 +1,11 @@
+ - name: test_hdfs_1
+   active: true
+   trigger: once
+   status: ""
+   stage1: hdfs.write target:"/user/mobilize/test/test_hdfs_1.out",
+     source:"Runner_mobilize(test)/test_hdfs_1.in"
+   stage2: hdfs.copy source:"/user/mobilize/test/test_hdfs_1.out",
+     target:"test_cluster_2/user/mobilize/test/test_hdfs_copy.out",
+   stage3: hdfs.read source:"/user/mobilize/test/test_hdfs_1_copy.out"
+   stage4: gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
+     target:"Runner_mobilize(test)/test_hdfs_1_copy.out"
@@ -0,0 +1,46 @@
+ require 'test_helper'
+
+ describe "Mobilize" do
+
+   def before
+     puts 'nothing before'
+   end
+
+   # enqueues 4 workers on Resque
+   it "runs integration test" do
+
+     puts "restart workers"
+     Mobilize::Jobtracker.restart_workers!
+
+     gdrive_slot = Mobilize::Gdrive.owner_email
+     puts "create user 'mobilize'"
+     user_name = gdrive_slot.split("@").first
+     u = Mobilize::User.where(:name=>user_name).first
+     r = u.runner
+     hdfs_1_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1.in",gdrive_slot)
+     [hdfs_1_sheet].each {|s| s.delete if s}
+
+     puts "add test_source data"
+     hdfs_1_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1.in",gdrive_slot)
+     hdfs_1_tsv = ([(["test"]*10).join("\t")]*10).join("\n")
+     hdfs_1_sheet.write(hdfs_1_tsv,u.name)
+
+     jobs_sheet = r.gsheet(gdrive_slot)
+
+     test_job_rows = ::YAML.load_file("#{Mobilize::Base.root}/test/hdfs_job_rows.yml")
+     jobs_sheet.add_or_update_rows(test_job_rows)
+
+     hdfs_1_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
+     [hdfs_1_target_sheet].each {|s| s.delete if s}
+
+     puts "job row added, force enqueued requestor, wait 120s"
+     r.enqueue!
+     sleep 120
+
+     puts "jobtracker posted data to test sheet"
+     test_destination_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
+
+     assert test_destination_sheet.to_tsv.length == 499
+   end
+
+ end