mobilize-hdfs 1.0.0

@@ -0,0 +1,18 @@
+ require "bundler/gem_tasks"
+
+ require 'mobilize-base/tasks'
+ require 'mobilize-ssh/tasks'
+ require 'mobilize-hdfs/tasks'
+
+ #
+ # Tests
+ #
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |test|
+   test.verbose = true
+   test.libs << "test"
+   test.libs << "lib"
+   test.test_files = FileList['test/**/*_test.rb']
+ end
+ task :default => :test
@@ -0,0 +1,9 @@
+ require "mobilize-hdfs/version"
+ require "mobilize-ssh"
+
+ module Mobilize
+   module Hdfs
+   end
+ end
+ require "mobilize-hdfs/handlers/hadoop"
+ require "mobilize-hdfs/handlers/hdfs"
@@ -0,0 +1,67 @@
+ module Mobilize
+   module Hadoop
+     def Hadoop.config
+       Base.config('hadoop')
+     end
+
+     def Hadoop.exec_path(cluster)
+       Hadoop.config['clusters'][cluster]['exec_path']
+     end
+
+     def Hadoop.gateway_node(cluster)
+       Hadoop.clusters[cluster]['gateway_node']
+     end
+
+     def Hadoop.clusters
+       Hadoop.config['clusters']
+     end
+
+     def Hadoop.output_cluster
+       Hadoop.config['output_cluster']
+     end
+
+     def Hadoop.output_dir
+       Hadoop.config['output_dir']
+     end
+
+     def Hadoop.read_limit
+       Hadoop.config['read_limit']
+     end
+
+     def Hadoop.job(command,cluster,user,file_hash={})
+       command = ["-",command].join unless command.starts_with?("-")
+       Hadoop.run("job -fs #{Hdfs.root(cluster)} #{command}",cluster,user,file_hash).ie do |r|
+         r.class==Array ? r.first : r
+       end
+     end
+
+     def Hadoop.job_list(cluster,user)
+       raw_list = Hadoop.job("list",cluster,user)
+       raw_list.split("\n")[1..-1].join("\n").tsv_to_hash_array
+     end
+
+     def Hadoop.job_status(hdfs_job_id,cluster,user)
+       raw_status = Hadoop.job("status #{hdfs_job_id}",cluster,user)
+       dhash_status = raw_status.strip.split("\n").map do |sline|
+         delim_index = [sline.index("="),sline.index(":")].compact.min
+         if delim_index
+           key,value = [sline[0..delim_index-1],sline[(delim_index+1)..-1]]
+           {key.strip => value.strip}
+         end
+       end.compact
+       hash_status = {}
+       dhash_status.each{|h| hash_status.merge!(h)}
+       hash_status
+     end
+
+     def Hadoop.run(command,cluster,user,file_hash={})
+       h_command = if command.starts_with?("hadoop")
+                     command.sub("hadoop",Hadoop.exec_path(cluster))
+                   else
+                     "#{Hadoop.exec_path(cluster)} #{command}"
+                   end
+       gateway_node = Hadoop.gateway_node(cluster)
+       Ssh.run(gateway_node,h_command,user,file_hash)
+     end
+   end
+ end
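
The Hadoop module above is a thin SSH wrapper: it looks up a cluster's gateway_node and exec_path from the hadoop config and hands the assembled command line to Ssh.run. A minimal usage sketch (not part of the diff), assuming the dev_cluster entry from the sample hadoop.yml further down and a hypothetical "etl_user" with access to that gateway node:

require 'mobilize-hdfs'

# runs "/path/to/hadoop fs -ls /user/mobilize" on dev_hadoop_host via Ssh.run
Mobilize::Hadoop.run("fs -ls /user/mobilize", "dev_cluster", "etl_user")

# builds "job -fs hdfs://dev_namenode.host.com:50070 -status <job_id>" and runs it the same way
Mobilize::Hadoop.job("status job_201301010000_0001", "dev_cluster", "etl_user")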
@@ -0,0 +1,187 @@
+ module Mobilize
+   module Hdfs
+     def Hdfs.root(cluster)
+       namenode = Hadoop.clusters[cluster]['namenode']
+       "hdfs://#{namenode['name']}:#{namenode['port']}"
+     end
+
+     def Hdfs.run(command,cluster,user)
+       command = ["-",command].join unless command.starts_with?("-")
+       command = "dfs -fs #{Hdfs.root(cluster)}/ #{command}"
+       Hadoop.run(command,cluster,user)
+     end
+
+     def Hdfs.rm(target_path,user)
+       #ignore errors due to missing file
+       cluster,cluster_path = Hdfs.resolve_path(target_path)
+       begin
+         Hdfs.run("rm '#{cluster_path}'",cluster,user)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def Hdfs.rmr(target_dir,user)
+       #ignore errors due to missing dir
+       cluster,cluster_dir = Hdfs.resolve_path(target_dir)
+       begin
+         Hdfs.run("rmr '#{cluster_dir}'",cluster,user)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def Hdfs.read(path,user)
+       cluster, cluster_path = Hdfs.resolve_path(path)
+       gateway_node = Hadoop.gateway_node(cluster)
+       #direct stderr to /dev/null since hdfs logs errors when head cuts the stream short
+       command = "((#{Hadoop.exec_path(cluster)} fs -fs '#{Hdfs.namenode_path(path)}' -cat #{cluster_path}"
+       command += " | head -c #{Hadoop.read_limit}) > out.txt 2> /dev/null) && cat out.txt"
+       response = Ssh.run(gateway_node,command,user)
+       if response.length==Hadoop.read_limit
+         raise "Hadoop read limit reached -- please reduce query size"
+       end
+       response
+     end
+
+     def Hdfs.resolve_path(path)
+       if path.starts_with?("/")
+         return [Hadoop.output_cluster,path]
+       #determine if first term in path is a cluster name
+       elsif Hadoop.clusters.keys.include?(path.split("/").first)
+         return path.split("/").ie{|p| [p.first,"/#{p[1..-1].join("/")}"]}
+       else
+         return [nil,nil]
+       end
+     end
+
+     def Hdfs.namenode_path(path)
+       cluster, cluster_path = Hdfs.resolve_path(path)
+       "#{Hdfs.root(cluster)}#{cluster_path}"
+     end
+
+     def Hdfs.write(path,string,user)
+       file_hash = {'file.txt'=>string}
+       cluster = Hdfs.resolve_path(path).first
+       Hdfs.rm(path,user) #remove old one if any
+       write_command = "dfs -copyFromLocal file.txt '#{Hdfs.namenode_path(path)}'"
+       Hadoop.run(write_command,cluster,user,file_hash)
+       return Hdfs.namenode_path(path)
+     end
+
+     def Hdfs.copy(source_path,target_path,user)
+       Hdfs.rm(target_path,user) #remove target_path first
+       source_cluster = Hdfs.resolve_path(source_path).first
+       command = "dfs -cp '#{Hdfs.namenode_path(source_path)}' '#{Hdfs.namenode_path(target_path)}'"
+       #copy operation implies access to target_path from source_cluster
+       Hadoop.run(command,source_cluster,user)
+       return Hdfs.namenode_path(target_path)
+     end
+
+     def Hdfs.read_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       raise "unable to resolve source path" if source_cluster.nil?
+
+       node = Hadoop.gateway_node(source_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       source_path = "#{source_cluster}#{source_cluster_path}"
+       out_string = Hdfs.read(source_path,user).to_s
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.write_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       target_path = params['target']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       if source_cluster.nil?
+         #not hdfs
+         gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+         #return blank response if there are no slots available
+         return nil unless gdrive_slot
+         source_dst = s.source_dsts(gdrive_slot).first
+         Gdrive.unslot_worker_by_path(stage_path)
+       else
+         source_path = "#{source_cluster}#{source_cluster_path}"
+         source_dst = Dataset.find_or_create_by_handler_and_path("hdfs",source_path)
+       end
+
+       #determine cluster for target
+       target_cluster, target_cluster_path = Hdfs.resolve_path(target_path)
+       raise "unable to resolve target path" if target_cluster.nil?
+
+       node = Hadoop.gateway_node(target_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       target_path = "#{target_cluster}#{target_cluster_path}"
+       in_string = source_dst.read(user)
+       out_string = Hdfs.write(target_path,in_string,user)
+
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.copy_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       target_path = params['target']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       raise "unable to resolve source path" if source_cluster.nil?
+
+       #determine cluster for target
+       target_cluster, target_cluster_path = Hdfs.resolve_path(target_path)
+       raise "unable to resolve target path" if target_cluster.nil?
+
+       node = Hadoop.gateway_node(source_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       source_path = "#{source_cluster}#{source_cluster_path}"
+       target_path = "#{target_cluster}#{target_cluster_path}"
+       out_string = Hdfs.copy(source_path,target_path,user)
+
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.read_by_dataset_path(dst_path,user)
+       Hdfs.read(dst_path,user)
+     end
+
+     def Hdfs.write_by_dataset_path(dst_path,string,user)
+       Hdfs.write(dst_path,string,user)
+     end
+   end
+ end
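
Hdfs builds on Hadoop by resolving paths first: an absolute path is routed to the configured output_cluster, while a path whose first segment names a cluster is routed to that cluster. A short sketch (not part of the diff) of how resolution plays out against the development entries of the sample hadoop.yml, again with a hypothetical "etl_user":

Mobilize::Hdfs.resolve_path("/user/mobilize/data.tsv")
# => ["dev_cluster", "/user/mobilize/data.tsv"]       (falls back to output_cluster)
Mobilize::Hdfs.resolve_path("dev_cluster_2/user/mobilize/data.tsv")
# => ["dev_cluster_2", "/user/mobilize/data.tsv"]
Mobilize::Hdfs.namenode_path("dev_cluster_2/user/mobilize/data.tsv")
# => "hdfs://dev_namenode_2.host.com:50070/user/mobilize/data.tsv"

# write a string to HDFS, then read it back; both shell out through the cluster's gateway node
Mobilize::Hdfs.write("/user/mobilize/data.tsv", "hello\tworld", "etl_user")
Mobilize::Hdfs.read("/user/mobilize/data.tsv", "etl_user")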
@@ -0,0 +1,38 @@
+ namespace :mobilize_hdfs do
+   desc "Set up config and log folders and files"
+   task :setup do
+     sample_dir = File.dirname(__FILE__) + '/../samples/'
+     sample_files = Dir.entries(sample_dir)
+     config_dir = (ENV['MOBILIZE_CONFIG_DIR'] ||= "config/mobilize/")
+     log_dir = (ENV['MOBILIZE_LOG_DIR'] ||= "log/")
+     full_config_dir = "#{ENV['PWD']}/#{config_dir}"
+     full_log_dir = "#{ENV['PWD']}/#{log_dir}"
+     unless File.exists?(full_config_dir)
+       puts "creating #{config_dir}"
+       `mkdir -p #{full_config_dir}`
+     end
+     unless File.exists?(full_log_dir)
+       puts "creating #{log_dir}"
+       `mkdir -p #{full_log_dir}`
+     end
+     sample_files.each do |fname|
+       unless File.exists?("#{full_config_dir}#{fname}")
+         puts "creating #{config_dir}#{fname}"
+         `cp #{sample_dir}#{fname} #{full_config_dir}#{fname}`
+       end
+     end
+     #make sure that jobtracker.yml is updated to include the
+     #mobilize-hdfs extension
+     jt_config_file = "#{config_dir}jobtracker.yml"
+     if File.exists?(jt_config_file)
+       yml_hash = YAML.load_file(jt_config_file)
+       yml_hash.keys.each do |k|
+         if yml_hash[k]['extensions'] and !yml_hash[k]['extensions'].include?('mobilize-hdfs')
+           puts "adding mobilize-hdfs to jobtracker.yml/#{k}/extensions"
+           yml_hash[k]['extensions'] = yml_hash[k]['extensions'].to_a + ['mobilize-hdfs']
+         end
+       end
+       File.open(jt_config_file,"w") {|f| f.print(yml_hash.to_yaml)}
+     end
+   end
+ end
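
With the gem installed and 'mobilize-hdfs/tasks' required from an application's Rakefile (as in the Rakefile at the top of this diff), the setup task copies the sample config files into config/mobilize/ and registers the extension in jobtracker.yml. It is invoked as:

rake mobilize_hdfs:setup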
@@ -0,0 +1,5 @@
+ module Mobilize
+   module Hdfs
+     VERSION = "1.0.0"
+   end
+ end
@@ -0,0 +1,52 @@
+ ---
+ development:
+   output_cluster: dev_cluster
+   output_dir: /user/mobilize/development/
+   read_limit: 1000000000
+   clusters:
+     dev_cluster:
+       namenode:
+         name: dev_namenode.host.com
+         port: 50070
+       gateway_node: dev_hadoop_host
+       exec_path: /path/to/hadoop
+     dev_cluster_2:
+       namenode:
+         name: dev_namenode_2.host.com
+         port: 50070
+       gateway_node: dev_hadoop_host
+       exec_path: /path/to/hadoop
+ test:
+   output_cluster: test_cluster
+   output_dir: /user/mobilize/test/
+   read_limit: 1000000000
+   clusters:
+     test_cluster:
+       namenode:
+         name: test_namenode.host.com
+         port: 50070
+       gateway_node: test_hadoop_host
+       exec_path: /path/to/hadoop
+     test_cluster_2:
+       namenode:
+         name: test_namenode_2.host.com
+         port: 50070
+       gateway_node: test_hadoop_host
+       exec_path: /path/to/hadoop
+ production:
+   output_cluster: prod_cluster
+   output_dir: /user/mobilize/production/
+   read_limit: 1000000000
+   clusters:
+     prod_cluster:
+       namenode:
+         name: prod_namenode.host.com
+         port: 50070
+       gateway_node: prod_hadoop_host
+       exec_path: /path/to/hadoop
+     prod_cluster_2:
+       namenode:
+         name: prod_namenode_2.host.com
+         port: 50070
+       gateway_node: prod_hadoop_host
+       exec_path: /path/to/hadoop
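
Each environment block above feeds the handlers: output_cluster and output_dir determine where stage output datasets land, read_limit caps Hdfs.read, and every cluster entry supplies the namenode host/port plus the gateway_node and exec_path used for SSH execution. As an illustration (not part of the diff), the test environment entries resolve to:

Mobilize::Hadoop.gateway_node("test_cluster")  # => "test_hadoop_host"
Mobilize::Hadoop.exec_path("test_cluster")     # => "/path/to/hadoop"
Mobilize::Hdfs.root("test_cluster")            # => "hdfs://test_namenode.host.com:50070"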
@@ -0,0 +1,20 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'mobilize-hdfs/version'
+
+ Gem::Specification.new do |gem|
+   gem.name          = "mobilize-hdfs"
+   gem.version       = Mobilize::Hdfs::VERSION
+   gem.authors       = ["Cassio Paes-Leme"]
+   gem.email         = ["cpaesleme@ngmoco.com"]
+   gem.description   = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
+   gem.summary       = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
+   gem.homepage      = "http://github.com/ngmoco/mobilize-hdfs"
+
+   gem.files         = `git ls-files`.split($/)
+   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+   gem.add_runtime_dependency "mobilize-ssh","1.1.0"
+ end
@@ -0,0 +1,11 @@
+ - name: test_hdfs_1
+   active: true
+   trigger: once
+   status: ""
+   stage1: hdfs.write target:"/user/mobilize/test/test_hdfs_1.out",
+     source:"Runner_mobilize(test)/test_hdfs_1.in"
+   stage2: hdfs.copy source:"/user/mobilize/test/test_hdfs_1.out",
+     target:"test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out"
+   stage3: hdfs.read source:"/user/mobilize/test/test_hdfs_1_copy.out"
+   stage4: gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
+     target:"Runner_mobilize(test)/test_hdfs_1_copy.out"
@@ -0,0 +1,46 @@
+ require 'test_helper'
+
+ describe "Mobilize" do
+
+   before do
+     puts 'nothing before'
+   end
+
+   # enqueues 4 workers on Resque
+   it "runs integration test" do
+
+     puts "restart workers"
+     Mobilize::Jobtracker.restart_workers!
+
+     gdrive_slot = Mobilize::Gdrive.owner_email
+     puts "create user 'mobilize'"
+     user_name = gdrive_slot.split("@").first
+     u = Mobilize::User.where(:name=>user_name).first
+     r = u.runner
+     hdfs_1_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1.in",gdrive_slot)
+     [hdfs_1_sheet].each {|s| s.delete if s}
+
+     puts "add test_source data"
+     hdfs_1_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1.in",gdrive_slot)
+     hdfs_1_tsv = ([(["test"]*10).join("\t")]*10).join("\n")
+     hdfs_1_sheet.write(hdfs_1_tsv,u.name)
+
+     jobs_sheet = r.gsheet(gdrive_slot)
+
+     test_job_rows = ::YAML.load_file("#{Mobilize::Base.root}/test/hdfs_job_rows.yml")
+     jobs_sheet.add_or_update_rows(test_job_rows)
+
+     hdfs_1_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
+     [hdfs_1_target_sheet].each {|s| s.delete if s}
+
+     puts "job row added, force enqueued requestor, wait 120s"
+     r.enqueue!
+     sleep 120
+
+     puts "check that jobtracker posted data to test sheet"
+     test_destination_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
+
+     assert test_destination_sheet.to_tsv.length == 499
+   end
+
+ end