mobilize-hdfs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
+ require "bundler/gem_tasks"
+
+ require 'mobilize-base/tasks'
+ require 'mobilize-ssh/tasks'
+ require 'mobilize-hdfs/tasks'
+
+ #
+ # Tests
+ #
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |test|
+   test.verbose = true
+   test.libs << "test"
+   test.libs << "lib"
+   test.test_files = FileList['test/**/*_test.rb']
+ end
+ task :default => :test
@@ -0,0 +1,9 @@
+ require "mobilize-hdfs/version"
+ require "mobilize-ssh"
+
+ module Mobilize
+   module Hdfs
+   end
+ end
+ require "mobilize-hdfs/handlers/hadoop"
+ require "mobilize-hdfs/handlers/hdfs"
@@ -0,0 +1,67 @@
+ module Mobilize
+   module Hadoop
+     def Hadoop.config
+       Base.config('hadoop')
+     end
+
+     def Hadoop.exec_path(cluster)
+       Hadoop.config['clusters'][cluster]['exec_path']
+     end
+
+     def Hadoop.gateway_node(cluster)
+       Hadoop.clusters[cluster]['gateway_node']
+     end
+
+     def Hadoop.clusters
+       Hadoop.config['clusters']
+     end
+
+     def Hadoop.output_cluster
+       Hadoop.config['output_cluster']
+     end
+
+     def Hadoop.output_dir
+       Hadoop.config['output_dir']
+     end
+
+     def Hadoop.read_limit
+       Hadoop.config['read_limit']
+     end
+
+     def Hadoop.job(command,cluster,user,file_hash={})
+       command = ["-",command].join unless command.starts_with?("-")
+       Hadoop.run("job -fs #{Hdfs.root(cluster)} #{command}",cluster,user,file_hash).ie do |r|
+         r.class==Array ? r.first : r
+       end
+     end
+
+     def Hadoop.job_list(cluster)
+       raw_list = Hadoop.job("list",{},cluster)
+       raw_list.split("\n")[1..-1].join("\n").tsv_to_hash_array
+     end
+
+     def Hadoop.job_status(hdfs_job_id,cluster)
+       raw_status = Hadoop.job("status #{hdfs_job_id}",{},cluster)
+       dhash_status = raw_status.strip.split("\n").map do |sline|
+         delim_index = [sline.index("="),sline.index(":")].compact.min
+         if delim_index
+           key,value = [sline[0..delim_index-1],sline[(delim_index+1)..-1]]
+           {key.strip => value.strip}
+         end
+       end.compact
+       hash_status = {}
+       dhash_status.each{|h| hash_status.merge!(h)}
+       hash_status
+     end
+
+     def Hadoop.run(command,cluster,user,file_hash={})
+       h_command = if command.starts_with?("hadoop")
+         command.sub("hadoop",Hadoop.exec_path(cluster))
+       else
+         "#{Hadoop.exec_path(cluster)} #{command}"
+       end
+       gateway_node = Hadoop.gateway_node(cluster)
+       Ssh.run(gateway_node,h_command,user,file_hash)
+     end
+   end
+ end
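
For orientation, here is a minimal usage sketch of the Hadoop handler above; it is not part of the package. It assumes mobilize-base has loaded a hadoop.yml like the sample further down and that the cluster's gateway node is reachable over SSH; the cluster and user names are placeholders taken from the sample config.

# Hypothetical usage -- 'test_cluster' and 'mobilize' are placeholders, not real values.
require 'mobilize-hdfs'

cluster = 'test_cluster'   # must be a key under clusters: in hadoop.yml
user    = 'mobilize'       # a user Ssh.run is allowed to act as on the gateway node

# Hadoop.run prefixes the command with the cluster's exec_path and executes it
# on the cluster's gateway_node over SSH, e.g. "/path/to/hadoop version".
puts Mobilize::Hadoop.run("version", cluster, user)

# Hadoop.job points the job subcommand at the cluster's namenode, e.g.
# "/path/to/hadoop job -fs hdfs://test_namenode.host.com:50070 -list".
puts Mobilize::Hadoop.job("list", cluster, user)
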
@@ -0,0 +1,187 @@
+ module Mobilize
+   module Hdfs
+     def Hdfs.root(cluster)
+       namenode = Hadoop.clusters[cluster]['namenode']
+       "hdfs://#{namenode['name']}:#{namenode['port']}"
+     end
+
+     def Hdfs.run(command,cluster,user)
+       command = ["-",command].join unless command.starts_with?("-")
+       command = "dfs -fs #{Hdfs.root(cluster)}/ #{command}"
+       Hadoop.run(command,cluster,user)
+     end
+
+     def Hdfs.rm(target_path,user)
+       #ignore errors due to missing file
+       cluster,cluster_path = Hdfs.resolve_path(target_path)
+       begin
+         Hdfs.run("rm '#{cluster_path}'",cluster,user)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def Hdfs.rmr(target_dir,user)
+       #ignore errors due to missing dir
+       cluster,cluster_dir = Hdfs.resolve_path(target_dir)
+       begin
+         Hdfs.run("rmr '#{cluster_dir}'",cluster,user)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def Hdfs.read(path,user)
+       cluster, cluster_path = Hdfs.resolve_path(path)
+       gateway_node = Hadoop.gateway_node(cluster)
+       #need to direct stderr to dev null since hdfs throws errors at being headed off
+       command = "((#{Hadoop.exec_path(cluster)} fs -fs '#{Hdfs.namenode_path(path)}' -cat #{cluster_path}"
+       command += " | head -c #{Hadoop.read_limit}) > out.txt 2> /dev/null) && cat out.txt"
+       response = Ssh.run(gateway_node,command,user)
+       if response.length==Hadoop.read_limit
+         raise "Hadoop read limit reached -- please reduce query size"
+       end
+       response
+     end
+
+     def Hdfs.resolve_path(path)
+       if path.starts_with?("/")
+         return [Hadoop.output_cluster,path]
+       #determine if first term in path is a cluster name
+       elsif Hadoop.clusters.keys.include?(path.split("/").first)
+         return path.split("/").ie{|p| [p.first,"/#{p[1..-1].join("/")}"]}
+       else
+         return [nil,nil]
+       end
+     end
+
+     def Hdfs.namenode_path(path)
+       cluster, cluster_path = Hdfs.resolve_path(path)
+       "#{Hdfs.root(cluster)}#{cluster_path}"
+     end
+
+     def Hdfs.write(path,string,user)
+       file_hash = {'file.txt'=>string}
+       cluster = Hdfs.resolve_path(path).first
+       Hdfs.rm(path,user) #remove old one if any
+       write_command = "dfs -copyFromLocal file.txt '#{Hdfs.namenode_path(path)}'"
+       Hadoop.run(write_command,cluster,user,file_hash)
+       return Hdfs.namenode_path(path)
+     end
+
+     def Hdfs.copy(source_path,target_path,user)
+       Hdfs.rm(target_path,user) #remove to_path
+       source_cluster = Hdfs.resolve_path(source_path).first
+       command = "dfs -cp '#{Hdfs.namenode_path(source_path)}' '#{Hdfs.namenode_path(target_path)}'"
+       #copy operation implies access to target_url from source_cluster
+       Hadoop.run(command,source_cluster,user)
+       return Hdfs.namenode_path(target_path)
+     end
+
+     def Hdfs.read_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       raise "unable to resolve source path" if source_cluster.nil?
+
+       node = Hadoop.gateway_node(source_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       source_path = "#{source_cluster}#{source_cluster_path}"
+       out_string = Hdfs.read(source_path,user).to_s
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.write_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       target_path = params['target']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       if source_cluster.nil?
+         #not hdfs
+         gdrive_slot = Gdrive.slot_worker_by_path(stage_path)
+         #return blank response if there are no slots available
+         return nil unless gdrive_slot
+         source_dst = s.source_dsts(gdrive_slot).first
+         Gdrive.unslot_worker_by_path(stage_path)
+       else
+         source_path = "#{source_cluster}#{source_cluster_path}"
+         source_dst = Dataset.find_or_create_by_handler_and_path("hdfs",source_path)
+       end
+
+       #determine cluster for target
+       target_cluster, target_cluster_path = Hdfs.resolve_path(target_path)
+       raise "unable to resolve target path" if target_cluster.nil?
+
+       node = Hadoop.gateway_node(target_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       target_path = "#{target_cluster}#{target_cluster_path}"
+       in_string = source_dst.read(user)
+       out_string = Hdfs.write(target_path,in_string,user)
+
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.copy_by_stage_path(stage_path)
+       s = Stage.where(:path=>stage_path).first
+       u = s.job.runner.user
+       params = s.params
+       source_path = params['source']
+       target_path = params['target']
+       user = params['user']
+       #check for source in hdfs format
+       source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
+       raise "unable to resolve source path" if source_cluster.nil?
+
+       #determine cluster for target
+       target_cluster, target_cluster_path = Hdfs.resolve_path(target_path)
+       raise "unable to resolve target path" if target_cluster.nil?
+
+       node = Hadoop.gateway_node(source_cluster)
+       if user and !Ssh.sudoers(node).include?(u.name)
+         raise "#{u.name} does not have su permissions for #{node}"
+       elsif user.nil? and Ssh.su_all_users(node)
+         user = u.name
+       end
+
+       source_path = "#{source_cluster}#{source_cluster_path}"
+       target_path = "#{target_cluster}#{target_cluster_path}"
+       out_string = Hdfs.copy(source_path,target_path,user)
+
+       out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
+       Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
+       out_url
+     end
+
+     def Hdfs.read_by_dataset_path(dst_path,user)
+       Hdfs.read(dst_path,user)
+     end
+
+     def Hdfs.write_by_dataset_path(dst_path,string,user)
+       Hdfs.write(dst_path,string,user)
+     end
+   end
+ end
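
A hedged round-trip sketch of the Hdfs handler above, again illustrative rather than part of the package. Paths starting with "/" resolve to the configured output_cluster, while paths starting with a cluster name resolve to that cluster; reads are truncated at read_limit and raise if the limit is hit. The path and user below are placeholders, and the return value shown assumes the test environment of the sample config.

# Hypothetical round trip -- path, user, and cluster values are placeholders.
require 'mobilize-hdfs'

user = 'mobilize'
path = '/user/mobilize/test/example.txt'   # leading "/" resolves to Hadoop.output_cluster;
                                           # "test_cluster_2/..." would target that cluster instead

Mobilize::Hdfs.write(path, "hello\tworld\n", user)
# => "hdfs://test_namenode.host.com:50070/user/mobilize/test/example.txt" (sample test config)

puts Mobilize::Hdfs.read(path, user)   # cats the file via the gateway, capped at read_limit
Mobilize::Hdfs.rm(path, user)          # returns false rather than raising if the file is absent
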
@@ -0,0 +1,38 @@
+ namespace :mobilize_hdfs do
+   desc "Set up config and log folders and files"
+   task :setup do
+     sample_dir = File.dirname(__FILE__) + '/../samples/'
+     sample_files = Dir.entries(sample_dir)
+     config_dir = (ENV['MOBILIZE_CONFIG_DIR'] ||= "config/mobilize/")
+     log_dir = (ENV['MOBILIZE_LOG_DIR'] ||= "log/")
+     full_config_dir = "#{ENV['PWD']}/#{config_dir}"
+     full_log_dir = "#{ENV['PWD']}/#{log_dir}"
+     unless File.exists?(full_config_dir)
+       puts "creating #{config_dir}"
+       `mkdir -p #{full_config_dir}`
+     end
+     unless File.exists?(full_log_dir)
+       puts "creating #{log_dir}"
+       `mkdir -p #{full_log_dir}`
+     end
+     sample_files.each do |fname|
+       unless File.exists?("#{full_config_dir}#{fname}")
+         puts "creating #{config_dir}#{fname}"
+         `cp #{sample_dir}#{fname} #{full_config_dir}#{fname}`
+       end
+     end
+     #make sure that jobtracker.yml is updated to include the
+     #mobilize-hdfs extension
+     jt_config_file = "#{config_dir}jobtracker.yml"
+     if File.exists?(jt_config_file)
+       yml_hash = YAML.load_file(jt_config_file)
+       yml_hash.keys.each do |k|
+         if yml_hash[k]['extensions'] and !yml_hash[k]['extensions'].include?('mobilize-hdfs')
+           puts "adding mobilize-hdfs to jobtracker.yml/#{k}/extensions"
+           yml_hash[k]['extensions'] = yml_hash[k]['extensions'].to_a + ['mobilize-hdfs']
+         end
+       end
+       File.open(jt_config_file,"w") {|f| f.print(yml_hash.to_yaml)}
+     end
+   end
+ end
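
The setup task above is meant to be run once from a host project whose Rakefile requires the bundled task files, as the Rakefile at the top of this diff does. A hedged sketch of that flow, with the shell invocation shown as a comment:

# In the host project's Rakefile (hypothetical host project):
require 'mobilize-hdfs/tasks'

# Then from the shell:
#   rake mobilize_hdfs:setup
# which copies the bundled samples (including hadoop.yml below) into
# config/mobilize/ (or $MOBILIZE_CONFIG_DIR) and appends 'mobilize-hdfs'
# to each environment's extensions list in jobtracker.yml.
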
@@ -0,0 +1,5 @@
+ module Mobilize
+   module Hdfs
+     VERSION = "1.0.0"
+   end
+ end
@@ -0,0 +1,52 @@
+ ---
+ development:
+   output_cluster: dev_cluster
+   output_dir: /user/mobilize/development/
+   read_limit: 1000000000
+   clusters:
+     dev_cluster:
+       namenode:
+         name: dev_namenode.host.com
+         port: 50070
+       gateway_node: dev_hadoop_host
+       exec_path: /path/to/hadoop
+     dev_cluster_2:
+       namenode:
+         name: dev_namenode_2.host.com
+         port: 50070
+       gateway_node: dev_hadoop_host
+       exec_path: /path/to/hadoop
+ test:
+   output_cluster: test_cluster
+   output_dir: /user/mobilize/test/
+   read_limit: 1000000000
+   clusters:
+     test_cluster:
+       namenode:
+         name: test_namenode.host.com
+         port: 50070
+       gateway_node: test_hadoop_host
+       exec_path: /path/to/hadoop
+     test_cluster_2:
+       namenode:
+         name: test_namenode_2.host.com
+         port: 50070
+       gateway_node: test_hadoop_host
+       exec_path: /path/to/hadoop
+ production:
+   output_cluster: prod_cluster
+   output_dir: /user/mobilize/production/
+   read_limit: 1000000000
+   clusters:
+     prod_cluster:
+       namenode:
+         name: prod_namenode.host.com
+         port: 50070
+       gateway_node: prod_hadoop_host
+       exec_path: /path/to/hadoop
+     prod_cluster_2:
+       namenode:
+         name: prod_namenode_2.host.com
+         port: 50070
+       gateway_node: prod_hadoop_host
+       exec_path: /path/to/hadoop
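
To show how the handlers consume this sample, here is a hedged sketch of the lookups, assuming Base.config('hadoop') resolves to the test environment above; the hosts and paths are the sample placeholders, not real endpoints.

# Hypothetical lookups against the sample test environment.
Mobilize::Hadoop.output_cluster                 # => "test_cluster"
Mobilize::Hadoop.read_limit                     # => 1000000000
Mobilize::Hadoop.gateway_node('test_cluster')   # => "test_hadoop_host"
Mobilize::Hadoop.exec_path('test_cluster')      # => "/path/to/hadoop"
Mobilize::Hdfs.root('test_cluster')             # => "hdfs://test_namenode.host.com:50070"
Mobilize::Hdfs.resolve_path('/tmp/data.tsv')    # => ["test_cluster", "/tmp/data.tsv"]
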
@@ -0,0 +1,20 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'mobilize-hdfs/version'
+
+ Gem::Specification.new do |gem|
+   gem.name = "mobilize-hdfs"
+   gem.version = Mobilize::Hdfs::VERSION
+   gem.authors = ["Cassio Paes-Leme"]
+   gem.email = ["cpaesleme@ngmoco.com"]
+   gem.description = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
+   gem.summary = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
+   gem.homepage = "http://github.com/ngmoco/mobilize-hdfs"
+
+   gem.files = `git ls-files`.split($/)
+   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+   gem.add_runtime_dependency "mobilize-ssh","1.1.0"
+ end
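
A minimal Gemfile sketch for depending on this release; the source line is an assumption, and mobilize-ssh 1.1.0 comes in transitively via the pin in the gemspec above.

# Hypothetical Gemfile entry for a host project.
source 'https://rubygems.org'
gem 'mobilize-hdfs', '1.0.0'   # pulls mobilize-ssh 1.1.0 per the gemspec pin
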
@@ -0,0 +1,11 @@
+ - name: test_hdfs_1
+   active: true
+   trigger: once
+   status: ""
+   stage1: hdfs.write target:"/user/mobilize/test/test_hdfs_1.out",
+     source:"Runner_mobilize(test)/test_hdfs_1.in"
+   stage2: hdfs.copy source:"/user/mobilize/test/test_hdfs_1.out",
+     target:"test_cluster_2/user/mobilize/test/test_hdfs_copy.out",
+   stage3: hdfs.read source:"/user/mobilize/test/test_hdfs_1_copy.out"
+   stage4: gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
+     target:"Runner_mobilize(test)/test_hdfs_1_copy.out"
@@ -0,0 +1,46 @@
+ require 'test_helper'
+
+ describe "Mobilize" do
+
+   def before
+     puts 'nothing before'
+   end
+
+   # enqueues 4 workers on Resque
+   it "runs integration test" do
+
+     puts "restart workers"
+     Mobilize::Jobtracker.restart_workers!
+
+     gdrive_slot = Mobilize::Gdrive.owner_email
+     puts "create user 'mobilize'"
+     user_name = gdrive_slot.split("@").first
+     u = Mobilize::User.where(:name=>user_name).first
+     r = u.runner
+     hdfs_1_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1.in",gdrive_slot)
+     [hdfs_1_sheet].each {|s| s.delete if s}
+
+     puts "add test_source data"
+     hdfs_1_sheet = Mobilize::Gsheet.find_or_create_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1.in",gdrive_slot)
+     hdfs_1_tsv = ([(["test"]*10).join("\t")]*10).join("\n")
+     hdfs_1_sheet.write(hdfs_1_tsv,u.name)
+
+     jobs_sheet = r.gsheet(gdrive_slot)
+
+     test_job_rows = ::YAML.load_file("#{Mobilize::Base.root}/test/hdfs_job_rows.yml")
+     jobs_sheet.add_or_update_rows(test_job_rows)
+
+     hdfs_1_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
+     [hdfs_1_target_sheet].each {|s| s.delete if s}
+
+     puts "job row added, force enqueued requestor, wait 120s"
+     r.enqueue!
+     sleep 120
+
+     puts "jobtracker posted data to test sheet"
+     test_destination_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
+
+     assert test_destination_sheet.to_tsv.length == 499
+   end
+
+ end