mobilize-hdfs 1.0.10 → 1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +13 -22
- data/lib/mobilize-hdfs/handlers/hadoop.rb +11 -11
- data/lib/mobilize-hdfs/handlers/hdfs.rb +129 -146
- data/lib/mobilize-hdfs/version.rb +1 -1
- data/lib/samples/hadoop.yml +0 -3
- data/mobilize-hdfs.gemspec +2 -2
- data/test/hdfs_job_rows.yml +4 -5
- data/test/mobilize-hdfs_test.rb +25 -2
- metadata +5 -5
data/README.md
CHANGED
@@ -94,14 +94,11 @@ be read. If the data is bigger than the read limit, an exception will be
 raised.
 
 The Hadoop configuration consists of:
-* output_cluster, which is the cluster where stage outputs will be
-stored. Clusters are defined in the clusters parameter as described
-below.
 * output_dir, which is the absolute path to the directory in HDFS that will store stage
-outputs. Directory names should end with a slash (/).
+outputs. Directory names should end with a slash (/). It will choose the
+first cluster as the default cluster to write to.
 * read_limit, which is the maximum size data that can be read from the
-cluster.
--c <size limit>. Default is 1GB.
+cluster. Default is 1GB.
 * clusters - this defines aliases for clusters, which are used as
 parameters for Hdfs stages. Cluster aliases contain 5 parameters:
 * namenode - defines the name and port for accessing the namenode
@@ -118,7 +115,6 @@ Sample hadoop.yml:
 ``` yml
 ---
 development:
-output_cluster: dev_cluster
 output_dir: /user/mobilize/development/
 read_limit: 1000000000
 clusters:
@@ -135,7 +131,6 @@ development:
 gateway_node: dev_hadoop_host
 exec_path: /path/to/hadoop
 test:
-output_cluster: test_cluster
 output_dir: /user/mobilize/test/
 read_limit: 1000000000
 clusters:
@@ -152,7 +147,6 @@ test:
 gateway_node: test_hadoop_host
 exec_path: /path/to/hadoop
 production:
-output_cluster: prod_cluster
 output_dir: /user/mobilize/production/
 read_limit: 1000000000
 clusters:
@@ -181,17 +175,15 @@ Start
 * cluster and user are optional for all of the below.
 * cluster defaults to output_cluster;
 * user is treated the same way as in [mobilize-ssh][mobilize-ssh].
-* hdfs.
-*
-*
-* The gsheet_full_path should be of the form `<gbook_name>/<gsheet_name>`. The test uses "Requestor_mobilize(test)/test_hdfs_1.in".
-* The hdfs_full_path is the cluster alias followed by full path on the cluster.
+* hdfs.write `source:<full_path>, target:<hdfs_full_path>, user:<user>`
+* The full_path can use `<gsheet_path>` or `<hdfs_path>`. The test uses "test_hdfs_1.in".
+* `<hdfs_path>` is the cluster alias followed by absolute path on the cluster.
 * if a full path is supplied without a preceding cluster alias (e.g. "/user/mobilize/test/test_hdfs_1.in"),
-the
+the first listed cluster will be used as the default.
 * The test uses "/user/mobilize/test/test_hdfs_1.in" for the initial
 write, then "test_cluster_2/user/mobilize/test/test_hdfs_copy.out" for
-the
-* both cluster arguments and user are optional. If
+the cross-cluster write.
+* both cluster arguments and user are optional. If writing from
 one cluster to another, your source_cluster gateway_node must be able to
 access both clusters.
 
@@ -216,12 +208,11 @@ same cluster as your first.
 
 3) $ rake test
 
-* The test runs a
+* The test runs a 3 stage job:
 * test_hdfs_1:
-* `hdfs.write target:"/user/mobilize/test/test_hdfs_1.out", source:"
-* `hdfs.
-* `
-* `gsheet.write source:"stage3", target:"Runner_mobilize(test)/test_hdfs_1_copy.out"`
+* `hdfs.write target:"/user/mobilize/test/test_hdfs_1.out", source:"test_hdfs_1.in"`
+* `hdfs.write source:"/user/mobilize/test/test_hdfs_1.out",target:"test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out"`
+* `gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out", target:"test_hdfs_1_copy.out"`
 * at the end of the test, there should be a sheet named "test_hdfs_1_copy.out" with the same data as test_hdfs_1.in
 
 <a name='section_Meta'></a>
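The defaulting rule described in the README changes above (a path without a leading cluster alias falls back to the first cluster listed in hadoop.yml) can be sketched in a few lines of plain Ruby. This is an illustration only: the cluster names are placeholders, and the real logic lives in `Hdfs.url_by_path` in the hdfs.rb diff further down, which also verifies the path against the cluster before returning it.

```ruby
# Sketch of the 1.2 path-defaulting rule; cluster names are placeholders.
CLUSTERS = ["test_cluster", "test_cluster_2"]

def resolve(path)
  cluster = path.split("/").first.to_s
  if CLUSTERS.include?(cluster)
    # path begins with a cluster alias: strip the alias off
    path = "/" + path.split("/")[1..-1].join("/")
  else
    # no alias: fall back to the first listed cluster
    cluster = CLUSTERS.first
    path = path.start_with?("/") ? path : "/#{path}"
  end
  "hdfs://#{cluster}#{path}"
end

puts resolve("/user/mobilize/test/test_hdfs_1.in")
# => hdfs://test_cluster/user/mobilize/test/test_hdfs_1.in
puts resolve("test_cluster_2/user/mobilize/test/test_hdfs_copy.out")
# => hdfs://test_cluster_2/user/mobilize/test/test_hdfs_copy.out
```

(The gem itself uses its own `starts_with?` core extension; plain Ruby's `start_with?` is used here so the sketch runs standalone.)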
data/lib/mobilize-hdfs/handlers/hadoop.rb
CHANGED
@@ -9,15 +9,15 @@ module Mobilize
 end
 
 def Hadoop.gateway_node(cluster)
-Hadoop.clusters[cluster]['gateway_node']
+Hadoop.config['clusters'][cluster]['gateway_node']
 end
 
 def Hadoop.clusters
-Hadoop.config['clusters']
+Hadoop.config['clusters'].keys
 end
 
-def Hadoop.
-Hadoop.
+def Hadoop.default_cluster
+Hadoop.clusters.first
 end
 
 def Hadoop.output_dir
@@ -28,20 +28,20 @@ module Mobilize
 Hadoop.config['read_limit']
 end
 
-def Hadoop.job(command,
+def Hadoop.job(cluster,command,user,file_hash={})
 command = ["-",command].join unless command.starts_with?("-")
-Hadoop.run("job -fs #{Hdfs.root(cluster)} #{command}",
+Hadoop.run(cluster,"job -fs #{Hdfs.root(cluster)} #{command}",user,file_hash).ie do |r|
 r.class==Array ? r.first : r
 end
 end
 
 def Hadoop.job_list(cluster)
-raw_list = Hadoop.job("list"
+raw_list = Hadoop.job(cluster,"list")
 raw_list.split("\n")[1..-1].join("\n").tsv_to_hash_array
 end
 
-def Hadoop.job_status(
-raw_status = Hadoop.job("status #{
+def Hadoop.job_status(cluster,hadoop_job_id)
+raw_status = Hadoop.job(cluster,"status #{hadoop_job_id}",{})
 dhash_status = raw_status.strip.split("\n").map do |sline|
 delim_index = [sline.index("="),sline.index(":")].compact.min
 if delim_index
@@ -54,14 +54,14 @@ module Mobilize
 hash_status
 end
 
-def Hadoop.run(command,
+def Hadoop.run(cluster,command,user_name,file_hash={})
 h_command = if command.starts_with?("hadoop")
 command.sub("hadoop",Hadoop.exec_path(cluster))
 else
 "#{Hadoop.exec_path(cluster)} #{command}"
 end
 gateway_node = Hadoop.gateway_node(cluster)
-Ssh.run(gateway_node,h_command,
+Ssh.run(gateway_node,h_command,user_name,file_hash)
 end
 end
 end
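The hadoop.rb changes above re-key every call on an explicit cluster argument: `Hadoop.clusters` now returns the alias list from the config, and the new `Hadoop.default_cluster` is simply the first alias. A minimal, self-contained sketch of those helpers, with a hypothetical in-memory hash standing in for the parsed hadoop.yml:

```ruby
# Sketch of the 1.2 cluster helpers; the config hash is a stand-in for the
# parsed hadoop.yml, and the host names are placeholders.
module Mobilize
  module Hadoop
    def Hadoop.config
      { 'clusters' => {
          'test_cluster'   => { 'namenode' => { 'name' => 'nn1.example.com', 'port' => 50070 },
                                'gateway_node' => 'test_hadoop_host' },
          'test_cluster_2' => { 'namenode' => { 'name' => 'nn2.example.com', 'port' => 50070 },
                                'gateway_node' => 'test_hadoop_host_2' } } }
    end

    # alias list comes straight from the config keys
    def Hadoop.clusters
      Hadoop.config['clusters'].keys
    end

    # the first listed cluster is the default
    def Hadoop.default_cluster
      Hadoop.clusters.first
    end

    def Hadoop.gateway_node(cluster)
      Hadoop.config['clusters'][cluster]['gateway_node']
    end
  end
end

puts Mobilize::Hadoop.default_cluster                 # => test_cluster
puts Mobilize::Hadoop.gateway_node("test_cluster_2")  # => test_hadoop_host_2
```

Since `Hadoop.run`, `Hadoop.job`, and `Hadoop.job_status` now all take the cluster as their first argument, the gateway node and exec_path are resolved per call rather than from a single output_cluster setting.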
data/lib/mobilize-hdfs/handlers/hdfs.rb
CHANGED
@@ -1,190 +1,173 @@
 module Mobilize
 module Hdfs
+#returns the hdfs path to the root of the cluster
 def Hdfs.root(cluster)
-namenode = Hadoop.clusters[cluster]['namenode']
+namenode = Hadoop.config['clusters'][cluster]['namenode']
 "hdfs://#{namenode['name']}:#{namenode['port']}"
 end
 
-
+#replaces the cluster alias with a proper namenode path
+def Hdfs.hdfs_url(url)
+cluster = url.split("hdfs://").last.split("/").first
+#replace first instance
+url.sub("hdfs://#{cluster}",Hdfs.root(cluster))
+end
+
+def Hdfs.run(cluster,command,user)
 command = ["-",command].join unless command.starts_with?("-")
 command = "dfs -fs #{Hdfs.root(cluster)}/ #{command}"
-Hadoop.run(command,
+Hadoop.run(cluster,command,user)
 end
 
-
-
-cluster
-
-
-
-
-
+#return the size in bytes for an Hdfs file
+def Hdfs.file_size(url,user_name)
+cluster = url.split("://").last.split("/").first
+hdfs_url = Hdfs.hdfs_url(url)
+response = Hadoop.run(cluster, "dfs -du '#{hdfs_url}'", user_name)
+if response['exit_code'] != 0
+raise "Unable to get file size for #{url} with error: #{response['stderr']}"
+else
+#parse out response
+return response['stdout'].split("\n")[1].split(" ")[1].to_i
 end
 end
 
-def Hdfs.
-
-
-
-
-
-rescue
-return false
+def Hdfs.read_by_dataset_path(dst_path,user_name,*args)
+cluster = dst_path.split("/").first
+url = Hdfs.url_by_path(dst_path,user_name)
+#make sure file is not too big
+if Hdfs.file_size(url,user_name) >= Hadoop.read_limit
+raise "Hadoop read limit reached -- please reduce query size"
 end
-
-
-def Hdfs.read(path,user)
-cluster, cluster_path = Hdfs.resolve_path(path)
-gateway_node = Hadoop.gateway_node(cluster)
+hdfs_url = Hdfs.hdfs_url(url)
 #need to direct stderr to dev null since hdfs throws errors at being headed off
-
-
-response
-
-
+read_command = "dfs -cat '#{hdfs_url}'"
+response = Hadoop.run(cluster,read_command,user_name)
+if response['exit_code'] != 0
+raise "Unable to read from #{url} with error: #{response['stderr']}"
+else
+return response['stdout']
 end
-response
 end
 
-
-
-
-
-
-
+#used for writing strings straight up to hdfs
+def Hdfs.write_by_dataset_path(dst_path,string,user_name)
+cluster = dst_path.split("/").first
+url = Hdfs.url_by_path(dst_path,user_name)
+hdfs_url = Hdfs.hdfs_url(url)
+response = Hdfs.write(cluster,hdfs_url,string,user_name)
+if response['exit_code'] != 0
+raise "Unable to write to #{url} with error: #{response['stderr']}"
 else
-
-return [Hadoop.output_cluster,"/#{path.to_s}"]
+return response
 end
 end
 
-def Hdfs.
-cluster, cluster_path = Hdfs.resolve_path(path)
-"#{Hdfs.root(cluster)}#{cluster_path}"
-end
-
-def Hdfs.write(path,string,user)
+def Hdfs.write(cluster,hdfs_url,string,user_name)
 file_hash = {'file.txt'=>string}
-
-
-
-
-
-
-
-def Hdfs.copy(source_path,target_path,user)
-Hdfs.rm(target_path,user) #remove to_path
-source_cluster = Hdfs.resolve_path(source_path).first
-command = "dfs -cp '#{Hdfs.namenode_path(source_path)}' '#{Hdfs.namenode_path(target_path)}'"
-#copy operation implies access to target_url from source_cluster
-Hadoop.run(command,source_cluster,user)
-return Hdfs.namenode_path(target_path)
+#make sure path is clear
+delete_command = "dfs -rm '#{hdfs_url}'"
+Hadoop.run(cluster,delete_command,user_name)
+write_command = "dfs -copyFromLocal file.txt '#{hdfs_url}'"
+response = Hadoop.run(cluster,write_command,user_name,file_hash)
+response
 end
 
-
-
-
-
-
-
-#
-source_cluster
-
-
-
-
-
-
-
+#copy file from one url to another
+#source cluster must be able to issue copy command to target cluster
+def Hdfs.copy(source_url, target_url, user_name)
+#convert aliases
+source_hdfs_url = Hdfs.hdfs_url(source_url)
+target_hdfs_url = Hdfs.hdfs_url(target_url)
+#get cluster names
+source_cluster = source_url.split("://").last.split("/").first
+target_cluster = target_url.split("://").last.split("/").first
+#delete target
+delete_command = "dfs -rm '#{target_hdfs_url}'"
+Hadoop.run(target_cluster,delete_command,user_name)
+#copy source to target
+copy_command = "dfs -cp '#{source_hdfs_url}' '#{target_hdfs_url}'"
+response = Hadoop.run(source_cluster,copy_command,user_name)
+if response['exit_code'] != 0
+raise "Unable to copy #{source_url} to #{target_url} with error: #{response['stderr']}"
+else
+return target_url
 end
-
-source_path = "#{source_cluster}#{source_cluster_path}"
-out_string = Hdfs.read(source_path,user).to_s
-out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
-Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
-out_url
 end
 
-
+# converts a source path or target path to a dst in the context of handler and stage
+def Hdfs.path_to_dst(path,stage_path)
+has_handler = true if path.index("://")
 s = Stage.where(:path=>stage_path).first
-u = s.job.runner.user
 params = s.params
-source_path = params['source']
 target_path = params['target']
-
-
-
-
-
-
-
-
-
-
-
-source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
-source_path = "#{source_cluster}#{source_cluster_path}"
-source_dst = Dataset.find_or_create_by_handler_and_path("hdfs",source_path)
-in_string = source_dst.read(user)
-raise "No data found at hdfs://#{source_path}" unless in_string.to_s.length>0
+is_target = true if path == target_path
+red_path = path.split("://").last
+cluster = red_path.split("/").first
+#is user has a handler, is specifying a target,
+#has more than 1 slash,
+#or their first path node is a cluster name
+#assume it's an hdfs pointer
+if is_target or has_handler or Hadoop.clusters.include?(cluster) or red_path.split("/").length>2
+user_name = Hdfs.user_name_by_stage_path(stage_path)
+hdfs_url = Hdfs.url_by_path(red_path,user_name,is_target)
+return Dataset.find_or_create_by_url(hdfs_url)
 end
+#otherwise, use ssh convention
+return Ssh.path_to_dst(path,stage_path)
+end
 
-
-
-
-
-
-
-
-
+def Hdfs.url_by_path(path,user_name,is_target=false)
+cluster = path.split("/").first.to_s
+if Hadoop.clusters.include?(cluster)
+#cut node out of path
+path = "/" + path.split("/")[1..-1].join("/")
+else
+cluster = Hadoop.default_cluster
+path = path.starts_with?("/") ? path : "/#{path}"
+end
+url = "hdfs://#{cluster}#{path}"
+hdfs_url = Hdfs.hdfs_url(url)
+begin
+response = Hadoop.run(cluster, "fs -tail '#{hdfs_url}'", user_name)
+if response['exit_code']==0 or is_target
+return "hdfs://#{cluster}#{path}"
+else
+raise "Unable to find #{url} with error: #{response['stderr']}"
+end
+rescue => exc
+raise Exception, "Unable to find #{url} with error: #{exc.to_s}", exc.backtrace
 end
-
-target_path = "#{target_cluster}#{target_cluster_path}"
-out_string = Hdfs.write(target_path,in_string,user)
-
-out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
-Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
-out_url
 end
 
-def Hdfs.
+def Hdfs.user_name_by_stage_path(stage_path,cluster=nil)
 s = Stage.where(:path=>stage_path).first
 u = s.job.runner.user
-
-
-
-
-
-
-
-
-#determine cluster for target
-target_cluster, target_cluster_path = Hdfs.resolve_path(target_path)
-raise "unable to resolve target path" if target_cluster.nil?
-
-node = Hadoop.gateway_node(source_cluster)
-if user and !Ssh.sudoers(node).include?(u.name)
-raise "#{u.name} does not have su permissions for #{node}"
-elsif user.nil? and Ssh.su_all_users(node)
-user = u.name
+user_name = s.params['user']
+cluster ||= s.params['cluster']
+cluster = Hadoop.default_cluster unless Hadoop.clusters.include?(cluster)
+node = Hadoop.gateway_node(cluster)
+if user_name and !Ssh.sudoers(node).include?(u.name)
+raise "#{u.name} does not have su permissions for node #{node}"
+elsif user_name.nil? and Ssh.su_all_users(node)
+user_name = u.name
 end
-
-source_path = "#{source_cluster}#{source_cluster_path}"
-target_path = "#{target_cluster}#{target_cluster_path}"
-out_string = Hdfs.copy(source_path,target_path,user)
-
-out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
-Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
-out_url
-end
-
-def Hdfs.read_by_dataset_path(dst_path,user)
-Hdfs.read(dst_path,user)
+return user_name
 end
 
-def Hdfs.
-
+def Hdfs.write_by_stage_path(stage_path)
+s = Stage.where(:path=>stage_path).first
+source = s.sources.first
+target = s.target
+cluster = target.url.split("://").last.split("/").first
+user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
+stdout = if source.handler == 'hdfs'
+Hdfs.copy(source.url,target.url,user_name)
+elsif ["gsheet","gfile","ssh"].include?(source.handler)
+in_string = source.read(user_name)
+Dataset.write_by_url(target.url, in_string, user_name)
+end
+return {'out_str'=>stdout, 'signal' => 0}
 end
 end
 end
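Most of the new hdfs.rb methods lean on the `Hdfs.root`/`Hdfs.hdfs_url` pair introduced at the top of this diff: a cluster alias inside an `hdfs://` URL is swapped for that cluster's namenode host and port before any `dfs` command is issued. A standalone sketch of just that conversion, with placeholder namenode values:

```ruby
# Alias-to-namenode conversion mirroring Hdfs.root / Hdfs.hdfs_url above;
# the namenode hosts and ports are placeholders.
NAMENODES = {
  'test_cluster'   => { 'name' => 'nn1.example.com', 'port' => 50070 },
  'test_cluster_2' => { 'name' => 'nn2.example.com', 'port' => 50070 }
}

def root(cluster)
  namenode = NAMENODES[cluster]
  "hdfs://#{namenode['name']}:#{namenode['port']}"
end

def hdfs_url(url)
  cluster = url.split("hdfs://").last.split("/").first
  # replace only the first occurrence of the alias
  url.sub("hdfs://#{cluster}", root(cluster))
end

puts hdfs_url("hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out")
# => hdfs://nn2.example.com:50070/user/mobilize/test/test_hdfs_1_copy.out
```

`Hdfs.copy` then runs the `dfs -cp` from the source cluster's gateway against both expanded URLs, which is why the README notes that the source cluster's gateway_node must be able to reach the target cluster.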
data/lib/samples/hadoop.yml
CHANGED
@@ -1,6 +1,5 @@
 ---
 development:
-output_cluster: dev_cluster
 output_dir: /user/mobilize/development/
 read_limit: 1000000000
 clusters:
@@ -17,7 +16,6 @@ development:
 gateway_node: dev_hadoop_host
 exec_path: /path/to/hadoop
 test:
-output_cluster: test_cluster
 output_dir: /user/mobilize/test/
 read_limit: 1000000000
 clusters:
@@ -34,7 +32,6 @@ test:
 gateway_node: test_hadoop_host
 exec_path: /path/to/hadoop
 production:
-output_cluster: prod_cluster
 output_dir: /user/mobilize/production/
 read_limit: 1000000000
 clusters:
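With output_cluster removed from every environment block, the first entry under clusters: is what `Hadoop.default_cluster` picks up. A quick way to confirm what a given hadoop.yml will default to; the file path and the environment key here are assumptions based on the sample layout, not something this diff specifies:

```ruby
# Print the cluster a hadoop.yml will treat as the default (the first listed).
# File path and environment name are illustrative assumptions.
require 'yaml'

conf = YAML.load_file('lib/samples/hadoop.yml')
puts conf['test']['clusters'].keys.first
```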
data/mobilize-hdfs.gemspec
CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |gem|
 gem.name = "mobilize-hdfs"
 gem.version = Mobilize::Hdfs::VERSION
 gem.authors = ["Cassio Paes-Leme"]
-gem.email = ["cpaesleme@
+gem.email = ["cpaesleme@dena.com"]
 gem.description = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
 gem.summary = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
 gem.homepage = "http://github.com/dena/mobilize-hdfs"
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
 gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
 gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
 gem.require_paths = ["lib"]
-gem.add_runtime_dependency "mobilize-ssh","1.
+gem.add_runtime_dependency "mobilize-ssh","1.2"
 end
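The gemspec now pins mobilize-ssh to the matching 1.2 release, so the two gems move in lockstep. A Gemfile tracking this release would look roughly like the following (the rubygems source line is the usual convention, not something this diff specifies):

```ruby
# Example Gemfile entry; mobilize-ssh 1.2 comes along as the pinned runtime dependency.
source "https://rubygems.org"

gem "mobilize-hdfs", "1.2"
```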
data/test/hdfs_job_rows.yml
CHANGED
@@ -2,10 +2,9 @@
 active: true
 trigger: once
 status: ""
-stage1: hdfs.write target:"/user/mobilize/test/test_hdfs_1.out",
-source:"
-stage2: hdfs.
+stage1: hdfs.write target:"/user/mobilize/test/test_hdfs_1.out",
+source:"test_hdfs_1.in"
+stage2: hdfs.write source:"/user/mobilize/test/test_hdfs_1.out",
 target:"test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
-stage3:
-stage4: gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
+stage3: gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
 target:"Runner_mobilize(test)/test_hdfs_1_copy.out"
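The fixture above now describes a three-stage job (two hdfs.write stages and a final gsheet.write), matching the README walkthrough. A sketch of how such a row parses as YAML; the `name` key and the list wrapper are assumptions, since the fixture's first line is outside this diff:

```ruby
# Parse a job row shaped like the fixture above (keys outside the diff are assumed).
require 'yaml'

rows = YAML.load(<<~'YML')
  - name: test_hdfs_1
    active: true
    trigger: once
    status: ""
    stage1: hdfs.write target:"/user/mobilize/test/test_hdfs_1.out", source:"test_hdfs_1.in"
    stage2: hdfs.write source:"/user/mobilize/test/test_hdfs_1.out", target:"test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out"
    stage3: gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out", target:"Runner_mobilize(test)/test_hdfs_1_copy.out"
YML

puts rows.first['stage2']
```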
data/test/mobilize-hdfs_test.rb
CHANGED
@@ -34,9 +34,9 @@ describe "Mobilize" do
 hdfs_1_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
 [hdfs_1_target_sheet].each {|s| s.delete if s}
 
-puts "job row added, force enqueued requestor, wait
+puts "job row added, force enqueued requestor, wait for stages"
 r.enqueue!
-
+wait_for_stages
 
 puts "jobtracker posted data to test sheet"
 test_destination_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
@@ -44,4 +44,27 @@ describe "Mobilize" do
 assert test_destination_sheet.read(u.name).length == 599
 end
 
+def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
+time = 0
+time_since_stage = 0
+#check for 10 min
+while time < time_limit and time_since_stage < stage_limit
+sleep wait_length
+job_classes = Mobilize::Resque.jobs.map{|j| j['class']}
+if job_classes.include?("Mobilize::Stage")
+time_since_stage = 0
+puts "saw stage at #{time.to_s} seconds"
+else
+time_since_stage += wait_length
+puts "#{time_since_stage.to_s} seconds since stage seen"
+end
+time += wait_length
+puts "total wait time #{time.to_s} seconds"
+end
+
+if time >= time_limit
+raise "Timed out before stage completion"
+end
+end
+
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mobilize-hdfs
 version: !ruby/object:Gem::Version
-version: 1.
+version: '1.2'
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-03-
+date: 2013-03-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: mobilize-ssh
@@ -18,7 +18,7 @@ dependencies:
 requirements:
 - - '='
 - !ruby/object:Gem::Version
-version: 1.
+version: '1.2'
 type: :runtime
 prerelease: false
 version_requirements: !ruby/object:Gem::Requirement
@@ -26,10 +26,10 @@ dependencies:
 requirements:
 - - '='
 - !ruby/object:Gem::Version
-version: 1.
+version: '1.2'
 description: Adds hdfs read, write, and copy support to mobilize-ssh
 email:
-- cpaesleme@
+- cpaesleme@dena.com
 executables: []
 extensions: []
 extra_rdoc_files: []