mobilize-hdfs 1.0.10 → 1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +13 -22
- data/lib/mobilize-hdfs/handlers/hadoop.rb +11 -11
- data/lib/mobilize-hdfs/handlers/hdfs.rb +129 -146
- data/lib/mobilize-hdfs/version.rb +1 -1
- data/lib/samples/hadoop.yml +0 -3
- data/mobilize-hdfs.gemspec +2 -2
- data/test/hdfs_job_rows.yml +4 -5
- data/test/mobilize-hdfs_test.rb +25 -2
- metadata +5 -5
data/README.md
CHANGED
@@ -94,14 +94,11 @@ be read. If the data is bigger than the read limit, an exception will be
|
|
94
94
|
raised.
|
95
95
|
|
96
96
|
The Hadoop configuration consists of:
|
97
|
-
* output_cluster, which is the cluster where stage outputs will be
|
98
|
-
stored. Clusters are defined in the clusters parameter as described
|
99
|
-
below.
|
100
97
|
* output_dir, which is the absolute path to the directory in HDFS that will store stage
|
101
|
-
outputs. Directory names should end with a slash (/).
|
98
|
+
outputs. Directory names should end with a slash (/). It will choose the
|
99
|
+
first cluster as the default cluster to write to.
|
102
100
|
* read_limit, which is the maximum size of data that can be read from the
|
103
|
-
cluster.
|
104
|
-
-c <size limit>. Default is 1GB.
|
101
|
+
cluster. Default is 1GB.
|
105
102
|
* clusters - this defines aliases for clusters, which are used as
|
106
103
|
parameters for Hdfs stages. Cluster aliases contain 5 parameters:
|
107
104
|
* namenode - defines the name and port for accessing the namenode
|
@@ -118,7 +115,6 @@ Sample hadoop.yml:
|
|
118
115
|
``` yml
|
119
116
|
---
|
120
117
|
development:
|
121
|
-
output_cluster: dev_cluster
|
122
118
|
output_dir: /user/mobilize/development/
|
123
119
|
read_limit: 1000000000
|
124
120
|
clusters:
|
@@ -135,7 +131,6 @@ development:
|
|
135
131
|
gateway_node: dev_hadoop_host
|
136
132
|
exec_path: /path/to/hadoop
|
137
133
|
test:
|
138
|
-
output_cluster: test_cluster
|
139
134
|
output_dir: /user/mobilize/test/
|
140
135
|
read_limit: 1000000000
|
141
136
|
clusters:
|
@@ -152,7 +147,6 @@ test:
|
|
152
147
|
gateway_node: test_hadoop_host
|
153
148
|
exec_path: /path/to/hadoop
|
154
149
|
production:
|
155
|
-
output_cluster: prod_cluster
|
156
150
|
output_dir: /user/mobilize/production/
|
157
151
|
read_limit: 1000000000
|
158
152
|
clusters:
|
@@ -181,17 +175,15 @@ Start
|
|
181
175
|
* cluster and user are optional for all of the below.
|
182
176
|
* cluster defaults to the first listed cluster;
|
183
177
|
* user is treated the same way as in [mobilize-ssh][mobilize-ssh].
|
184
|
-
* hdfs.
|
185
|
-
*
|
186
|
-
*
|
187
|
-
* The gsheet_full_path should be of the form `<gbook_name>/<gsheet_name>`. The test uses "Requestor_mobilize(test)/test_hdfs_1.in".
|
188
|
-
* The hdfs_full_path is the cluster alias followed by full path on the cluster.
|
178
|
+
* hdfs.write `source:<full_path>, target:<hdfs_full_path>, user:<user>`
|
179
|
+
* The full_path can use `<gsheet_path>` or `<hdfs_path>`. The test uses "test_hdfs_1.in".
|
180
|
+
* `<hdfs_path>` is the cluster alias followed by absolute path on the cluster.
|
189
181
|
* if a full path is supplied without a preceding cluster alias (e.g. "/user/mobilize/test/test_hdfs_1.in"),
|
190
|
-
the
|
182
|
+
the first listed cluster will be used as the default.
|
191
183
|
* The test uses "/user/mobilize/test/test_hdfs_1.in" for the initial
|
192
184
|
write, then "test_cluster_2/user/mobilize/test/test_hdfs_copy.out" for
|
193
|
-
the
|
194
|
-
* both cluster arguments and user are optional. If
|
185
|
+
the cross-cluster write.
|
186
|
+
* both cluster arguments and user are optional. If writing from
|
195
187
|
one cluster to another, your source_cluster gateway_node must be able to
|
196
188
|
access both clusters.
|
197
189
|
|
@@ -216,12 +208,11 @@ same cluster as your first.
|
|
216
208
|
|
217
209
|
3) $ rake test
|
218
210
|
|
219
|
-
* The test runs a
|
211
|
+
* The test runs a 3 stage job:
|
220
212
|
* test_hdfs_1:
|
221
|
-
* `hdfs.write target:"/user/mobilize/test/test_hdfs_1.out", source:"
|
222
|
-
* `hdfs.
|
223
|
-
* `
|
224
|
-
* `gsheet.write source:"stage3", target:"Runner_mobilize(test)/test_hdfs_1_copy.out"`
|
213
|
+
* `hdfs.write target:"/user/mobilize/test/test_hdfs_1.out", source:"test_hdfs_1.in"`
|
214
|
+
* `hdfs.write source:"/user/mobilize/test/test_hdfs_1.out",target:"test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out"`
|
215
|
+
* `gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out", target:"test_hdfs_1_copy.out"`
|
225
216
|
* at the end of the test, there should be a sheet named "test_hdfs_1_copy.out" with the same data as test_hdfs_1.in
|
226
217
|
|
227
218
|
<a name='section_Meta'></a>
|
@@ -9,15 +9,15 @@ module Mobilize
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def Hadoop.gateway_node(cluster)
|
12
|
-
Hadoop.clusters[cluster]['gateway_node']
|
12
|
+
Hadoop.config['clusters'][cluster]['gateway_node']
|
13
13
|
end
|
14
14
|
|
15
15
|
def Hadoop.clusters
|
16
|
-
Hadoop.config['clusters']
|
16
|
+
Hadoop.config['clusters'].keys
|
17
17
|
end
|
18
18
|
|
19
|
-
def Hadoop.
|
20
|
-
Hadoop.
|
19
|
+
def Hadoop.default_cluster
|
20
|
+
Hadoop.clusters.first
|
21
21
|
end
|
22
22
|
|
23
23
|
def Hadoop.output_dir
|
@@ -28,20 +28,20 @@ module Mobilize
|
|
28
28
|
Hadoop.config['read_limit']
|
29
29
|
end
|
30
30
|
|
31
|
-
def Hadoop.job(command,
|
31
|
+
def Hadoop.job(cluster,command,user,file_hash={})
|
32
32
|
command = ["-",command].join unless command.starts_with?("-")
|
33
|
-
Hadoop.run("job -fs #{Hdfs.root(cluster)} #{command}",
|
33
|
+
Hadoop.run(cluster,"job -fs #{Hdfs.root(cluster)} #{command}",user,file_hash).ie do |r|
|
34
34
|
r.class==Array ? r.first : r
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
38
38
|
def Hadoop.job_list(cluster)
|
39
|
-
raw_list = Hadoop.job("list"
|
39
|
+
raw_list = Hadoop.job(cluster,"list")
|
40
40
|
raw_list.split("\n")[1..-1].join("\n").tsv_to_hash_array
|
41
41
|
end
|
42
42
|
|
43
|
-
def Hadoop.job_status(
|
44
|
-
raw_status = Hadoop.job("status #{
|
43
|
+
def Hadoop.job_status(cluster,hadoop_job_id)
|
44
|
+
raw_status = Hadoop.job(cluster,"status #{hadoop_job_id}",{})
|
45
45
|
dhash_status = raw_status.strip.split("\n").map do |sline|
|
46
46
|
delim_index = [sline.index("="),sline.index(":")].compact.min
|
47
47
|
if delim_index
|
@@ -54,14 +54,14 @@ module Mobilize
|
|
54
54
|
hash_status
|
55
55
|
end
|
56
56
|
|
57
|
-
def Hadoop.run(command,
|
57
|
+
def Hadoop.run(cluster,command,user_name,file_hash={})
|
58
58
|
h_command = if command.starts_with?("hadoop")
|
59
59
|
command.sub("hadoop",Hadoop.exec_path(cluster))
|
60
60
|
else
|
61
61
|
"#{Hadoop.exec_path(cluster)} #{command}"
|
62
62
|
end
|
63
63
|
gateway_node = Hadoop.gateway_node(cluster)
|
64
|
-
Ssh.run(gateway_node,h_command,
|
64
|
+
Ssh.run(gateway_node,h_command,user_name,file_hash)
|
65
65
|
end
|
66
66
|
end
|
67
67
|
end
|
@@ -1,190 +1,173 @@
|
|
1
1
|
module Mobilize
|
2
2
|
module Hdfs
|
3
|
+
#returns the hdfs path to the root of the cluster
|
3
4
|
def Hdfs.root(cluster)
|
4
|
-
namenode = Hadoop.clusters[cluster]['namenode']
|
5
|
+
namenode = Hadoop.config['clusters'][cluster]['namenode']
|
5
6
|
"hdfs://#{namenode['name']}:#{namenode['port']}"
|
6
7
|
end
|
7
8
|
|
8
|
-
|
9
|
+
#replaces the cluster alias with a proper namenode path
|
10
|
+
def Hdfs.hdfs_url(url)
|
11
|
+
cluster = url.split("hdfs://").last.split("/").first
|
12
|
+
#replace first instance
|
13
|
+
url.sub("hdfs://#{cluster}",Hdfs.root(cluster))
|
14
|
+
end
|
15
|
+
|
16
|
+
def Hdfs.run(cluster,command,user)
|
9
17
|
command = ["-",command].join unless command.starts_with?("-")
|
10
18
|
command = "dfs -fs #{Hdfs.root(cluster)}/ #{command}"
|
11
|
-
Hadoop.run(command,
|
19
|
+
Hadoop.run(cluster,command,user)
|
12
20
|
end
|
13
21
|
|
14
|
-
|
15
|
-
|
16
|
-
cluster
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
+
#return the size in bytes for an Hdfs file
|
23
|
+
def Hdfs.file_size(url,user_name)
|
24
|
+
cluster = url.split("://").last.split("/").first
|
25
|
+
hdfs_url = Hdfs.hdfs_url(url)
|
26
|
+
response = Hadoop.run(cluster, "dfs -du '#{hdfs_url}'", user_name)
|
27
|
+
if response['exit_code'] != 0
|
28
|
+
raise "Unable to get file size for #{url} with error: #{response['stderr']}"
|
29
|
+
else
|
30
|
+
#parse out response
|
31
|
+
return response['stdout'].split("\n")[1].split(" ")[1].to_i
|
22
32
|
end
|
23
33
|
end
|
24
34
|
|
25
|
-
def Hdfs.
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
rescue
|
32
|
-
return false
|
35
|
+
def Hdfs.read_by_dataset_path(dst_path,user_name,*args)
|
36
|
+
cluster = dst_path.split("/").first
|
37
|
+
url = Hdfs.url_by_path(dst_path,user_name)
|
38
|
+
#make sure file is not too big
|
39
|
+
if Hdfs.file_size(url,user_name) >= Hadoop.read_limit
|
40
|
+
raise "Hadoop read limit reached -- please reduce query size"
|
33
41
|
end
|
34
|
-
|
35
|
-
|
36
|
-
def Hdfs.read(path,user)
|
37
|
-
cluster, cluster_path = Hdfs.resolve_path(path)
|
38
|
-
gateway_node = Hadoop.gateway_node(cluster)
|
42
|
+
hdfs_url = Hdfs.hdfs_url(url)
|
39
43
|
#need to direct stderr to dev null since hdfs throws errors at being headed off
|
40
|
-
|
41
|
-
|
42
|
-
response
|
43
|
-
|
44
|
-
|
44
|
+
read_command = "dfs -cat '#{hdfs_url}'"
|
45
|
+
response = Hadoop.run(cluster,read_command,user_name)
|
46
|
+
if response['exit_code'] != 0
|
47
|
+
raise "Unable to read from #{url} with error: #{response['stderr']}"
|
48
|
+
else
|
49
|
+
return response['stdout']
|
45
50
|
end
|
46
|
-
response
|
47
51
|
end
|
48
52
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
53
|
+
#used for writing strings straight up to hdfs
|
54
|
+
def Hdfs.write_by_dataset_path(dst_path,string,user_name)
|
55
|
+
cluster = dst_path.split("/").first
|
56
|
+
url = Hdfs.url_by_path(dst_path,user_name)
|
57
|
+
hdfs_url = Hdfs.hdfs_url(url)
|
58
|
+
response = Hdfs.write(cluster,hdfs_url,string,user_name)
|
59
|
+
if response['exit_code'] != 0
|
60
|
+
raise "Unable to write to #{url} with error: #{response['stderr']}"
|
55
61
|
else
|
56
|
-
|
57
|
-
return [Hadoop.output_cluster,"/#{path.to_s}"]
|
62
|
+
return response
|
58
63
|
end
|
59
64
|
end
|
60
65
|
|
61
|
-
def Hdfs.
|
62
|
-
cluster, cluster_path = Hdfs.resolve_path(path)
|
63
|
-
"#{Hdfs.root(cluster)}#{cluster_path}"
|
64
|
-
end
|
65
|
-
|
66
|
-
def Hdfs.write(path,string,user)
|
66
|
+
def Hdfs.write(cluster,hdfs_url,string,user_name)
|
67
67
|
file_hash = {'file.txt'=>string}
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
def Hdfs.copy(source_path,target_path,user)
|
76
|
-
Hdfs.rm(target_path,user) #remove to_path
|
77
|
-
source_cluster = Hdfs.resolve_path(source_path).first
|
78
|
-
command = "dfs -cp '#{Hdfs.namenode_path(source_path)}' '#{Hdfs.namenode_path(target_path)}'"
|
79
|
-
#copy operation implies access to target_url from source_cluster
|
80
|
-
Hadoop.run(command,source_cluster,user)
|
81
|
-
return Hdfs.namenode_path(target_path)
|
68
|
+
#make sure path is clear
|
69
|
+
delete_command = "dfs -rm '#{hdfs_url}'"
|
70
|
+
Hadoop.run(cluster,delete_command,user_name)
|
71
|
+
write_command = "dfs -copyFromLocal file.txt '#{hdfs_url}'"
|
72
|
+
response = Hadoop.run(cluster,write_command,user_name,file_hash)
|
73
|
+
response
|
82
74
|
end
|
83
75
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
#
|
91
|
-
source_cluster
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
76
|
+
#copy file from one url to another
|
77
|
+
#source cluster must be able to issue copy command to target cluster
|
78
|
+
def Hdfs.copy(source_url, target_url, user_name)
|
79
|
+
#convert aliases
|
80
|
+
source_hdfs_url = Hdfs.hdfs_url(source_url)
|
81
|
+
target_hdfs_url = Hdfs.hdfs_url(target_url)
|
82
|
+
#get cluster names
|
83
|
+
source_cluster = source_url.split("://").last.split("/").first
|
84
|
+
target_cluster = target_url.split("://").last.split("/").first
|
85
|
+
#delete target
|
86
|
+
delete_command = "dfs -rm '#{target_hdfs_url}'"
|
87
|
+
Hadoop.run(target_cluster,delete_command,user_name)
|
88
|
+
#copy source to target
|
89
|
+
copy_command = "dfs -cp '#{source_hdfs_url}' '#{target_hdfs_url}'"
|
90
|
+
response = Hadoop.run(source_cluster,copy_command,user_name)
|
91
|
+
if response['exit_code'] != 0
|
92
|
+
raise "Unable to copy #{source_url} to #{target_url} with error: #{response['stderr']}"
|
93
|
+
else
|
94
|
+
return target_url
|
99
95
|
end
|
100
|
-
|
101
|
-
source_path = "#{source_cluster}#{source_cluster_path}"
|
102
|
-
out_string = Hdfs.read(source_path,user).to_s
|
103
|
-
out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
|
104
|
-
Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
|
105
|
-
out_url
|
106
96
|
end
|
107
97
|
|
108
|
-
|
98
|
+
# converts a source path or target path to a dst in the context of handler and stage
|
99
|
+
def Hdfs.path_to_dst(path,stage_path)
|
100
|
+
has_handler = true if path.index("://")
|
109
101
|
s = Stage.where(:path=>stage_path).first
|
110
|
-
u = s.job.runner.user
|
111
102
|
params = s.params
|
112
|
-
source_path = params['source']
|
113
103
|
target_path = params['target']
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
source_cluster, source_cluster_path = Hdfs.resolve_path(source_path)
|
126
|
-
source_path = "#{source_cluster}#{source_cluster_path}"
|
127
|
-
source_dst = Dataset.find_or_create_by_handler_and_path("hdfs",source_path)
|
128
|
-
in_string = source_dst.read(user)
|
129
|
-
raise "No data found at hdfs://#{source_path}" unless in_string.to_s.length>0
|
104
|
+
is_target = true if path == target_path
|
105
|
+
red_path = path.split("://").last
|
106
|
+
cluster = red_path.split("/").first
|
107
|
+
#if user has a handler, is specifying a target,
|
108
|
+
#has more than 1 slash,
|
109
|
+
#or their first path node is a cluster name
|
110
|
+
#assume it's an hdfs pointer
|
111
|
+
if is_target or has_handler or Hadoop.clusters.include?(cluster) or red_path.split("/").length>2
|
112
|
+
user_name = Hdfs.user_name_by_stage_path(stage_path)
|
113
|
+
hdfs_url = Hdfs.url_by_path(red_path,user_name,is_target)
|
114
|
+
return Dataset.find_or_create_by_url(hdfs_url)
|
130
115
|
end
|
116
|
+
#otherwise, use ssh convention
|
117
|
+
return Ssh.path_to_dst(path,stage_path)
|
118
|
+
end
|
131
119
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
120
|
+
def Hdfs.url_by_path(path,user_name,is_target=false)
|
121
|
+
cluster = path.split("/").first.to_s
|
122
|
+
if Hadoop.clusters.include?(cluster)
|
123
|
+
#cut node out of path
|
124
|
+
path = "/" + path.split("/")[1..-1].join("/")
|
125
|
+
else
|
126
|
+
cluster = Hadoop.default_cluster
|
127
|
+
path = path.starts_with?("/") ? path : "/#{path}"
|
128
|
+
end
|
129
|
+
url = "hdfs://#{cluster}#{path}"
|
130
|
+
hdfs_url = Hdfs.hdfs_url(url)
|
131
|
+
begin
|
132
|
+
response = Hadoop.run(cluster, "fs -tail '#{hdfs_url}'", user_name)
|
133
|
+
if response['exit_code']==0 or is_target
|
134
|
+
return "hdfs://#{cluster}#{path}"
|
135
|
+
else
|
136
|
+
raise "Unable to find #{url} with error: #{response['stderr']}"
|
137
|
+
end
|
138
|
+
rescue => exc
|
139
|
+
raise Exception, "Unable to find #{url} with error: #{exc.to_s}", exc.backtrace
|
141
140
|
end
|
142
|
-
|
143
|
-
target_path = "#{target_cluster}#{target_cluster_path}"
|
144
|
-
out_string = Hdfs.write(target_path,in_string,user)
|
145
|
-
|
146
|
-
out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
|
147
|
-
Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
|
148
|
-
out_url
|
149
141
|
end
|
150
142
|
|
151
|
-
def Hdfs.
|
143
|
+
def Hdfs.user_name_by_stage_path(stage_path,cluster=nil)
|
152
144
|
s = Stage.where(:path=>stage_path).first
|
153
145
|
u = s.job.runner.user
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
#determine cluster for target
|
163
|
-
target_cluster, target_cluster_path = Hdfs.resolve_path(target_path)
|
164
|
-
raise "unable to resolve target path" if target_cluster.nil?
|
165
|
-
|
166
|
-
node = Hadoop.gateway_node(source_cluster)
|
167
|
-
if user and !Ssh.sudoers(node).include?(u.name)
|
168
|
-
raise "#{u.name} does not have su permissions for #{node}"
|
169
|
-
elsif user.nil? and Ssh.su_all_users(node)
|
170
|
-
user = u.name
|
146
|
+
user_name = s.params['user']
|
147
|
+
cluster ||= s.params['cluster']
|
148
|
+
cluster = Hadoop.default_cluster unless Hadoop.clusters.include?(cluster)
|
149
|
+
node = Hadoop.gateway_node(cluster)
|
150
|
+
if user_name and !Ssh.sudoers(node).include?(u.name)
|
151
|
+
raise "#{u.name} does not have su permissions for node #{node}"
|
152
|
+
elsif user_name.nil? and Ssh.su_all_users(node)
|
153
|
+
user_name = u.name
|
171
154
|
end
|
172
|
-
|
173
|
-
source_path = "#{source_cluster}#{source_cluster_path}"
|
174
|
-
target_path = "#{target_cluster}#{target_cluster_path}"
|
175
|
-
out_string = Hdfs.copy(source_path,target_path,user)
|
176
|
-
|
177
|
-
out_url = "hdfs://#{Hadoop.output_cluster}#{Hadoop.output_dir}hdfs/#{stage_path}/out"
|
178
|
-
Dataset.write_by_url(out_url,out_string,Gdrive.owner_name)
|
179
|
-
out_url
|
180
|
-
end
|
181
|
-
|
182
|
-
def Hdfs.read_by_dataset_path(dst_path,user)
|
183
|
-
Hdfs.read(dst_path,user)
|
155
|
+
return user_name
|
184
156
|
end
|
185
157
|
|
186
|
-
def Hdfs.
|
187
|
-
|
158
|
+
def Hdfs.write_by_stage_path(stage_path)
|
159
|
+
s = Stage.where(:path=>stage_path).first
|
160
|
+
source = s.sources.first
|
161
|
+
target = s.target
|
162
|
+
cluster = target.url.split("://").last.split("/").first
|
163
|
+
user_name = Hdfs.user_name_by_stage_path(stage_path,cluster)
|
164
|
+
stdout = if source.handler == 'hdfs'
|
165
|
+
Hdfs.copy(source.url,target.url,user_name)
|
166
|
+
elsif ["gsheet","gfile","ssh"].include?(source.handler)
|
167
|
+
in_string = source.read(user_name)
|
168
|
+
Dataset.write_by_url(target.url, in_string, user_name)
|
169
|
+
end
|
170
|
+
return {'out_str'=>stdout, 'signal' => 0}
|
188
171
|
end
|
189
172
|
end
|
190
173
|
end
|
data/lib/samples/hadoop.yml
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
---
|
2
2
|
development:
|
3
|
-
output_cluster: dev_cluster
|
4
3
|
output_dir: /user/mobilize/development/
|
5
4
|
read_limit: 1000000000
|
6
5
|
clusters:
|
@@ -17,7 +16,6 @@ development:
|
|
17
16
|
gateway_node: dev_hadoop_host
|
18
17
|
exec_path: /path/to/hadoop
|
19
18
|
test:
|
20
|
-
output_cluster: test_cluster
|
21
19
|
output_dir: /user/mobilize/test/
|
22
20
|
read_limit: 1000000000
|
23
21
|
clusters:
|
@@ -34,7 +32,6 @@ test:
|
|
34
32
|
gateway_node: test_hadoop_host
|
35
33
|
exec_path: /path/to/hadoop
|
36
34
|
production:
|
37
|
-
output_cluster: prod_cluster
|
38
35
|
output_dir: /user/mobilize/production/
|
39
36
|
read_limit: 1000000000
|
40
37
|
clusters:
|
data/mobilize-hdfs.gemspec
CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.name = "mobilize-hdfs"
|
8
8
|
gem.version = Mobilize::Hdfs::VERSION
|
9
9
|
gem.authors = ["Cassio Paes-Leme"]
|
10
|
-
gem.email = ["cpaesleme@
|
10
|
+
gem.email = ["cpaesleme@dena.com"]
|
11
11
|
gem.description = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
|
12
12
|
gem.summary = %q{Adds hdfs read, write, and copy support to mobilize-ssh}
|
13
13
|
gem.homepage = "http://github.com/dena/mobilize-hdfs"
|
@@ -16,5 +16,5 @@ Gem::Specification.new do |gem|
|
|
16
16
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
|
-
gem.add_runtime_dependency "mobilize-ssh","1.
|
19
|
+
gem.add_runtime_dependency "mobilize-ssh","1.2"
|
20
20
|
end
|
data/test/hdfs_job_rows.yml
CHANGED
@@ -2,10 +2,9 @@
|
|
2
2
|
active: true
|
3
3
|
trigger: once
|
4
4
|
status: ""
|
5
|
-
stage1: hdfs.write target:"/user/mobilize/test/test_hdfs_1.out",
|
6
|
-
source:"
|
7
|
-
stage2: hdfs.
|
5
|
+
stage1: hdfs.write target:"/user/mobilize/test/test_hdfs_1.out",
|
6
|
+
source:"test_hdfs_1.in"
|
7
|
+
stage2: hdfs.write source:"/user/mobilize/test/test_hdfs_1.out",
|
8
8
|
target:"test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
|
9
|
-
stage3:
|
10
|
-
stage4: gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
|
9
|
+
stage3: gsheet.write source:"hdfs://test_cluster_2/user/mobilize/test/test_hdfs_1_copy.out",
|
11
10
|
target:"Runner_mobilize(test)/test_hdfs_1_copy.out"
|
data/test/mobilize-hdfs_test.rb
CHANGED
@@ -34,9 +34,9 @@ describe "Mobilize" do
|
|
34
34
|
hdfs_1_target_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
|
35
35
|
[hdfs_1_target_sheet].each {|s| s.delete if s}
|
36
36
|
|
37
|
-
puts "job row added, force enqueued requestor, wait
|
37
|
+
puts "job row added, force enqueued requestor, wait for stages"
|
38
38
|
r.enqueue!
|
39
|
-
|
39
|
+
wait_for_stages
|
40
40
|
|
41
41
|
puts "jobtracker posted data to test sheet"
|
42
42
|
test_destination_sheet = Mobilize::Gsheet.find_by_path("#{r.path.split("/")[0..-2].join("/")}/test_hdfs_1_copy.out",gdrive_slot)
|
@@ -44,4 +44,27 @@ describe "Mobilize" do
|
|
44
44
|
assert test_destination_sheet.read(u.name).length == 599
|
45
45
|
end
|
46
46
|
|
47
|
+
def wait_for_stages(time_limit=600,stage_limit=120,wait_length=10)
|
48
|
+
time = 0
|
49
|
+
time_since_stage = 0
|
50
|
+
#check for 10 min
|
51
|
+
while time < time_limit and time_since_stage < stage_limit
|
52
|
+
sleep wait_length
|
53
|
+
job_classes = Mobilize::Resque.jobs.map{|j| j['class']}
|
54
|
+
if job_classes.include?("Mobilize::Stage")
|
55
|
+
time_since_stage = 0
|
56
|
+
puts "saw stage at #{time.to_s} seconds"
|
57
|
+
else
|
58
|
+
time_since_stage += wait_length
|
59
|
+
puts "#{time_since_stage.to_s} seconds since stage seen"
|
60
|
+
end
|
61
|
+
time += wait_length
|
62
|
+
puts "total wait time #{time.to_s} seconds"
|
63
|
+
end
|
64
|
+
|
65
|
+
if time >= time_limit
|
66
|
+
raise "Timed out before stage completion"
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
47
70
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mobilize-hdfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: '1.2'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mobilize-ssh
|
@@ -18,7 +18,7 @@ dependencies:
|
|
18
18
|
requirements:
|
19
19
|
- - '='
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: 1.
|
21
|
+
version: '1.2'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
24
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -26,10 +26,10 @@ dependencies:
|
|
26
26
|
requirements:
|
27
27
|
- - '='
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: 1.
|
29
|
+
version: '1.2'
|
30
30
|
description: Adds hdfs read, write, and copy support to mobilize-ssh
|
31
31
|
email:
|
32
|
-
- cpaesleme@
|
32
|
+
- cpaesleme@dena.com
|
33
33
|
executables: []
|
34
34
|
extensions: []
|
35
35
|
extra_rdoc_files: []
|