rubydoop 1.0.0-java → 2.0.0.pre1-java
- checksums.yaml +7 -0
- data/lib/rubydoop.jar +0 -0
- data/lib/rubydoop.rb +5 -49
- data/lib/rubydoop/dsl.rb +152 -71
- data/lib/rubydoop/job_runner.rb +50 -0
- data/lib/rubydoop/package.rb +29 -69
- data/lib/rubydoop/version.rb +2 -2
- metadata +27 -14
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 0e1f088d9982e3f4cfa2083e3c9817e96a9f586f
+  data.tar.gz: b909d0ee95492428d0b2a29418fe84cb67fa8adb
+SHA512:
+  metadata.gz: 2f49a006f7d5ed891af12f3e1da488e375bc4730da15f8488e920a433fbf45ba9d4d3a3a318e8eacac3fc726b524402ce266abc970bd9b18e37474f0068986ba
+  data.tar.gz: 4e740a1ad1a7294f98e0710ec57160b3dce03c1d3b179805d6ce18c3a5040de82e4fa06b175f83a9531b2e98631752dea6717d4513a4804d48a331ebec500c16
data/lib/rubydoop.jar
CHANGED
Binary file
data/lib/rubydoop.rb
CHANGED
@@ -1,58 +1,14 @@
 # encoding: utf-8
 
-$LOAD_PATH << File.expand_path('..', __FILE__)
-
-
 require 'hadoop'
+require 'rubydoop.jar'
 
-
-#
-# {Package} for the packaging documentation, or the {file:README.md README}
+# See {Rubydoop.run} for the job configuration DSL documentation,
+# {Package} for the packaging documentation, or the {file:README.md README}
 # for a getting started guide.
 module Rubydoop
-
-  def self.create_mapper(conf)
-    create_instance(conf.get(MAPPER_KEY))
-  end
-
-  # @private
-  def self.create_reducer(conf)
-    create_instance(conf.get(REDUCER_KEY))
-  end
-
-  # @private
-  def self.create_combiner(conf)
-    create_instance(conf.get(COMBINER_KEY))
-  end
-
-  # @private
-  def self.create_partitioner(conf)
-    create_instance(conf.get(PARTITIONER_KEY))
-  end
-
-  # @private
-  def self.create_grouping_comparator(conf)
-    create_instance(conf.get(GROUPING_COMPARATOR_KEY))
-  end
-
-  # @private
-  def self.create_sort_comparator(conf)
-    create_instance(conf.get(SORT_COMPARATOR_KEY))
-  end
-
-  private
-
-  MAPPER_KEY = 'rubydoop.mapper'.freeze
-  REDUCER_KEY = 'rubydoop.reducer'.freeze
-  COMBINER_KEY = 'rubydoop.combiner'.freeze
-  PARTITIONER_KEY = 'rubydoop.partitioner'.freeze
-  GROUPING_COMPARATOR_KEY = 'rubydoop.grouping_comparator'.freeze
-  SORT_COMPARATOR_KEY = 'rubydoop.sort_comparator'.freeze
-
-  def self.create_instance(const_path)
-    cls = const_path.split('::').reduce(Object) { |host, name| host.const_get(name) }
-    cls.new
-  end
+  include_package 'rubydoop'
 end
 
 require 'rubydoop/dsl'
+require 'rubydoop/job_runner'
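In 2.0, lib/rubydoop.rb is a thin bootstrap: it loads the bundled Java extension and includes its `rubydoop` Java package into the Rubydoop module, so the proxy classes that dsl.rb references below resolve as Ruby constants. A small sketch of that lookup, using only class names that appear elsewhere in this diff:

    require 'rubydoop'

    # `include_package 'rubydoop'` makes JRuby resolve missing constants in the
    # module against the Java package bundled in rubydoop.jar:
    Rubydoop::MapperProxy                    # the Java shim passed to Job#setMapperClass
    Rubydoop::MapperProxy::RUBY_CLASS_KEY    # configuration key holding the Ruby mapper class name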
data/lib/rubydoop/dsl.rb
CHANGED
@@ -3,9 +3,9 @@
 module Rubydoop
   # Main entrypoint into the configuration DSL.
   #
-  # @example
+  # @example Running a job
   #
-  #     Rubydoop.
+  #     Rubydoop.run do |*args|
   #       job 'word_count' do
   #         input args[0]
   #         output args[1]
@@ -18,67 +18,69 @@ module Rubydoop
   #       end
   #     end
   #
-  # Within a
-  #
-  #
-  #
-  #
-  #
+  # Within a run block you can specify one or more jobs, the `job` blocks
+  # are run in the context of a {JobDefinition} instance, so look at that
+  # class for documentation about the available properties. The `run` block
+  # is run within the context of a {ConfigurationDefinition} instance. The
+  # arguments to the `run` block is the command line arguments, minus those
+  # handled by Hadoop's `ToolRunner`.
   #
   # @yieldparam [Array<String>] *arguments The command line arguments
   #
-
-
-
-  # block is not run (this is a feature, it means that the configuration
-  # block doesn't run in mappers and reducers).
-  #
-  def self.configure(impl=ConfigurationDefinition, &block)
-    impl.new($rubydoop_context, &block) if $rubydoop_context
+  def self.run(args=ARGV, &block)
+    return if $rubydoop_embedded
+    JobRunner.run(args, &block)
   end
 
-  #
-
-
+  # @ see {Rubydoop.run}
+  def self.configure(&block)
+    run(&block)
+  end
+
+  # Configuration DSL.
   #
-  #
-  #
-  #     # same DSL as shown in the documentation for Rubydoop.configure
-  #   end
+  # `Rubydoop.run` blocks are run within the context of an instance of this
+  # class. These are the methods available in those blocks.
   #
   class ConfigurationDefinition
-
+    # @private
+    def initialize(context)
       @context = context
-      instance_exec(*arguments, &block) if @context && block_given?
-    end
-
-    def arguments
-      @context.arguments
     end
 
     def job(name, &block)
-
-      job = JobDefinition.new(@context, @context.create_job(name))
+      job = JobDefinition.new(@context.create_job(name))
       job.instance_exec(&block)
       job
     end
+
+    def parallel(&block)
+      @context.parallel(&block)
+    end
+
+    def sequence(&block)
+      @context.sequence(&block)
+    end
+
+    def wait_for_completion(verbose)
+      @context.wait_for_completion(verbose)
+    end
   end
 
   # Job configuration DSL.
   #
-  # `
-  #
+  # `job` blocks are run within the context of an instance of this
+  # class. These are the methods available in those blocks.
   #
   class JobDefinition
     # @private
-    def initialize(
-      @context = context
+    def initialize(job)
       @job = job
     end
 
     # Sets the input paths of the job.
     #
-    # Calls `setInputFormatClass` on the Hadoop job and uses the static
+    # Calls `setInputFormatClass` on the Hadoop job and uses the static
     # `setInputPaths` on the input format to set the job's input path.
     #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setInputFormatClass(java.lang.Class) Hadoop's Job#setInputFormatClass
@@ -89,12 +91,20 @@ module Rubydoop
     # @option options [JavaClass] :format The input format to use, defaults to `TextInputFormat`
     def input(paths, options={})
       paths = paths.join(',') if paths.is_a?(Enumerable)
-      format = options
+      format = options.fetch(:format, :text)
+      unless format.is_a?(Class)
+        class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "InputFormat"
+        format = Hadoop::Mapreduce::Lib::Input.const_get(class_name)
+      end
+      unless format <= Hadoop::Mapreduce::InputFormat
+        @job.configuration.set(Rubydoop::InputFormatProxy::RUBY_CLASS_KEY, format.name)
+        format = Rubydoop::InputFormatProxy
+      end
       format.set_input_paths(@job, paths)
       @job.set_input_format_class(format)
     end
 
-    # Sets the output path of the job.
+    # Sets or gets the output path of the job.
     #
     # Calls `setOutputFormatClass` on the Hadoop job and uses the static
     # `setOutputPath` on the output format to set the job's output path.
@@ -104,15 +114,35 @@ module Rubydoop
     # @param [String] dir The output path
     # @param [Hash] options
     # @option options [JavaClass] :format The output format to use, defaults to `TextOutputFormat`
-    def output(dir, options={})
-
-
-
+    def output(dir=nil, options={})
+      if dir
+        if dir.is_a?(Hash)
+          options = dir
+          if options[:intermediate]
+            dir = @job.job_name
+          else
+            raise ArgumentError, sprintf('neither dir nor intermediate: true was specified')
+          end
+        end
+        dir = sprintf('%s-%010d-%05d', dir, Time.now, rand(1e5)) if options[:intermediate]
+        @output_dir = dir
+        format = options.fetch(:format, :text)
+        unless format.is_a?(Class)
+          class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "OutputFormat"
+          format = Hadoop::Mapreduce::Lib::Output.const_get(class_name)
+        end
+        format.set_output_path(@job, Hadoop::Fs::Path.new(@output_dir))
+        @job.set_output_format_class(format)
+        if options[:lazy]
+          Hadoop::Mapreduce::Lib::Output::LazyOutputFormat.set_output_format_class(@job, format)
+        end
+      end
+      @output_dir
     end
 
     # Sets a job property.
     #
-    # Calls `set`/`setBoolean`/`setLong`/`setFloat` on the Hadoop Job's
+    # Calls `set`/`setBoolean`/`setLong`/`setFloat` on the Hadoop Job's
     # configuration (exact method depends on the type of the value).
     #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/conf/Configuration.html#set(java.lang.String,%20java.lang.String) Hadoop's Configuration#set
@@ -144,7 +174,7 @@ module Rubydoop
     # The class only needs to implement the method `map`, which will be called
     # exactly like a Java mapper class' `map` method would be called.
     #
-    # You can optionally implement `setup` and `cleanup`, which mirrors the
+    # You can optionally implement `setup` and `cleanup`, which mirrors the
     # methods of the same name in Java mappers.
     #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Mapper.html Hadoop's Mapper
@@ -153,8 +183,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) mapper class.
     def mapper(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_mapper_class(
+        @job.configuration.set(Rubydoop::MapperProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_mapper_class(Rubydoop::MapperProxy)
         @mapper = cls
       end
       @mapper
@@ -170,7 +200,7 @@ module Rubydoop
     # The class only needs to implement the method `reduce`, which will be called
     # exactly like a Java reducer class' `reduce` method would be called.
     #
-    # You can optionally implement `setup` and `cleanup`, which mirrors the
+    # You can optionally implement `setup` and `cleanup`, which mirrors the
     # methods of the same name in Java reducers.
     #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Reducer.html Hadoop's Reducer
@@ -179,8 +209,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) reducer class.
    def reducer(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_reducer_class(
+        @job.configuration.set(Rubydoop::ReducerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_reducer_class(Rubydoop::ReducerProxy)
         @reducer = cls
       end
       @reducer
@@ -200,8 +230,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) combiner class.
     def combiner(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_combiner_class(
+        @job.configuration.set(Rubydoop::CombinerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_combiner_class(Rubydoop::CombinerProxy)
         @combiner = cls
       end
       @combiner
@@ -222,8 +252,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) partitioner class.
     def partitioner(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_partitioner_class(
+        @job.configuration.set(Rubydoop::PartitionerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_partitioner_class(Rubydoop::PartitionerProxy)
         @partitioner = cls
       end
       @partitioner
@@ -232,7 +262,7 @@ module Rubydoop
 
     # Sets a custom grouping comparator.
     #
-    # The equivalent of calling `setGroupingComparatorClass` on a Hadoop job,
+    # The equivalent of calling `setGroupingComparatorClass` on a Hadoop job,
     # but instead of a Java class you pass a Ruby class and Rubydoop will wrap
     # it in a way that works with Hadoop.
     #
@@ -241,8 +271,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) comparator class.
     def grouping_comparator(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_grouping_comparator_class(
+        @job.configuration.set(Rubydoop::GroupingComparatorProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_grouping_comparator_class(Rubydoop::GroupingComparatorProxy)
         @grouping_comparator = cls
       end
       @grouping_comparator
@@ -251,7 +281,7 @@ module Rubydoop
 
     # Sets a custom sort comparator.
     #
-    # The equivalent of calling `setSortComparatorClass` on a Hadoop job,
+    # The equivalent of calling `setSortComparatorClass` on a Hadoop job,
     # but instead of a Java class you pass a Ruby class and Rubydoop will wrap
     # it in a way that works with Hadoop.
     #
@@ -260,8 +290,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) comparator class.
     def sort_comparator(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_sort_comparator_class(
+        @job.configuration.set(Rubydoop::SortComparatorProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_sort_comparator_class(Rubydoop::SortComparatorProxy)
         @sort_comparator = cls
       end
       @sort_comparator
@@ -284,13 +314,13 @@ module Rubydoop
     def self.class_setter(dsl_name)
       define_method(dsl_name) do |cls|
         if cls
-          @job.send("set_#{dsl_name}_class", cls.
+          @job.send("set_#{dsl_name}_class", cls.to_java(Java::JavaLang::Class))
           instance_variable_set(:"@#{dsl_name}", cls)
         end
         instance_variable_get(:"@#{dsl_name}")
       end
       define_method("#{dsl_name}=") do |cls|
-        @job.send("set_#{dsl_name}_class", cls.
+        @job.send("set_#{dsl_name}_class", cls.to_java(Java::JavaLang::Class))
       end
     end
 
@@ -317,7 +347,7 @@ module Rubydoop
     # @!method output_key(cls)
     #
     # Sets the reducer's output key type.
-    #
+    #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setOutputKeyClass(java.lang.Class) Hadoop's Job#setOutputKeyClass
     #
     # @param [Class] cls The reducer's output key type
@@ -335,23 +365,74 @@ module Rubydoop
 
   # @private
   class Context
-
-
-    def initialize(conf, proxy_classes, arguments)
+    def initialize(conf)
       @conf = conf
-      @
-      @arguments = arguments
-      @jobs = []
+      @job_stack = [Jobs::Sequence.new]
     end
 
     def create_job(name)
       hadoop_job = Hadoop::Mapreduce::Job.new(@conf, name)
-      @
+      @job_stack.last.add(hadoop_job)
       hadoop_job
     end
 
-    def
-      @
+    def wait_for_completion(verbose)
+      @job_stack.first.wait_for_completion(verbose)
+    end
+
+    def parallel
+      push(Jobs::Parallel.new)
+      if block_given?
+        yield
+        pop
+      end
+    end
+
+    def sequence
+      push(Jobs::Sequence.new)
+      if block_given?
+        yield
+        pop
+      end
+    end
+
+    def push(job_list)
+      @job_stack.last.add(job_list)
+      @job_stack.push(job_list)
+    end
+
+    def pop
+      @job_stack.pop
+    end
+
+    class Jobs
+      attr_reader :jobs
+
+      def initialize
+        @jobs = []
+      end
+
+      def add(job)
+        @jobs.push(job)
+      end
+
+      class Sequence < Jobs
+        def wait_for_completion(verbose)
+          @jobs.all? do |job|
+            job.wait_for_completion(verbose)
+          end
+        end
+      end
+
+      class Parallel < Jobs
        def wait_for_completion(verbose)
+          @jobs.map do |job|
+            Thread.new do
+              job.wait_for_completion(verbose)
+            end
+          end.map!(&:value).all?
+        end
+      end
     end
   end
 end
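Taken together, these DSL changes mean a job setup script now calls `Rubydoop.run`, can group jobs with `sequence` and `parallel`, and can pass `:format`, `:lazy` and `:intermediate` options to `input`/`output`. A minimal sketch of such a script; the `WordCount::Mapper` and `WordCount::Reducer` class names (and the `word_count` require) are placeholders, not part of this diff:

    require 'rubydoop'
    require 'word_count'  # hypothetical file defining WordCount::Mapper and WordCount::Reducer

    Rubydoop.run do |input_path, output_path|
      sequence do
        job 'word count' do
          input input_path, format: :text   # :text resolves to TextInputFormat
          output output_path, lazy: true    # wraps TextOutputFormat in LazyOutputFormat
          mapper WordCount::Mapper          # Ruby class, recorded under MapperProxy::RUBY_CLASS_KEY
          reducer WordCount::Reducer
        end
      end
    end

Jobs declared inside a `parallel` block are waited on in separate threads, while a `sequence` block runs them one after another and stops at the first failure (see Context::Jobs above).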
data/lib/rubydoop/job_runner.rb
ADDED
@@ -0,0 +1,50 @@
+# encoding: utf-8
+
+
+module Rubydoop
+  # @private
+  class JobRunner < Java::OrgApacheHadoopConf::Configured
+    include Java::OrgApacheHadoopUtil::Tool
+
+    def initialize(setup_script=$0, &block)
+      @setup_script = setup_script
+      @block = block
+    end
+
+    def run(args)
+      conf = Java::OrgApacheHadoopMapred::JobConf.new(get_conf)
+      conf.set(Java::Rubydoop::InstanceContainer::JOB_SETUP_SCRIPT_KEY, File.basename(@setup_script))
+      conf.jar = containing_jar
+      context = Context.new(conf)
+      configuration_definition = ConfigurationDefinition.new(context)
+      begin
+        configuration_definition.instance_exec(*args, &@block)
+      rescue => e
+        raise JobRunnerError, sprintf('Could not load job setup script (%s): %s', @setup_script.inspect, e.message.inspect), e.backtrace
+      end
+      configuration_definition.wait_for_completion(true) ? 0 : 1
+    end
+
+    def self.run(args, &block)
+      Java::JavaLang::System.exit(Java::OrgApacheHadoopUtil::ToolRunner.run(new(&block), args.to_java(:string)))
+    end
+
+    private
+
+    def containing_jar
+      @containing_jar ||= begin
+        relative_setup_script = @setup_script[/(?<=#{PUCK_ROOT}).+\Z/]
+        class_loader = JRuby.runtime.jruby_class_loader
+        if (url = class_loader.get_resources(relative_setup_script).find { |url| url.protocol == 'jar' })
+          path = url.path
+          path.slice!(/\Afile:/)
+          path = Java::JavaNet::URLDecoder.decode(path, 'UTF-8')
+          path.slice!(/!.*\Z/)
+          path
+        end
+      end
+    end
+  end
+
+  JobRunnerError = Class.new(StandardError)
+end
data/lib/rubydoop/package.rb
CHANGED
@@ -1,11 +1,7 @@
 # encoding: utf-8
 
 require 'bundler'
-require '
-require 'ant'
-require 'fileutils'
-require 'set'
-
+require 'puck'
 
 module Rubydoop
   # Utility for making a job JAR that works with Hadoop.
@@ -25,15 +21,14 @@ module Rubydoop
     # @option options [String] :project_base_dir The project's base dir, defaults to the current directory (the assumption is that Package will be used from a Rake task)
     # @option options [String] :project_name The name of the JAR file (minus .jar), defaults to the directory name of the `:project_base_dir`
     # @option options [String] :build_dir The directory to put the final JAR into, defaults to `:project_base_dir + '/build'`
+    # @option options [String] :jruby_jar_path The path to a local copy of `jruby-complete.jar`, unless specified you need to have `jruby-jars` in your `Gemfile`
     # @option options [Array<String>] :gem_groups All gems from these Gemfile groups will be included, defaults to `[:default]` (the top-level group of a Gemfile)
     # @option options [Array<String>] :lib_jars Paths to extra JAR files to include in the JAR's lib directory (where they will be on the classpath when the job is run)
-    # @option options [String] :jruby_version The JRuby version to package, defaults to `JRUBY_VERSION`
-    # @option options [String] :jruby_jar_path The path to a local copy of `jruby-complete.jar`, defaults to downloading and caching a version defined by `:jruby_version`
     def initialize(options={})
       @options = default_options.merge(options)
-      @options[:project_name]
-      @options[:build_dir]
-      @options[:
+      @options[:project_name] ||= File.basename(@options[:project_base_dir])
+      @options[:build_dir] ||= File.join(@options[:project_base_dir], 'build')
+      @options[:jar_path] ||= "#{@options[:project_name]}.jar"
     end
 
     # Create the JAR package, see {Package#initialize} for configuration options.
@@ -42,9 +37,15 @@ module Rubydoop
     # (`jruby-complete.jar`) and locally cached, but if you already have a
     # copy in a local Ivy or Maven repository that will be used instead.
     def create!
-
-
-
+      Puck::Jar.new(
+        app_dir: @options[:project_base_dir],
+        app_name: @options[:project_name],
+        build_dir: @options[:build_dir],
+        jar_name: @options[:jar_path],
+        gem_groups: @options[:gem_groups],
+        extra_files: lib_jars,
+        jruby_complete: @options[:jruby_jar_path]
+      ).create
     end
 
     # A shortcut for `Package.new(options).create!`.
@@ -52,11 +53,25 @@ module Rubydoop
       new(options).create!
     end
 
+    def respond_to?(name)
+      @options.key?(name) or super
+    end
+
+    def method_missing(name, *args)
+      @options[name] or super
+    end
+
+    def lib_jars
+      extra_files = { File.join(rubydoop_base_dir, 'lib/rubydoop.jar') => 'lib/rubydoop.jar' }
+      @options[:lib_jars].each_with_object(extra_files) do |jar, extra_files|
+        extra_files[jar] = File.join('lib', File.basename(jar))
+      end
+    end
+
     private
 
     def default_options
       defaults = {
-        :main_class => 'rubydoop.RubydoopJobRunner',
         :rubydoop_base_dir => File.expand_path('../../..', __FILE__),
         :project_base_dir => Dir.getwd,
         :gem_groups => [:default],
@@ -64,60 +79,5 @@ module Rubydoop
         :jruby_version => JRUBY_VERSION
       }
     end
-
-    def create_directories!
-      FileUtils.mkdir_p(@options[:build_dir])
-    end
-
-    def fetch_jruby!
-      return if File.exists?(@options[:jruby_jar_path])
-
-      local_maven_path = File.expand_path("~/.m2/repository/org/jruby/jruby-complete/#{@options[:jruby_version]}/jruby-complete-#{@options[:jruby_version]}.jar")
-      local_ivy_path = File.expand_path("~/.ivy2/cache/org.jruby/jruby-complete/jars/jruby-complete-#{@options[:jruby_version]}.jar")
-      remote_maven_url = "http://central.maven.org/maven2/org/jruby/jruby-complete/#{@options[:jruby_version]}/jruby-complete-#{@options[:jruby_version]}.jar"
-
-      if File.exists?(local_maven_path)
-        $stderr.puts("Using #{File.basename(local_maven_path)} from local Maven cache")
-        @options[:jruby_jar_path] = local_maven_path
-      elsif File.exists?(local_ivy_path)
-        $stderr.puts("Using #{File.basename(local_maven_path)} from local Ivy2 cache")
-        @options[:jruby_jar_path] = local_ivy_path
-      else
-        $stderr.puts("Downloading #{remote_maven_url} to #{@options[:jruby_jar_path]}")
-        jruby_complete_bytes = open(remote_maven_url).read
-        File.open(@options[:jruby_jar_path], 'wb') do |io|
-          io.write(jruby_complete_bytes)
-        end
-      end
-    end
-
-    def build_jar!
-      # the ant block is instance_exec'ed so instance variables and methods are not in scope
-      options = @options
-      bundled_gems = load_path
-      lib_jars = [options[:jruby_jar_path], *options[:lib_jars]]
-      ant do
-        jar :destfile => "#{options[:build_dir]}/#{options[:project_name]}.jar" do
-          manifest { attribute :name => 'Main-Class', :value => options[:main_class] }
-          zipfileset :src => "#{options[:rubydoop_base_dir]}/lib/rubydoop.jar"
-          fileset :dir => "#{options[:rubydoop_base_dir]}/lib", :includes => '**/*.rb', :excludes => '*.jar'
-          fileset :dir => "#{options[:project_base_dir]}/lib"
-          bundled_gems.each { |path| fileset :dir => path }
-          lib_jars.each { |extra_jar| zipfileset :dir => File.dirname(extra_jar), :includes => File.basename(extra_jar), :prefix => 'lib' }
-        end
-      end
-    end
-
-    def load_path
-      Bundler.definition.specs_for(@options[:gem_groups]).flat_map do |spec|
-        if spec.full_name !~ /^(?:bundler|rubydoop)-\d+/
-          spec.require_paths.map do |rp|
-            "#{spec.full_gem_path}/#{rp}"
-          end
-        else
-          []
-        end
-      end
-    end
   end
 end
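With packaging delegated to Puck, a project's Rake task only needs to hand the options documented above to Package. A minimal Rakefile sketch; the task name and the extra jar path are illustrative, not taken from this diff:

    # Rakefile
    require 'rubydoop/package'

    task :package do
      Rubydoop::Package.create!(
        project_name: 'word_count',            # defaults to the base name of :project_base_dir
        lib_jars: ['jars/extra-library.jar']   # copied into lib/ inside the jar, next to rubydoop.jar
      )
    end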
data/lib/rubydoop/version.rb
CHANGED
metadata
CHANGED
@@ -1,16 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: rubydoop
 version: !ruby/object:Gem::Version
-
-  version: 1.0.0
+  version: 2.0.0.pre1
 platform: java
 authors:
 - Theo Hultberg
 autorequire:
 bindir: bin
 cert_chain: []
-date:
-dependencies:
+date: 2016-01-28 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.2'
+  name: puck
+  prerelease: false
+  type: :runtime
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.2'
 description: Rubydoop embeds a JRuby runtime in Hadoop, letting you write map reduce code in Ruby without using the streaming APIs
 email:
 - theo@iconara.net
@@ -19,35 +32,35 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/hadoop.rb
+- lib/rubydoop.jar
 - lib/rubydoop.rb
 - lib/rubydoop/dsl.rb
+- lib/rubydoop/job_runner.rb
 - lib/rubydoop/package.rb
 - lib/rubydoop/version.rb
-- lib/rubydoop.jar
 homepage: http://github.com/iconara/rubydoop
-licenses:
+licenses:
+- Apache License 2.0
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>='
     - !ruby/object:Gem::Version
      version: '0'
-  none: false
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>'
     - !ruby/object:Gem::Version
-      version:
-  none: false
+      version: 1.3.1
 requirements: []
 rubyforge_project: rubydoop
-rubygems_version:
+rubygems_version: 2.4.5
 signing_key:
-specification_version:
+specification_version: 4
 summary: Write Hadoop jobs in Ruby
 test_files: []
 has_rdoc:
-...