rubydoop 1.0.0-java → 2.0.0.pre1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/rubydoop.jar +0 -0
- data/lib/rubydoop.rb +5 -49
- data/lib/rubydoop/dsl.rb +152 -71
- data/lib/rubydoop/job_runner.rb +50 -0
- data/lib/rubydoop/package.rb +29 -69
- data/lib/rubydoop/version.rb +2 -2
- metadata +27 -14
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 0e1f088d9982e3f4cfa2083e3c9817e96a9f586f
+  data.tar.gz: b909d0ee95492428d0b2a29418fe84cb67fa8adb
+SHA512:
+  metadata.gz: 2f49a006f7d5ed891af12f3e1da488e375bc4730da15f8488e920a433fbf45ba9d4d3a3a318e8eacac3fc726b524402ce266abc970bd9b18e37474f0068986ba
+  data.tar.gz: 4e740a1ad1a7294f98e0710ec57160b3dce03c1d3b179805d6ce18c3a5040de82e4fa06b175f83a9531b2e98631752dea6717d4513a4804d48a331ebec500c16
data/lib/rubydoop.jar
CHANGED
Binary file
data/lib/rubydoop.rb
CHANGED
@@ -1,58 +1,14 @@
 # encoding: utf-8
 
-$LOAD_PATH << File.expand_path('..', __FILE__)
-
-
 require 'hadoop'
+require 'rubydoop.jar'
 
-
-#
-# {Package} for the packaging documentation, or the {file:README.md README}
+# See {Rubydoop.run} for the job configuration DSL documentation,
+# {Package} for the packaging documentation, or the {file:README.md README}
 # for a getting started guide.
 module Rubydoop
-
-  def self.create_mapper(conf)
-    create_instance(conf.get(MAPPER_KEY))
-  end
-
-  # @private
-  def self.create_reducer(conf)
-    create_instance(conf.get(REDUCER_KEY))
-  end
-
-  # @private
-  def self.create_combiner(conf)
-    create_instance(conf.get(COMBINER_KEY))
-  end
-
-  # @private
-  def self.create_partitioner(conf)
-    create_instance(conf.get(PARTITIONER_KEY))
-  end
-
-  # @private
-  def self.create_grouping_comparator(conf)
-    create_instance(conf.get(GROUPING_COMPARATOR_KEY))
-  end
-
-  # @private
-  def self.create_sort_comparator(conf)
-    create_instance(conf.get(SORT_COMPARATOR_KEY))
-  end
-
-  private
-
-  MAPPER_KEY = 'rubydoop.mapper'.freeze
-  REDUCER_KEY = 'rubydoop.reducer'.freeze
-  COMBINER_KEY = 'rubydoop.combiner'.freeze
-  PARTITIONER_KEY = 'rubydoop.partitioner'.freeze
-  GROUPING_COMPARATOR_KEY = 'rubydoop.grouping_comparator'.freeze
-  SORT_COMPARATOR_KEY = 'rubydoop.sort_comparator'.freeze
-
-  def self.create_instance(const_path)
-    cls = const_path.split('::').reduce(Object) { |host, name| host.const_get(name) }
-    cls.new
-  end
+  include_package 'rubydoop'
 end
 
 require 'rubydoop/dsl'
+require 'rubydoop/job_runner'
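Note: the 1.x proxy-lookup plumbing above moves into the rubydoop.jar Java extension, and `Rubydoop.configure` becomes an alias for the new `Rubydoop.run` (see dsl.rb below). A minimal 2.0 job setup script might look like this sketch, assuming a hypothetical `word_count.rb` that defines the mapper and reducer classes:

    require 'rubydoop'
    require 'word_count' # hypothetical file defining WordCountMapper/WordCountReducer

    Rubydoop.run do |input_path, output_path|
      job 'word_count' do
        input input_path
        output output_path
        mapper WordCountMapper
        reducer WordCountReducer
      end
    end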
data/lib/rubydoop/dsl.rb
CHANGED
@@ -3,9 +3,9 @@
 module Rubydoop
   # Main entrypoint into the configuration DSL.
   #
-  # @example
+  # @example Running a job
   #
-  #   Rubydoop.
+  #   Rubydoop.run do |*args|
   #     job 'word_count' do
   #       input args[0]
   #       output args[1]
@@ -18,67 +18,69 @@ module Rubydoop
   #     end
   #   end
   #
-  # Within a
-  #
-  #
-  #
-  #
-  #
+  # Within a run block you can specify one or more jobs, the `job` blocks
+  # are run in the context of a {JobDefinition} instance, so look at that
+  # class for documentation about the available properties. The `run` block
+  # is run within the context of a {ConfigurationDefinition} instance. The
+  # arguments to the `run` block is the command line arguments, minus those
+  # handled by Hadoop's `ToolRunner`.
   #
   # @yieldparam [Array<String>] *arguments The command line arguments
   #
-
-
-
-  # block is not run (this is a feature, it means that the configuration
-  # block doesn't run in mappers and reducers).
-  #
-  def self.configure(impl=ConfigurationDefinition, &block)
-    impl.new($rubydoop_context, &block) if $rubydoop_context
+  def self.run(args=ARGV, &block)
+    return if $rubydoop_embedded
+    JobRunner.run(args, &block)
   end
 
-  #
-
-
+  # @see {Rubydoop.run}
+  def self.configure(&block)
+    run(&block)
+  end
+
+  # Configuration DSL.
   #
-  #
-  #
-  #   # same DSL as shown in the documentation for Rubydoop.configure
-  # end
+  # `Rubydoop.run` blocks are run within the context of an instance of this
+  # class. These are the methods available in those blocks.
   #
   class ConfigurationDefinition
-
+    # @private
+    def initialize(context)
       @context = context
-      instance_exec(*arguments, &block) if @context && block_given?
-    end
-
-    def arguments
-      @context.arguments
     end
 
     def job(name, &block)
-
-      job = JobDefinition.new(@context, @context.create_job(name))
+      job = JobDefinition.new(@context.create_job(name))
       job.instance_exec(&block)
       job
     end
+
+    def parallel(&block)
+      @context.parallel(&block)
+    end
+
+    def sequence(&block)
+      @context.sequence(&block)
+    end
+
+    def wait_for_completion(verbose)
+      @context.wait_for_completion(verbose)
+    end
   end
 
   # Job configuration DSL.
   #
-  # `
-  #
+  # `job` blocks are run within the context of an instance of this
+  # class. These are the methods available in those blocks.
   #
   class JobDefinition
     # @private
-    def initialize(
-      @context = context
+    def initialize(job)
       @job = job
     end
 
     # Sets the input paths of the job.
     #
-    # Calls `setInputFormatClass` on the Hadoop job and uses the static
+    # Calls `setInputFormatClass` on the Hadoop job and uses the static
     # `setInputPaths` on the input format to set the job's input path.
     #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setInputFormatClass(java.lang.Class) Hadoop's Job#setInputFormatClass
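Note: `parallel` and `sequence` are new in 2.0 — jobs declared at the top level run one after another, while jobs inside a `parallel` block run concurrently (see the Context implementation later in this file). A sketch with hypothetical job names:

    Rubydoop.run do |input, output|
      parallel do
        job 'word frequencies' do
          # ...
        end
        sequence do
          job 'extract links' do
            # ...
          end
          job 'rank pages' do
            # ...
          end
        end
      end
    end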
@@ -89,12 +91,20 @@ module Rubydoop
     # @option options [JavaClass] :format The input format to use, defaults to `TextInputFormat`
     def input(paths, options={})
       paths = paths.join(',') if paths.is_a?(Enumerable)
-      format = options
+      format = options.fetch(:format, :text)
+      unless format.is_a?(Class)
+        class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "InputFormat"
+        format = Hadoop::Mapreduce::Lib::Input.const_get(class_name)
+      end
+      unless format <= Hadoop::Mapreduce::InputFormat
+        @job.configuration.set(Rubydoop::InputFormatProxy::RUBY_CLASS_KEY, format.name)
+        format = Rubydoop::InputFormatProxy
+      end
       format.set_input_paths(@job, paths)
       @job.set_input_format_class(format)
     end
 
-    # Sets the output path of the job.
+    # Sets or gets the output path of the job.
     #
     # Calls `setOutputFormatClass` on the Hadoop job and uses the static
     # `setOutputPath` on the output format to set the job's output path.
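Note: with the change above, `:format` now accepts a symbol that is camel-cased and looked up under `Hadoop::Mapreduce::Lib::Input` (`:text` becomes `TextInputFormat`, `:sequence_file` becomes `SequenceFileInputFormat`), and a Ruby class that is not a Hadoop `InputFormat` gets wrapped in `InputFormatProxy`. A sketch with a hypothetical path:

    job 'word_count' do
      input 'pages/part-*', format: :sequence_file
    end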
@@ -104,15 +114,35 @@ module Rubydoop
     # @param [String] dir The output path
     # @param [Hash] options
     # @option options [JavaClass] :format The output format to use, defaults to `TextOutputFormat`
-    def output(dir, options={})
-
-
-
+    def output(dir=nil, options={})
+      if dir
+        if dir.is_a?(Hash)
+          options = dir
+          if options[:intermediate]
+            dir = @job.job_name
+          else
+            raise ArgumentError, sprintf('neither dir nor intermediate: true was specified')
+          end
+        end
+        dir = sprintf('%s-%010d-%05d', dir, Time.now, rand(1e5)) if options[:intermediate]
+        @output_dir = dir
+        format = options.fetch(:format, :text)
+        unless format.is_a?(Class)
+          class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "OutputFormat"
+          format = Hadoop::Mapreduce::Lib::Output.const_get(class_name)
+        end
+        format.set_output_path(@job, Hadoop::Fs::Path.new(@output_dir))
+        @job.set_output_format_class(format)
+        if options[:lazy]
+          Hadoop::Mapreduce::Lib::Output::LazyOutputFormat.set_output_format_class(@job, format)
+        end
+      end
+      @output_dir
     end
 
     # Sets a job property.
     #
-    # Calls `set`/`setBoolean`/`setLong`/`setFloat` on the Hadoop Job's
+    # Calls `set`/`setBoolean`/`setLong`/`setFloat` on the Hadoop Job's
     # configuration (exact method depends on the type of the value).
     #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/conf/Configuration.html#set(java.lang.String,%20java.lang.String) Hadoop's Configuration#set
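Note: `output` now doubles as a getter (it returns `@output_dir`), understands the same symbol-to-class format lookup as `input`, and adds `:intermediate` (derive a generated directory name from the job name) and `:lazy` (wrap the format in `LazyOutputFormat`). A sketch with a hypothetical path:

    job 'tokenize' do
      output 'tokens', format: :sequence_file, lazy: true
      output # => 'tokens', the method now also reads back the path
    end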
@@ -144,7 +174,7 @@ module Rubydoop
     # The class only needs to implement the method `map`, which will be called
     # exactly like a Java mapper class' `map` method would be called.
     #
-    # You can optionally implement `setup` and `cleanup`, which mirrors the
+    # You can optionally implement `setup` and `cleanup`, which mirrors the
     # methods of the same name in Java mappers.
     #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Mapper.html Hadoop's Mapper
@@ -153,8 +183,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) mapper class.
     def mapper(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_mapper_class(
+        @job.configuration.set(Rubydoop::MapperProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_mapper_class(Rubydoop::MapperProxy)
         @mapper = cls
       end
       @mapper
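Note: mappers are plain Ruby classes — `MapperProxy` instantiates the class named in the configuration and forwards Hadoop's calls to it. A minimal word-count mapper as described in the docs above (class name hypothetical):

    class WordCountMapper
      def map(key, value, context)
        value.to_s.split.each do |word|
          context.write(Hadoop::Io::Text.new(word), Hadoop::Io::IntWritable.new(1))
        end
      end
    end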
@@ -170,7 +200,7 @@ module Rubydoop
     # The class only needs to implement the method `reduce`, which will be called
     # exactly like a Java reducer class' `reduce` method would be called.
     #
-    # You can optionally implement `setup` and `cleanup`, which mirrors the
+    # You can optionally implement `setup` and `cleanup`, which mirrors the
     # methods of the same name in Java reducers.
     #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Reducer.html Hadoop's Reducer
@@ -179,8 +209,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) reducer class.
     def reducer(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_reducer_class(
+        @job.configuration.set(Rubydoop::ReducerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_reducer_class(Rubydoop::ReducerProxy)
         @reducer = cls
       end
       @reducer
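Note: the matching reducer half of the word-count sketch above — `values` is iterated just as in a Java reducer (class name hypothetical):

    class WordCountReducer
      def reduce(key, values, context)
        sum = 0
        values.each { |value| sum += value.get }
        context.write(key, Hadoop::Io::IntWritable.new(sum))
      end
    end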
@@ -200,8 +230,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) combiner class.
     def combiner(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_combiner_class(
+        @job.configuration.set(Rubydoop::CombinerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_combiner_class(Rubydoop::CombinerProxy)
         @combiner = cls
       end
       @combiner
@@ -222,8 +252,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) partitioner class.
     def partitioner(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_partitioner_class(
+        @job.configuration.set(Rubydoop::PartitionerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_partitioner_class(Rubydoop::PartitionerProxy)
         @partitioner = cls
       end
       @partitioner
@@ -232,7 +262,7 @@ module Rubydoop
 
     # Sets a custom grouping comparator.
     #
-    # The equivalent of calling `setGroupingComparatorClass` on a Hadoop job,
+    # The equivalent of calling `setGroupingComparatorClass` on a Hadoop job,
     # but instead of a Java class you pass a Ruby class and Rubydoop will wrap
     # it in a way that works with Hadoop.
     #
@@ -241,8 +271,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) comparator class.
     def grouping_comparator(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_grouping_comparator_class(
+        @job.configuration.set(Rubydoop::GroupingComparatorProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_grouping_comparator_class(Rubydoop::GroupingComparatorProxy)
         @grouping_comparator = cls
       end
       @grouping_comparator
@@ -251,7 +281,7 @@ module Rubydoop
 
     # Sets a custom sort comparator.
     #
-    # The equivalent of calling `setSortComparatorClass` on a Hadoop job,
+    # The equivalent of calling `setSortComparatorClass` on a Hadoop job,
     # but instead of a Java class you pass a Ruby class and Rubydoop will wrap
     # it in a way that works with Hadoop.
     #
@@ -260,8 +290,8 @@ module Rubydoop
     # @param [Class] cls The (Ruby) comparator class.
     def sort_comparator(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_sort_comparator_class(
+        @job.configuration.set(Rubydoop::SortComparatorProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_sort_comparator_class(Rubydoop::SortComparatorProxy)
         @sort_comparator = cls
       end
       @sort_comparator
@@ -284,13 +314,13 @@ module Rubydoop
     def self.class_setter(dsl_name)
       define_method(dsl_name) do |cls|
         if cls
-          @job.send("set_#{dsl_name}_class", cls.
+          @job.send("set_#{dsl_name}_class", cls.to_java(Java::JavaLang::Class))
           instance_variable_set(:"@#{dsl_name}", cls)
         end
         instance_variable_get(:"@#{dsl_name}")
       end
       define_method("#{dsl_name}=") do |cls|
-        @job.send("set_#{dsl_name}_class", cls.
+        @job.send("set_#{dsl_name}_class", cls.to_java(Java::JavaLang::Class))
       end
     end
 
@@ -317,7 +347,7 @@ module Rubydoop
     # @!method output_key(cls)
     #
     # Sets the reducer's output key type.
-    #
+    #
     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setOutputKeyClass(java.lang.Class) Hadoop's Job#setOutputKeyClass
     #
     # @param [Class] cls The reducer's output key type
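Note: the `@!method` tags document setters generated by `class_setter` above; they take Java classes directly. Assuming the remaining generated names follow the same pattern (`map_output_key`, `map_output_value`, `output_value`), a job block can pin the writable types like this sketch:

    job 'word_count' do
      map_output_key Hadoop::Io::Text
      map_output_value Hadoop::Io::IntWritable
      output_key Hadoop::Io::Text
      output_value Hadoop::Io::IntWritable
    end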
@@ -335,23 +365,74 @@ module Rubydoop
 
   # @private
   class Context
-
-
-    def initialize(conf, proxy_classes, arguments)
+    def initialize(conf)
       @conf = conf
-      @
-      @arguments = arguments
-      @jobs = []
+      @job_stack = [Jobs::Sequence.new]
     end
 
     def create_job(name)
       hadoop_job = Hadoop::Mapreduce::Job.new(@conf, name)
-      @
+      @job_stack.last.add(hadoop_job)
       hadoop_job
     end
 
-    def
-      @
+    def wait_for_completion(verbose)
+      @job_stack.first.wait_for_completion(verbose)
+    end
+
+    def parallel
+      push(Jobs::Parallel.new)
+      if block_given?
+        yield
+        pop
+      end
+    end
+
+    def sequence
+      push(Jobs::Sequence.new)
+      if block_given?
+        yield
+        pop
+      end
+    end
+
+    def push(job_list)
+      @job_stack.last.add(job_list)
+      @job_stack.push(job_list)
+    end
+
+    def pop
+      @job_stack.pop
+    end
+
+    class Jobs
+      attr_reader :jobs
+
+      def initialize
+        @jobs = []
+      end
+
+      def add(job)
+        @jobs.push(job)
+      end
+
+      class Sequence < Jobs
+        def wait_for_completion(verbose)
+          @jobs.all? do |job|
+            job.wait_for_completion(verbose)
+          end
+        end
+      end
+
+      class Parallel < Jobs
+        def wait_for_completion(verbose)
+          @jobs.map do |job|
+            Thread.new do
+              job.wait_for_completion(verbose)
+            end
+          end.map!(&:value).all?
+        end
+      end
     end
   end
 end
data/lib/rubydoop/job_runner.rb
ADDED
@@ -0,0 +1,50 @@
+# encoding: utf-8
+
+
+module Rubydoop
+  # @private
+  class JobRunner < Java::OrgApacheHadoopConf::Configured
+    include Java::OrgApacheHadoopUtil::Tool
+
+    def initialize(setup_script=$0, &block)
+      @setup_script = setup_script
+      @block = block
+    end
+
+    def run(args)
+      conf = Java::OrgApacheHadoopMapred::JobConf.new(get_conf)
+      conf.set(Java::Rubydoop::InstanceContainer::JOB_SETUP_SCRIPT_KEY, File.basename(@setup_script))
+      conf.jar = containing_jar
+      context = Context.new(conf)
+      configuration_definition = ConfigurationDefinition.new(context)
+      begin
+        configuration_definition.instance_exec(*args, &@block)
+      rescue => e
+        raise JobRunnerError, sprintf('Could not load job setup script (%s): %s', @setup_script.inspect, e.message.inspect), e.backtrace
+      end
+      configuration_definition.wait_for_completion(true) ? 0 : 1
+    end
+
+    def self.run(args, &block)
+      Java::JavaLang::System.exit(Java::OrgApacheHadoopUtil::ToolRunner.run(new(&block), args.to_java(:string)))
+    end
+
+    private
+
+    def containing_jar
+      @containing_jar ||= begin
+        relative_setup_script = @setup_script[/(?<=#{PUCK_ROOT}).+\Z/]
+        class_loader = JRuby.runtime.jruby_class_loader
+        if (url = class_loader.get_resources(relative_setup_script).find { |url| url.protocol == 'jar' })
+          path = url.path
+          path.slice!(/\Afile:/)
+          path = Java::JavaNet::URLDecoder.decode(path, 'UTF-8')
+          path.slice!(/!.*\Z/)
+          path
+        end
+      end
+    end
+  end
+
+  JobRunnerError = Class.new(StandardError)
+end
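Note: `JobRunner.run` hands the configuration block to Hadoop's `ToolRunner`, so generic Hadoop options are consumed before the block sees its arguments, and the process exits with the 0/1 status from `wait_for_completion`. A hypothetical invocation of a Puck-built jar (exact entry-point naming depends on how Puck lays out the jar):

    hadoop jar build/word_count.jar word_count -conf cluster.xml data/in data/out
    # the run block receives only ['data/in', 'data/out']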
data/lib/rubydoop/package.rb
CHANGED
@@ -1,11 +1,7 @@
 # encoding: utf-8
 
 require 'bundler'
-require '
-require 'ant'
-require 'fileutils'
-require 'set'
-
+require 'puck'
 
 module Rubydoop
   # Utility for making a job JAR that works with Hadoop.
@@ -25,15 +21,14 @@ module Rubydoop
     # @option options [String] :project_base_dir The project's base dir, defaults to the current directory (the assumption is that Package will be used from a Rake task)
     # @option options [String] :project_name The name of the JAR file (minus .jar), defaults to the directory name of the `:project_base_dir`
     # @option options [String] :build_dir The directory to put the final JAR into, defaults to `:project_base_dir + '/build'`
+    # @option options [String] :jruby_jar_path The path to a local copy of `jruby-complete.jar`, unless specified you need to have `jruby-jars` in your `Gemfile`
     # @option options [Array<String>] :gem_groups All gems from these Gemfile groups will be included, defaults to `[:default]` (the top-level group of a Gemfile)
     # @option options [Array<String>] :lib_jars Paths to extra JAR files to include in the JAR's lib directory (where they will be on the classpath when the job is run)
-    # @option options [String] :jruby_version The JRuby version to package, defaults to `JRUBY_VERSION`
-    # @option options [String] :jruby_jar_path The path to a local copy of `jruby-complete.jar`, defaults to downloading and caching a version defined by `:jruby_version`
     def initialize(options={})
       @options = default_options.merge(options)
-      @options[:project_name]
-      @options[:build_dir]
-      @options[:
+      @options[:project_name] ||= File.basename(@options[:project_base_dir])
+      @options[:build_dir] ||= File.join(@options[:project_base_dir], 'build')
+      @options[:jar_path] ||= "#{@options[:project_name]}.jar"
     end
 
     # Create the JAR package, see {Package#initialize} for configuration options.
@@ -42,9 +37,15 @@ module Rubydoop
     # (`jruby-complete.jar`) and locally cached, but if you already have a
     # copy in a local Ivy or Maven repository that will be used instead.
     def create!
-
-
-
+      Puck::Jar.new(
+        app_dir: @options[:project_base_dir],
+        app_name: @options[:project_name],
+        build_dir: @options[:build_dir],
+        jar_name: @options[:jar_path],
+        gem_groups: @options[:gem_groups],
+        extra_files: lib_jars,
+        jruby_complete: @options[:jruby_jar_path]
+      ).create
     end
 
     # A shortcut for `Package.new(options).create!`.
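Note: packaging is now delegated to Puck instead of the old Ant-based build (removed below). A minimal Rake task using this class, as the docs above assume (option values hypothetical):

    require 'rubydoop/package'

    task :package do
      Rubydoop::Package.create!(gem_groups: [:default], lib_jars: ['jars/extra.jar'])
    end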
@@ -52,11 +53,25 @@ module Rubydoop
       new(options).create!
     end
 
+    def respond_to?(name)
+      @options.key?(name) or super
+    end
+
+    def method_missing(name, *args)
+      @options[name] or super
+    end
+
+    def lib_jars
+      extra_files = { File.join(rubydoop_base_dir, 'lib/rubydoop.jar') => 'lib/rubydoop.jar' }
+      @options[:lib_jars].each_with_object(extra_files) do |jar, extra_files|
+        extra_files[jar] = File.join('lib', File.basename(jar))
+      end
+    end
+
     private
 
     def default_options
       defaults = {
-        :main_class => 'rubydoop.RubydoopJobRunner',
         :rubydoop_base_dir => File.expand_path('../../..', __FILE__),
        :project_base_dir => Dir.getwd,
         :gem_groups => [:default],
@@ -64,60 +79,5 @@ module Rubydoop
         :jruby_version => JRUBY_VERSION
       }
     end
-
-    def create_directories!
-      FileUtils.mkdir_p(@options[:build_dir])
-    end
-
-    def fetch_jruby!
-      return if File.exists?(@options[:jruby_jar_path])
-
-      local_maven_path = File.expand_path("~/.m2/repository/org/jruby/jruby-complete/#{@options[:jruby_version]}/jruby-complete-#{@options[:jruby_version]}.jar")
-      local_ivy_path = File.expand_path("~/.ivy2/cache/org.jruby/jruby-complete/jars/jruby-complete-#{@options[:jruby_version]}.jar")
-      remote_maven_url = "http://central.maven.org/maven2/org/jruby/jruby-complete/#{@options[:jruby_version]}/jruby-complete-#{@options[:jruby_version]}.jar"
-
-      if File.exists?(local_maven_path)
-        $stderr.puts("Using #{File.basename(local_maven_path)} from local Maven cache")
-        @options[:jruby_jar_path] = local_maven_path
-      elsif File.exists?(local_ivy_path)
-        $stderr.puts("Using #{File.basename(local_maven_path)} from local Ivy2 cache")
-        @options[:jruby_jar_path] = local_ivy_path
-      else
-        $stderr.puts("Downloading #{remote_maven_url} to #{@options[:jruby_jar_path]}")
-        jruby_complete_bytes = open(remote_maven_url).read
-        File.open(@options[:jruby_jar_path], 'wb') do |io|
-          io.write(jruby_complete_bytes)
-        end
-      end
-    end
-
-    def build_jar!
-      # the ant block is instance_exec'ed so instance variables and methods are not in scope
-      options = @options
-      bundled_gems = load_path
-      lib_jars = [options[:jruby_jar_path], *options[:lib_jars]]
-      ant do
-        jar :destfile => "#{options[:build_dir]}/#{options[:project_name]}.jar" do
-          manifest { attribute :name => 'Main-Class', :value => options[:main_class] }
-          zipfileset :src => "#{options[:rubydoop_base_dir]}/lib/rubydoop.jar"
-          fileset :dir => "#{options[:rubydoop_base_dir]}/lib", :includes => '**/*.rb', :excludes => '*.jar'
-          fileset :dir => "#{options[:project_base_dir]}/lib"
-          bundled_gems.each { |path| fileset :dir => path }
-          lib_jars.each { |extra_jar| zipfileset :dir => File.dirname(extra_jar), :includes => File.basename(extra_jar), :prefix => 'lib' }
-        end
-      end
-    end
-
-    def load_path
-      Bundler.definition.specs_for(@options[:gem_groups]).flat_map do |spec|
-        if spec.full_name !~ /^(?:bundler|rubydoop)-\d+/
-          spec.require_paths.map do |rp|
-            "#{spec.full_gem_path}/#{rp}"
-          end
-        else
-          []
-        end
-      end
-    end
   end
 end
data/lib/rubydoop/version.rb
CHANGED
metadata
CHANGED
@@ -1,16 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: rubydoop
 version: !ruby/object:Gem::Version
-
-  version: 1.0.0
+  version: 2.0.0.pre1
 platform: java
 authors:
 - Theo Hultberg
 autorequire:
 bindir: bin
 cert_chain: []
-date:
-dependencies:
+date: 2016-01-28 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.2'
+  name: puck
+  prerelease: false
+  type: :runtime
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.2'
 description: Rubydoop embeds a JRuby runtime in Hadoop, letting you write map reduce code in Ruby without using the streaming APIs
 email:
 - theo@iconara.net
@@ -19,35 +32,35 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/hadoop.rb
+- lib/rubydoop.jar
 - lib/rubydoop.rb
 - lib/rubydoop/dsl.rb
+- lib/rubydoop/job_runner.rb
 - lib/rubydoop/package.rb
 - lib/rubydoop/version.rb
-- lib/rubydoop.jar
 homepage: http://github.com/iconara/rubydoop
-licenses:
+licenses:
+- Apache License 2.0
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
-  none: false
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>'
     - !ruby/object:Gem::Version
-      version:
-  none: false
+      version: 1.3.1
 requirements: []
 rubyforge_project: rubydoop
-rubygems_version:
+rubygems_version: 2.4.5
 signing_key:
-specification_version:
+specification_version: 4
 summary: Write Hadoop jobs in Ruby
 test_files: []
 has_rdoc:
-...
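Note: the `required_rubygems_version: '>' 1.3.1` constraint is what RubyGems writes for prerelease gems, so `2.0.0.pre1` is not matched by a bare version requirement. To try it, pin the prerelease explicitly (sketch):

    # Gemfile
    gem 'rubydoop', '2.0.0.pre1', platforms: :jruby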