rubydoop 1.1.3 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rubydoop.jar +0 -0
- data/lib/rubydoop.rb +1 -43
- data/lib/rubydoop/dsl.rb +108 -29
- data/lib/rubydoop/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 82bf7b52baa55faaa4787d61890c72ad3eb7a9e1
+  data.tar.gz: f48b41c4268d3bc970ef942b845bcab414a6714f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 30b7c232ed09dc1425d1798f5828da8f9b109769f5fcc3dd9a1c97a71e35aad3605f49c3a84b3a52b879119df37357e9ee8bb81f6d85ba3f051f82c71b4b4624
+  data.tar.gz: 218dc633c5038de6b8964bcfc57c2d8d582032d2a3216e715ac755807aa1585946ceac944417d738376b24b757a441d99f7d9e695b66c8cc606c8f4a46314a0a
data/lib/rubydoop.jar
CHANGED
Binary file
data/lib/rubydoop.rb
CHANGED
@@ -10,49 +10,7 @@ require 'hadoop'
 # {Package} for the packaging documentation, or the {file:README.md README}
 # for a getting started guide.
 module Rubydoop
-  # @private
-  def self.create_mapper(conf)
-    create_instance(conf.get(MAPPER_KEY))
-  end
-
-  # @private
-  def self.create_reducer(conf)
-    create_instance(conf.get(REDUCER_KEY))
-  end
-
-  # @private
-  def self.create_combiner(conf)
-    create_instance(conf.get(COMBINER_KEY))
-  end
-
-  # @private
-  def self.create_partitioner(conf)
-    create_instance(conf.get(PARTITIONER_KEY))
-  end
-
-  # @private
-  def self.create_grouping_comparator(conf)
-    create_instance(conf.get(GROUPING_COMPARATOR_KEY))
-  end
-
-  # @private
-  def self.create_sort_comparator(conf)
-    create_instance(conf.get(SORT_COMPARATOR_KEY))
-  end
-
-  private
-
-  MAPPER_KEY = 'rubydoop.mapper'.freeze
-  REDUCER_KEY = 'rubydoop.reducer'.freeze
-  COMBINER_KEY = 'rubydoop.combiner'.freeze
-  PARTITIONER_KEY = 'rubydoop.partitioner'.freeze
-  GROUPING_COMPARATOR_KEY = 'rubydoop.grouping_comparator'.freeze
-  SORT_COMPARATOR_KEY = 'rubydoop.sort_comparator'.freeze
-
-  def self.create_instance(const_path)
-    cls = const_path.split('::').reduce(Object) { |host, name| host.const_get(name) }
-    cls.new
-  end
+  include_package 'rubydoop'
 end
 
 require 'rubydoop/dsl'
data/lib/rubydoop/dsl.rb
CHANGED
@@ -62,6 +62,14 @@ module Rubydoop
       job.instance_exec(&block)
       job
     end
+
+    def parallel(&block)
+      @context.parallel(&block)
+    end
+
+    def sequence(&block)
+      @context.sequence(&block)
+    end
   end
 
   # Job configuration DSL.
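The new `parallel` and `sequence` methods expose 1.2.0's job composition in the top-level configure block: jobs declared inside `parallel` run concurrently, jobs inside `sequence` run one after another. A sketch with illustrative job names, assuming the `Rubydoop.configure` entry point from the project README:

    Rubydoop.configure do |input_path, output_path|
      parallel do
        job 'word count' do
          # input/output/mapper/reducer as usual
        end
        job 'letter count' do
          # ...
        end
      end
    end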
@@ -94,11 +102,15 @@
         class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "InputFormat"
         format = Hadoop::Mapreduce::Lib::Input.const_get(class_name)
       end
+      unless format <= Hadoop::Mapreduce::InputFormat
+        @job.configuration.set(Rubydoop::InputFormatProxy::RUBY_CLASS_KEY, format.name)
+        format = Rubydoop::InputFormatProxy
+      end
       format.set_input_paths(@job, paths)
       @job.set_input_format_class(format)
     end
 
-    # Sets the output path of the job.
+    # Sets or gets the output path of the job.
     #
     # Calls `setOutputFormatClass` on the Hadoop job and uses the static
     # `setOutputPath` on the output format to set the job's output path.
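With this change a class passed to `input` via `:format` no longer has to be a Hadoop `InputFormat`: anything else is registered under `RUBY_CLASS_KEY` and wrapped in `Rubydoop::InputFormatProxy`, which delegates to the Ruby class at runtime. A hypothetical sketch (the class name is illustrative, and the Ruby class is assumed to implement the InputFormat contract of splits plus record readers):

    class CustomInputFormat
      # behaves like org.apache.hadoop.mapreduce.InputFormat, but is
      # written in Ruby; InputFormatProxy forwards the Java calls to it
    end

    job 'import' do
      input 'raw/*', format: CustomInputFormat
      # ...
    end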
@@ -108,14 +120,27 @@
     # @param [String] dir The output path
     # @param [Hash] options
     # @option options [JavaClass] :format The output format to use, defaults to `TextOutputFormat`
-    def output(dir, options={})
-      format = options.fetch(:format, :text)
-      unless format.is_a?(Class)
-        class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "OutputFormat"
-        format = Hadoop::Mapreduce::Lib::Output.const_get(class_name)
+    def output(dir=nil, options={})
+      if dir
+        if dir.is_a?(Hash)
+          options = dir
+          if options[:intermediate]
+            dir = @job.job_name
+          else
+            raise ArgumentError, sprintf('neither dir nor intermediate: true was specified')
+          end
+        end
+        dir = sprintf('%s-%010d-%05d', dir, Time.now, rand(1e5)) if options[:intermediate]
+        @output_dir = dir
+        format = options.fetch(:format, :text)
+        unless format.is_a?(Class)
+          class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "OutputFormat"
+          format = Hadoop::Mapreduce::Lib::Output.const_get(class_name)
+        end
+        format.set_output_path(@job, Hadoop::Fs::Path.new(@output_dir))
+        @job.set_output_format_class(format)
       end
-      format.set_output_path(@job, Hadoop::Fs::Path.new(dir))
-      @job.set_output_format_class(format)
+      @output_dir
     end
 
     # Sets a job property.
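`output` is now a getter as well as a setter, and the `:intermediate` option generates a throwaway directory name from the job name, the current time and a random suffix (the `'%s-%010d-%05d'` pattern above), which is useful when chaining jobs. A sketch with illustrative names and paths:

    job 'tokenize' do
      input 'books'
      output intermediate: true        # e.g. "tokenize-1434067200-04711"
      # ...
    end

    job 'count' do
      # ...
      output 'word-counts', format: :sequence_file  # SequenceFileOutputFormat
    end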
@@ -161,8 +186,8 @@
     # @param [Class] cls The (Ruby) mapper class.
     def mapper(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_mapper_class(
+        @job.configuration.set(Rubydoop::MapperProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_mapper_class(Rubydoop::MapperProxy)
         @mapper = cls
       end
       @mapper
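The pattern is the same for all of the component setters that follow: the Ruby class name goes into the configuration under the proxy's `RUBY_CLASS_KEY`, and the Java-side proxy (here `Rubydoop::MapperProxy`, reachable thanks to `include_package`) is what Hadoop actually instantiates. The mapper itself stays a plain Ruby class with a `map` method, along the lines of the README's word count (sketch):

    class WordCountMapper
      def map(key, value, context)
        value.to_s.split.each do |word|
          context.write(Hadoop::Io::Text.new(word), Hadoop::Io::IntWritable.new(1))
        end
      end
    end

    # in a job definition:
    #   mapper WordCountMapper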
@@ -187,8 +212,8 @@
     # @param [Class] cls The (Ruby) reducer class.
     def reducer(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_reducer_class(
+        @job.configuration.set(Rubydoop::ReducerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_reducer_class(Rubydoop::ReducerProxy)
         @reducer = cls
       end
       @reducer
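A matching reducer sketch, again in the README's style (the accumulation is illustrative):

    class WordCountReducer
      def reduce(key, values, context)
        sum = 0
        values.each { |value| sum += value.get }
        context.write(key, Hadoop::Io::IntWritable.new(sum))
      end
    end

    # in a job definition:
    #   reducer WordCountReducer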
@@ -208,8 +233,8 @@
     # @param [Class] cls The (Ruby) combiner class.
     def combiner(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_combiner_class(
+        @job.configuration.set(Rubydoop::CombinerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_combiner_class(Rubydoop::CombinerProxy)
         @combiner = cls
       end
       @combiner
@@ -230,8 +255,8 @@
     # @param [Class] cls The (Ruby) partitioner class.
     def partitioner(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_partitioner_class(
+        @job.configuration.set(Rubydoop::PartitionerProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_partitioner_class(Rubydoop::PartitionerProxy)
         @partitioner = cls
       end
       @partitioner
@@ -249,8 +274,8 @@
     # @param [Class] cls The (Ruby) comparator class.
     def grouping_comparator(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_grouping_comparator_class(
+        @job.configuration.set(Rubydoop::GroupingComparatorProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_grouping_comparator_class(Rubydoop::GroupingComparatorProxy)
         @grouping_comparator = cls
       end
       @grouping_comparator
@@ -268,8 +293,8 @@
     # @param [Class] cls The (Ruby) comparator class.
     def sort_comparator(cls=nil)
       if cls
-        @job.configuration.set(
-        @job.set_sort_comparator_class(
+        @job.configuration.set(Rubydoop::SortComparatorProxy::RUBY_CLASS_KEY, cls.name)
+        @job.set_sort_comparator_class(Rubydoop::SortComparatorProxy)
         @sort_comparator = cls
       end
       @sort_comparator
@@ -292,13 +317,13 @@
     def self.class_setter(dsl_name)
       define_method(dsl_name) do |cls|
         if cls
-          @job.send("set_#{dsl_name}_class", cls.
+          @job.send("set_#{dsl_name}_class", cls.to_java(Java::JavaLang::Class))
           instance_variable_set(:"@#{dsl_name}", cls)
         end
         instance_variable_get(:"@#{dsl_name}")
       end
       define_method("#{dsl_name}=") do |cls|
-        @job.send("set_#{dsl_name}_class", cls.
+        @job.send("set_#{dsl_name}_class", cls.to_java(Java::JavaLang::Class))
       end
     end
 
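`class_setter` generates the DSL methods that hand a Java class straight to the Hadoop job; the added `cls.to_java(Java::JavaLang::Class)` converts the JRuby class proxy into a `java.lang.Class` explicitly. Assuming the usual setters defined with it elsewhere in this file (e.g. `output_key` and `output_value`), usage looks like this sketch:

    job 'word_count' do
      # ...
      output_key Hadoop::Io::Text
      output_value Hadoop::Io::IntWritable
    end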
@@ -343,23 +368,77 @@
 
   # @private
   class Context
-    attr_reader :
+    attr_reader :arguments
 
-    def initialize(conf,
+    def initialize(conf, arguments)
       @conf = conf
-      @proxy_classes = proxy_classes
       @arguments = arguments.to_a
-      @
+      @job_stack = [Jobs::Sequence.new]
     end
 
     def create_job(name)
       hadoop_job = Hadoop::Mapreduce::Job.new(@conf, name)
-      @
+      @job_stack.last.add(hadoop_job)
       hadoop_job
     end
 
-    def
-      @
+    def wait_for_completion(verbose)
+      @job_stack.first.wait_for_completion(verbose)
+    end
+
+    def parallel
+      push(Jobs::Parallel.new)
+      if block_given?
+        yield
+        pop
+      end
+    end
+
+    def sequence
+      push(Jobs::Sequence.new)
+      if block_given?
+        yield
+        pop
+      end
+    end
+
+    def push(job_list)
+      @job_stack.last.add(job_list)
+      @job_stack.push(job_list)
+    end
+
+    def pop
+      @job_stack.pop
+    end
+
+    class Jobs
+      attr_reader :jobs
+
+      def initialize
+        @jobs = []
+      end
+
+      def add(job)
+        @jobs.push(job)
+      end
+
+      class Sequence < Jobs
+        def wait_for_completion(verbose)
+          @jobs.all? do |job|
+            job.wait_for_completion(verbose)
+          end
+        end
+      end
+
+      class Parallel < Jobs
+        def wait_for_completion(verbose)
+          @jobs.map do |job|
+            Thread.new do
+              job.wait_for_completion(verbose)
+            end
+          end.map!(&:value).all?
+        end
+      end
     end
   end
 end
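The Context now manages a stack of job lists instead of a flat list, and the composite is simple: `Jobs::Sequence#wait_for_completion` runs its jobs in order and stops at the first failure (`all?` short-circuits), while `Jobs::Parallel` starts one thread per job and joins them via `Thread#value`. Since nested `parallel`/`sequence` blocks push a new list onto the stack, compositions like this sketch (illustrative job names, inside a configure block) become possible:

    # the root of the stack is already a Jobs::Sequence
    job 'prepare' do
      # ...
    end
    parallel do        # runs after 'prepare'; 'stats' and 'index' run concurrently
      job 'stats' do
        # ...
      end
      job 'index' do
        # ...
      end
    end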
data/lib/rubydoop/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubydoop
 version: !ruby/object:Gem::Version
-  version: 1.1.3
+  version: 1.2.0
 platform: ruby
 authors:
 - Theo Hultberg
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2015-06-12 00:00:00.000000000 Z
 dependencies: []
 description: Rubydoop embeds a JRuby runtime in Hadoop, letting you write map reduce code in Ruby without using the streaming APIs
 email:
@@ -42,7 +42,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   version: '0'
 requirements: []
 rubyforge_project: rubydoop
-rubygems_version: 2.
+rubygems_version: 2.4.6
 signing_key:
 specification_version: 4
 summary: Write Hadoop jobs in Ruby