cascading.jruby 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/HACKING.md +15 -0
  2. data/History.txt +0 -0
  3. data/LICENSE.txt +165 -0
  4. data/README.md +7 -0
  5. data/Rakefile +45 -0
  6. data/bin/make_job +81 -0
  7. data/lib/cascading/assembly.rb +726 -0
  8. data/lib/cascading/base.rb +63 -0
  9. data/lib/cascading/cascade.rb +63 -0
  10. data/lib/cascading/cascading.rb +134 -0
  11. data/lib/cascading/cascading_exception.rb +30 -0
  12. data/lib/cascading/expr_stub.rb +33 -0
  13. data/lib/cascading/ext/array.rb +15 -0
  14. data/lib/cascading/flow.rb +168 -0
  15. data/lib/cascading/operations.rb +204 -0
  16. data/lib/cascading/scope.rb +160 -0
  17. data/lib/cascading.rb +63 -0
  18. data/samples/branch.rb +31 -0
  19. data/samples/cascading.rb +41 -0
  20. data/samples/copy.rb +18 -0
  21. data/samples/data/data2.txt +88799 -0
  22. data/samples/data/data_join1.txt +3 -0
  23. data/samples/data/data_join2.txt +3 -0
  24. data/samples/data/data_join3.txt +3 -0
  25. data/samples/join.rb +32 -0
  26. data/samples/logwordcount.rb +22 -0
  27. data/samples/project.rb +24 -0
  28. data/samples/rename.rb +21 -0
  29. data/samples/scorenames.rb +20 -0
  30. data/samples/splitter.rb +20 -0
  31. data/samples/union.rb +35 -0
  32. data/spec/cascading_spec.rb +100 -0
  33. data/spec/expr_spec.rb +10 -0
  34. data/spec/primary_key_spec.rb +119 -0
  35. data/spec/resource/join_input.txt +3 -0
  36. data/spec/resource/test_input.txt +4 -0
  37. data/spec/scope_spec.rb +174 -0
  38. data/spec/spec.opts +6 -0
  39. data/spec/spec_helper.rb +5 -0
  40. data/spec/spec_util.rb +188 -0
  41. data/src/cascading/jruby/Main.java +38 -0
  42. data/src/cascading/jruby/runner.rb +6 -0
  43. data/tags +238 -0
  44. data/tasks/ann.rake +80 -0
  45. data/tasks/ant.rake +11 -0
  46. data/tasks/bones.rake +20 -0
  47. data/tasks/gem.rake +206 -0
  48. data/tasks/git.rake +40 -0
  49. data/tasks/notes.rake +27 -0
  50. data/tasks/post_load.rake +34 -0
  51. data/tasks/rdoc.rake +50 -0
  52. data/tasks/rubyforge.rake +55 -0
  53. data/tasks/samples.rake +13 -0
  54. data/tasks/setup.rb +300 -0
  55. data/tasks/spec.rake +59 -0
  56. data/tasks/svn.rake +47 -0
  57. data/tasks/test.rake +42 -0
  58. data/test/data/data1.txt +14 -0
  59. data/test/data/data2.txt +14 -0
  60. data/test/test_assembly.rb +321 -0
  61. data/test/test_cascading.rb +49 -0
  62. data/test/test_flow.rb +15 -0
  63. metadata +137 -0
@@ -0,0 +1,726 @@
1
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
2
+ #
3
+ # This is free software. Please see the LICENSE and COPYING files for details.
4
+
5
+ require 'cascading/base'
6
+ require 'cascading/operations'
7
+ require 'cascading/ext/array'
8
+
9
+ module Cascading
10
+ class Assembly < Cascading::Node
11
+ include Operations
12
+
13
+ attr_accessor :tail_pipe, :head_pipe, :outgoing_scopes
14
+
15
# Creates the assembly +name+ underneath +parent+.
#
# With an Assembly parent, the head pipe is chained onto the parent's tail
# pipe and this assembly starts from a renamed copy of the parent's scope.
# With any other parent (a Flow), the head pipe is a fresh pipe and an empty
# scope is registered unless +outgoing_scopes+ already holds one for +name+.
#
# +outgoing_scopes+ is the hash of scopes keyed by assembly name.
def initialize(name, parent, outgoing_scopes = {})
  super(name, parent)

  @every_applied = false
  @outgoing_scopes = outgoing_scopes

  case parent
  when Assembly
    @head_pipe = Java::CascadingPipe::Pipe.new(name, parent.tail_pipe)
    # Take a copy so the name can be updated destructively
    @outgoing_scopes[name] = parent.scope.copy
    scope.scope.name = name
  else
    # Parent is a Flow
    @head_pipe = Java::CascadingPipe::Pipe.new(name)
    @outgoing_scopes[name] ||= Scope.empty_scope(name)
  end

  @tail_pipe = @head_pipe
end
31
+
32
# Walks up the parent chain and returns the first ancestor that is a Flow.
def parent_flow
  parent.kind_of?(Flow) ? parent : parent.parent_flow
end
36
+
37
# Returns the scope currently registered under this assembly's name.
def scope
  current_scope = @outgoing_scopes[name]
  current_scope
end
40
+
41
# Prints this assembly's current scope to standard output.
def debug_scope
  report = "Current scope for '#{name}':\n #{scope}\n----------\n"
  puts report
end
44
+
45
# Sets the primary key of the current scope to the given fields, or clears
# it when no field (or a nil first field) is given. The grouping primary key
# is always reset to the same value. A trailing options hash is accepted and
# ignored.
def primary(*args)
  args.extract_options! # strip a trailing options hash; nothing is read from it
  key_given = !args.empty? && args[0] != nil
  scope.primary_key_fields = key_given ? fields(args) : nil
  scope.grouping_primary_key_fields = scope.primary_key_fields
end
54
+
55
# Appends a pipe of class +type+ built from +parameters+ and records that
# the most recent pipe is not an Every.
def make_each(type, *parameters)
  make_pipe type, parameters
  @every_applied = false
end
59
+
60
# Appends a pipe of class +type+ built from +parameters+, carrying over the
# current grouping key fields, and records that an Every has been applied.
def make_every(type, *parameters)
  current_grouping_key = scope.grouping_key_fields
  make_pipe type, parameters, current_grouping_key
  @every_applied = true
end
64
+
65
# True when the most recently added pipe came from make_every, false after
# make_each or on a fresh assembly.
def every_applied?
  @every_applied
end
68
+
69
# Runs the aggregation block given to a grouping operation, then adds the
# implicit aggregations that follow it.
#
# Does nothing when no block is given. Otherwise the block is evaluated
# against this assembly, a +first+ aggregation is added for the non-key
# fields of every incoming scope whose primary key is contained in its
# grouping primary key, and, if an Every was applied, the fields are rebound
# to the names of the current grouping fields.
#
# +group_fields+ is accepted for the callers' convenience but not read here.
def do_every_block_and_rename_fields(group_fields, incoming_scopes, &block)
  return unless block

  # TODO: this should really be instance evaled on an object
  # that only allows aggregation and buffer operations.
  instance_eval(&block)

  # First all non-primary key fields from each pipe if its primary key is a
  # subset of the grouping primary key
  first_fields = []
  incoming_scopes.each do |incoming|
    next unless incoming.primary_key_fields

    primary_key = incoming.primary_key_fields.to_a
    grouping_primary_key = incoming.grouping_primary_key_fields.to_a
    next unless (primary_key & grouping_primary_key) == primary_key

    non_key = difference_fields(incoming.values_fields, incoming.primary_key_fields).to_a
    first_fields.concat(non_key.flatten)
  end
  # assert first_fields == first_fields.uniq

  # Do not first any fields explicitly aggregated over
  first_fields -= scope.grouping_fields.to_a
  unless first_fields.empty?
    first(*first_fields)
    puts "Firsting: #{first_fields.inspect} in assembly: #{@name}"
  end

  bind_names(scope.grouping_fields.to_a) if every_applied?
end
98
+
99
# Instantiates +type+ with +parameters+, makes it the new tail pipe, and
# replaces this assembly's scope with the outgoing scope computed from the
# new pipe, the incoming scopes and the grouping key fields.
def make_pipe(type, parameters, grouping_key_fields = [], incoming_scopes = [scope])
  @tail_pipe = type.new(*parameters)
  outgoing = Scope.outgoing_scope(@tail_pipe, incoming_scopes, grouping_key_fields, every_applied?)
  @outgoing_scopes[name] = outgoing
end
103
+
104
# One-line description showing the assembly name with its head and tail pipes.
def to_s
  "#{@name} : head pipe : #{@head_pipe} - tail pipe: #{@tail_pipe}"
end
107
+
108
# Builds a join (CoGroup) pipe over assemblies of the enclosing flow.
#
# Unnamed arguments are the names of the assemblies to join; each must be
# findable through the parent flow.
#
# The named options are:
# * <tt>:on</tt> the grouping fields. A string or an array of field names is
#   applied to every joined pipe. A hash maps assembly name to that
#   assembly's grouping field(s); in that case the joined pipes are rebuilt
#   from the hash keys, in sorted order.
# * <tt>:declared_fields</tt> the declared output fields (defaults to the
#   de-duplicated value fields of the incoming scopes).
# * <tt>:joiner</tt> one of <tt>:inner</tt> (default), <tt>:left</tt>,
#   <tt>:right</tt>, <tt>:outer</tt>, or an array whose entries are
#   true/1/:inner or false/0/:outer, which builds a MixedJoin. Any other
#   value is passed to CoGroup unchanged.
#
# The optional block is handed to do_every_block_and_rename_fields once the
# CoGroup has been added.
def join(*args, &block)
  options = args.extract_options!

  pipes, incoming_scopes = [], []
  args.each do |assembly_name|
    assembly = parent_flow.find_child(assembly_name)
    raise "Could not find assembly '#{assembly_name}' in join" unless assembly

    pipes << assembly.tail_pipe
    incoming_scopes << @outgoing_scopes[assembly.name]
  end

  group_fields_args = options.delete(:on)
  group_fields_args = [group_fields_args] if group_fields_args.kind_of?(String)

  # Kernel#Array rather than #to_a: nil and arrays behave the same, and a
  # bare String does not depend on the Ruby 1.8-only String#to_a.
  group_fields_names = Array(group_fields_args)
  group_fields = []
  if group_fields_args.kind_of?(Array)
    pipes.size.times do
      group_fields << fields(group_fields_args)
    end
  elsif group_fields_args.kind_of?(Hash)
    pipes, incoming_scopes = [], []
    keys = group_fields_args.keys.sort
    # The first (left) assembly's fields name the grouping; computed once
    # instead of being reassigned on every iteration.
    group_fields_names = Array(group_fields_args[keys.first]) unless keys.empty?
    keys.each do |assembly_name|
      assembly = parent_flow.find_child(assembly_name)
      raise "Could not find assembly '#{assembly_name}' in join" unless assembly

      pipes << assembly.tail_pipe
      incoming_scopes << @outgoing_scopes[assembly.name]
      group_fields << fields(group_fields_args[assembly_name])
    end
  end

  group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
  incoming_fields = incoming_scopes.map { |s| s.values_fields }
  declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
  joiner = options.delete(:joiner)

  if declared_fields
    case joiner
    when :inner, "inner", nil
      joiner = Java::CascadingPipeCogroup::InnerJoin.new
    when :left, "left"
      joiner = Java::CascadingPipeCogroup::LeftJoin.new
    when :right, "right"
      joiner = Java::CascadingPipeCogroup::RightJoin.new
    when :outer, "outer"
      joiner = Java::CascadingPipeCogroup::OuterJoin.new
    when Array
      joiner = joiner.map do |t|
        case t
        when true, 1, :inner then true
        when false, 0, :outer then false
        else fail "invalid mixed joiner entry: #{t}"
        end
      end
      joiner = Java::CascadingPipeCogroup::MixedJoin.new(joiner.to_java(:boolean))
    end
  end

  # compact drops a nil declared_fields/joiner so the matching CoGroup
  # constructor overload is selected
  parameters = [pipes.to_java(Java::CascadingPipe::Pipe), group_fields, declared_fields, joiner].compact
  grouping_key_fields = group_fields[0] # Left key group wins
  make_pipe(Java::CascadingPipe::CoGroup, parameters, grouping_key_fields, incoming_scopes)
  do_every_block_and_rename_fields(group_fields_names, incoming_scopes, &block)
end
alias co_group join
179
+
180
# Same as join, with the <tt>:joiner</tt> option forced to <tt>:inner</tt>.
def inner_join(*args, &block)
  options = args.extract_options!
  options.store(:joiner, :inner)
  join(*args.push(options), &block)
end
186
+
187
# Same as join, with the <tt>:joiner</tt> option forced to <tt>:left</tt>.
def left_join(*args, &block)
  options = args.extract_options!
  options.store(:joiner, :left)
  join(*args.push(options), &block)
end
193
+
194
# Same as join, with the <tt>:joiner</tt> option forced to <tt>:right</tt>.
def right_join(*args, &block)
  options = args.extract_options!
  options.store(:joiner, :right)
  join(*args.push(options), &block)
end
200
+
201
# Same as join, with the <tt>:joiner</tt> option forced to <tt>:outer</tt>.
def outer_join(*args, &block)
  options = args.extract_options!
  options.store(:joiner, :outer)
  join(*args.push(options), &block)
end
207
+
208
# Builds a new branch: a child assembly named +name+ that continues from
# this assembly and shares its outgoing scopes. The mandatory block is
# evaluated in the context of the child, which is then returned.
def branch(name, &block)
  unless block_given?
    raise "Could not build branch '#{name}'; block required"
  end

  child = Assembly.new(name, self, @outgoing_scopes)
  add_child(child)
  child.instance_eval(&block)
  child
end
216
+
217
# Builds a new _group_by_ pipe grouping on the fields given as unnamed
# arguments.
#
# The named options are:
# * <tt>:sort_by</tt> the fields to sort on (defaults to the grouping fields).
# * <tt>:reverse</tt> passed to GroupBy when not nil.
#
# The optional block is handed to do_every_block_and_rename_fields.
def group_by(*args, &block)
  options = args.extract_options!

  grouping = fields(args)
  sorting = fields(options[:sort_by] || args)

  # nil entries are dropped so the shorter GroupBy constructors are used
  pipe_arguments = [@tail_pipe, grouping, sorting, options[:reverse]].compact
  make_pipe(Java::CascadingPipe::GroupBy, pipe_arguments, grouping)
  do_every_block_and_rename_fields(args, [scope], &block)
end
231
+
232
# Unifies several pipes sharing the same field structure.
# This actually creates a GroupBy pipe.
#
# The first argument is the list of names of the assemblies to unify; each
# is looked up through the parent flow.
#
# Raises if the list is empty or if a named assembly cannot be found.
def union_pipes(*args)
  assembly_names = Array(args[0])
  raise "Could not build union; at least one assembly is required" if assembly_names.empty?

  pipes, incoming_scopes = [], []
  assembly_names.each do |assembly_name|
    assembly = parent_flow.find_child(assembly_name)
    # Fail with the same kind of message join uses instead of a NoMethodError on nil
    raise "Could not find assembly '#{assembly_name}' in union" unless assembly

    pipes << assembly.tail_pipe
    incoming_scopes << @outgoing_scopes[assembly.name]
  end

  # Groups only on the 1st field (see line 186 of GroupBy.java)
  grouping_key_fields = fields(incoming_scopes.first.values_fields.get(0))
  make_pipe(Java::CascadingPipe::GroupBy, [pipes.to_java(Java::CascadingPipe::Pipe)], grouping_key_fields, incoming_scopes)
  # TODO: Shouldn't union_pipes accept an every block?
  #do_every_block_and_rename_fields(args, incoming_scopes, &block)
end
249
+
250
# Builds a basic _every_ pipe, and adds it to the current assembly.
#
# Unnamed arguments are the input fields. The named options are:
# * <tt>:aggregator</tt> or <tt>:buffer</tt> the operation to apply.
# * <tt>:output</tt> the output fields.
def every(*args)
  options = args.extract_options!

  input = fields(args)
  output = fields(options[:output])
  aggregation = options[:aggregator] || options[:buffer]

  # nil entries are dropped before the Every is constructed
  make_every(Java::CascadingPipe::Every, *[@tail_pipe, input, aggregation, output].compact)
end
261
+
262
# Builds a basic _each_ pipe, and adds it to the current assembly.
#
# Unnamed arguments are the input fields. The named options are:
# * <tt>:filter</tt> or <tt>:function</tt> the operation to apply.
# * <tt>:output</tt> the output fields.
# --
# Example:
#   each "line", :filter=>regex_splitter(["name", "val1", "val2", "id"],
#                                        :pattern => /[.,]*\s+/),
#                :output=>["id", "name", "val1", "val2"]
def each(*args)
  options = args.extract_options!

  input = fields(args)
  output = fields(options[:output])
  applied = options[:filter] || options[:function]

  # nil entries are dropped before the Each is constructed
  make_each(Java::CascadingPipe::Each, *[@tail_pipe, input, applied, output].compact)
end
278
+
279
# Restricts the current assembly to the specified fields by running them
# through an Identity operation.
# --
# Example:
#   project "field1", "field2"
def project(*args)
  kept_fields = fields(args)
  identity = Java::CascadingOperation::Identity.new
  make_each(Java::CascadingPipe::Each, @tail_pipe, kept_fields, identity)
end
288
+
289
# Removes the specified fields from the current assembly by projecting onto
# the remaining value fields of the scope.
# --
# Example:
#   discard "field1", "field2"
def discard(*args)
  remaining = difference_fields(scope.values_fields, fields(args))
  project(*remaining.to_a)
end
298
+
299
# Assigns new names to the incoming fields in positional order.
# --
# Example:
#   bind_names "field1", "field2"
def bind_names(*new_names)
  identity = Java::CascadingOperation::Identity.new(fields(new_names))
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, identity)
end
308
+
309
# Renames fields according to the mapping provided; fields not mentioned
# keep their name. The primary key is re-declared under the new names.
#
# Raises if the mapping mentions a name that is not a current value field.
# --
# Example:
#   rename "old_name" => "new_name"
def rename(name_map)
  translate = lambda { |field_name| name_map[field_name] || field_name }

  old_names = scope.values_fields.to_a
  new_names = old_names.map(&translate)
  invalid = name_map.keys.sort - old_names
  raise "invalid names: #{invalid.inspect}" unless invalid.empty?

  new_key = scope.primary_key_fields.to_a.map(&translate)

  identity = Java::CascadingOperation::Identity.new(fields(new_names))
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, identity)
  primary(*new_key)
end
327
+
328
# Coerces fields to the given types through an Identity operation.
#
# +type_map+ maps field name to a type key of JAVA_TYPE_MAP. Fields are
# processed in sorted name order.
#
# Raises if a requested type has no entry in JAVA_TYPE_MAP, rather than
# handing a nil class to the Identity operation.
def cast(type_map)
  names = type_map.keys.sort
  requested = type_map.values_at(*names)
  unknown = requested.reject { |type| JAVA_TYPE_MAP[type] }
  raise "Unknown type(s) for cast: #{unknown.inspect}" unless unknown.empty?

  types = JAVA_TYPE_MAP.values_at(*requested)
  cast_fields = fields(names)
  types = types.to_java(java.lang.Class)
  operation = Java::CascadingOperation::Identity.new(cast_fields, types)
  make_each(Java::CascadingPipe::Each, @tail_pipe, cast_fields, operation)
end
336
+
337
# Copies fields through an Identity operation, keeping all fields in the
# output.
#
# The first unnamed argument is the source (defaults to all fields). The
# target is the second unnamed argument, else the <tt>:into</tt> option,
# else all fields.
def copy(*args)
  options = args.extract_options!
  source = args[0] || all_fields
  target = args[1] || options[:into] || all_fields
  identity = Java::CascadingOperation::Identity.new(fields(target))
  make_each(Java::CascadingPipe::Each, @tail_pipe, fields(source), identity, Java::CascadingTuple::Fields::ALL)
end
344
+
345
# A pipe that does nothing: all fields go through an Identity operation.
# Any arguments are ignored.
def pass(*args)
  identity = Java::CascadingOperation::Identity.new
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, identity)
end
350
+
351
# Adds an Each pipe applying the assertion given as first unnamed argument.
#
# The named options are:
# * <tt>:level</tt> the assertion level (defaults to AssertionLevel::STRICT).
def assert(*args)
  options = args.extract_options!
  level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
  make_each(Java::CascadingPipe::Each, @tail_pipe, level, args[0])
end
357
+
358
# Adds an Every pipe applying the group assertion given as first unnamed
# argument.
#
# The named options are:
# * <tt>:level</tt> the assertion level (defaults to AssertionLevel::STRICT).
def assert_group(*args)
  options = args.extract_options!
  level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
  make_every(Java::CascadingPipe::Every, @tail_pipe, level, args[0])
end
364
+
365
# Builds a debugging pipe over all fields.
#
# Without arguments, it builds a Debug operation with field printing enabled
# that reports every tuple.
#
# The named options are:
# * <tt>:print_fields</tt> a boolean handed to the Debug constructor
#   (defaults to true; an explicit false is now honoured).
# * <tt>:tuple_interval</tt> value assigned to the operation's
#   print_tuple_every (defaults to 1).
# * <tt>:fields_interval</tt> value assigned to the operation's
#   print_fields_every (defaults to 10).
def debug(*args)
  options = args.extract_options!

  # `options[:print_fields] || true` could never yield false; only fall back
  # to the default when the option is absent (or nil).
  print_fields = options[:print_fields]
  print_fields = true if print_fields.nil?

  debug_operation = Java::CascadingOperation::Debug.new(print_fields)
  debug_operation.print_tuple_every = options[:tuple_interval] || 1
  debug_operation.print_fields_every = options[:fields_interval] || 10
  each(all_fields, :filter => debug_operation)
end
382
+
383
# Builds a pipe that asserts the size of the tuple is the size specified.
#
# The only unnamed argument is the expected size; named options are passed
# on to assert.
def assert_size_equals(*args)
  options = args.extract_options!
  expected_size = args[0]
  assert(Java::CascadingOperationAssertion::AssertSizeEquals.new(expected_size), options)
end
391
+
392
# Builds a pipe that asserts that none of the fields in the tuple are null.
# Named options are passed on to assert.
def assert_not_null(*args)
  options = args.extract_options!
  assert(Java::CascadingOperationAssertion::AssertNotNull.new, options)
end
398
+
399
# Builds a pipe that asserts the size of each group is the size specified.
#
# The only unnamed argument is the expected size; named options are passed
# on to assert_group.
def assert_group_size_equals(*args)
  options = args.extract_options!
  expected_size = args[0]
  assert_group(Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(expected_size), options)
end
404
+
405
# Builds a series of every pipes for aggregation, one per field.
#
# +args+ is either a list of fields to aggregate followed by an options
# hash, or a hash mapping input field name to output field name (processed
# in sorted order) followed by an options hash. The options are forwarded to
# the aggregator constructor method.
#
# +function+ is a symbol naming the method to call to construct the
# Cascading Aggregator.
#
# A warning is printed when there is no field to aggregate.
def composite_aggregator(args, function)
  # An empty args has a nil first element, so this also covers the empty case
  mapping_given = args.first.kind_of?(Hash)
  field_map = args.shift.sort if mapping_given
  options = args.extract_options!
  field_map ||= args.zip(args)

  field_map.each do |in_field, out_field|
    aggregator = send(function, out_field, options)
    every(in_field, :aggregator => aggregator, :output => all_fields)
  end
  puts "WARNING: composite aggregator '#{function.to_s.gsub('_function', '')}' invoked on 0 fields; will be ignored" if field_map.empty?
end
429
+
430
# Aggregations built through composite_aggregator; each accepts the
# arguments described there.
def min(*args)
  composite_aggregator(args, :min_function)
end

def max(*args)
  composite_aggregator(args, :max_function)
end

def first(*args)
  composite_aggregator(args, :first_function)
end

def last(*args)
  composite_aggregator(args, :last_function)
end

def average(*args)
  composite_aggregator(args, :average_function)
end
435
+
436
# Counts elements of a group. The first unnamed parameter is the name of the
# output count field (defaults to 'count' if it is not provided); named
# options are forwarded to count_function.
def count(*args)
  options = args.extract_options!
  output_name = args[0] || 'count'
  counter = count_function(output_name, options)
  every(last_grouping_fields, :aggregator => counter, :output => all_fields)
end
443
+
444
# Sums fields within a group.
#
# Fields to be summed may either be provided as unnamed arguments, in which
# case each is aggregated into a field of the same name in the given order,
# or through the <tt>:mapping</tt> option, a hash from input field name to
# output field name that is processed in sorted order.
#
# The <tt>:type</tt> option is mandatory and must be a key of JAVA_TYPE_MAP;
# otherwise an error is raised.
def sum(*args)
  options = args.extract_options!
  java_type = JAVA_TYPE_MAP[options[:type]]
  raise "No type specified for sum" unless java_type

  pairs = if options[:mapping]
    options[:mapping].sort
  else
    args.zip(args)
  end

  pairs.each do |source, target|
    every(source, :aggregator => sum_function(target, :type => java_type), :output => all_fields)
  end
end
458
+
459
# Builds a _parse_ pipe. This pipe parses the input fields (the unnamed
# arguments) using a regex pattern.
#
# If provided, the unnamed arguments must be the fields to be parsed. If
# none is provided, all incoming fields are used.
#
# The named options are:
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
#
# The whole options hash is also forwarded to regex_parser.
def parse(*args)
  options = args.extract_options!
  # args is always an Array, so `args || all_fields` never fell back;
  # test for emptiness to get the documented default.
  input_fields = args.empty? ? all_fields : args
  pattern = options[:pattern]
  output = options[:output] || all_fields
  each(input_fields, :filter => regex_parser(pattern, options), :output => output)
end
475
+
476
# Builds a pipe that splits a field into other fields, using a regular
# expression.
#
# The first unnamed argument is the field to be split.
# The second unnamed argument is an array of strings naming the fields
# receiving the result of the split; the <tt>:into</tt> option takes
# precedence over it.
#
# The named options are:
# * <tt>:into</tt> the receiving fields.
# * <tt>:pattern</tt> a string or regex used for splitting (defaults to <tt>/[.,]*\s+/</tt>).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def split(*args)
  options = args.extract_options!
  targets = options[:into] || args[1]
  separator = options[:pattern] || /[.,]*\s+/
  outgoing = options[:output] || all_fields
  each(args[0], :function => regex_splitter(targets, :pattern => separator), :output => outgoing)
end
491
+
492
# Builds a pipe that splits a field into new rows, using a regular
# expression.
#
# The first unnamed argument is the field to be split.
# The second unnamed argument is the field receiving the result of the
# split; the <tt>:into</tt> option takes precedence over it.
#
# The named options are:
# * <tt>:into</tt> the receiving field.
# * <tt>:pattern</tt> a string or regex used for splitting (defaults to <tt>/[.,]*\s+/</tt>).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def split_rows(*args)
  options = args.extract_options!
  target = options[:into] || args[1]
  separator = options[:pattern] || /[.,]*\s+/
  outgoing = options[:output] || all_fields
  each(args[0], :function => regex_split_generator(target, :pattern => separator), :output => outgoing)
end
507
+
508
# Builds a pipe that emits a new row for each regex group matched in a
# field.
#
# The first unnamed argument is the field to be matched against.
# The second unnamed argument is the field receiving the result of the
# match; the <tt>:into</tt> option takes precedence over it.
#
# The named options are:
# * <tt>:into</tt> the receiving field.
# * <tt>:pattern</tt> a string or regex used for matching (defaults to <tt>/[\w]+/</tt>).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def match_rows(*args)
  options = args.extract_options!
  target = options[:into] || args[1]
  matcher = options[:pattern] || /[\w]+/
  outgoing = options[:output] || all_fields
  each(args[0], :function => regex_generator(target, :pattern => matcher), :output => outgoing)
end
523
+
524
# Builds a pipe that parses the specified field as a date using the provided
# format string.
# The unnamed argument specifies the field to parse.
#
# The named options are:
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it is the
#   name of the input field followed by "_parsed".
# * <tt>:pattern</tt> a string. Specifies the date format (defaults to "yyyy/MM/dd").
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def parse_date(*args)
  options = args.extract_options!
  source = args[0]
  target = options[:into] || "#{source}_parsed"
  outgoing = options[:output] || all_fields
  date_format = options[:pattern] || "yyyy/MM/dd"

  each(source, :function => date_parser(target, date_format), :output => outgoing)
end
540
+
541
# Builds a pipe that formats a date using a specified format pattern.
#
# The unnamed argument specifies the field to format.
#
# The named options are:
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it is the
#   name of the input field followed by "_formatted".
# * <tt>:pattern</tt> a string. Specifies the date format (defaults to "yyyy/MM/dd").
# * <tt>:timezone</tt> a string. Specifies the timezone; passed as-is (possibly nil) to
#   date_formatter, which was documented as defaulting to UTC.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def format_date(*args)
  options = args.extract_options!
  source = args[0]
  target = options[:into] || "#{source}_formatted"
  date_format = options[:pattern] || "yyyy/MM/dd"
  outgoing = options[:output] || all_fields

  each(source, :function => date_formatter(target, date_format, options[:timezone]), :output => outgoing)
end
559
+
560
# Builds a pipe that performs a search/replace based on a regular
# expression.
#
# The first unnamed argument specifies the input field.
#
# The named options are:
# * <tt>:pattern</tt> a string or regex. Specifies the pattern to look for in the input field. This
#   argument can also be given as the second _unnamed_ argument.
# * <tt>:replacement</tt> a string. Specifies the replacement. It can also be given as the third
#   _unnamed_ argument.
# * <tt>:into</tt> a string. The receiving field (defaults to the input field name followed by "_replaced").
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def replace(*args)
  options = args.extract_options!
  source = args[0]

  search = options[:pattern] || args[1]
  substitute = options[:replacement] || args[2]
  target = options[:into] || "#{source}_replaced"
  outgoing = options[:output] || all_fields

  each(source, :function => regex_replace(target, search, substitute), :output => outgoing)
end
579
+
580
# Builds pipes that insert values into the current tuple, one pipe per
# field, in sorted field-name order.
#
# The method takes a hash as parameter. Its keys are the names of the fields
# to insert and its values are the values they must contain. For example:
#
#   insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
#
# will insert two new fields: a field _who_ containing the string "Grégoire",
# and a field _when_ containing the formatted current date.
#
# A value that is an ExprStub is inserted through an expression function
# built from its expression and types instead of as a constant.
# The method outputs all fields.
def insert(args)
  args.keys.sort.each do |field_name|
    value = args[field_name]

    case value
    when ExprStub
      each all_fields,
        :function => expression_function(field_name, :expression => value.expression,
                                         :parameters => value.types),
        :output => all_fields
    else
      each all_fields, :function => insert_function([field_name], :values => [value]), :output => all_fields
    end
  end
end
604
+
605
# Builds a pipe that filters the tuples based on an expression or a pattern
# (but not both!). When both are given the expression wins; when neither is
# given no pipe is added.
#
# The first unnamed argument, if provided, is a filtering expression (using
# the Janino syntax).
#
# The named options are:
# * <tt>:from</tt> the input fields (all fields by default).
# * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples.
#   The remaining options are forwarded to regex_filter.
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This
#   option has the same effect as providing it as first unnamed argument.
def filter(*args)
  options = args.extract_options!
  from = options.delete(:from) || all_fields
  expression = options.delete(:expression) || args.shift
  regex = options.delete(:pattern)

  unless expression
    return regex ? each(from, :filter => regex_filter(regex, options)) : nil
  end

  stub = ExprStub.new(expression)
  each(from, :filter => expression_filter(:parameters => stub.types, :expression => stub.expression))
end
632
+
633
# Applies a FilterNull filter to the given fields. A trailing options hash
# is accepted and ignored.
def filter_null(*args)
  args.extract_options! # strip trailing options; none are read
  null_filter = Java::CascadingOperationFilter::FilterNull.new
  each(args, :filter => null_filter)
end
alias reject_null filter_null
638
+
639
# Applies a FilterNotNull filter to the given fields. A trailing options
# hash is accepted and ignored.
def filter_not_null(*args)
  args.extract_options! # strip trailing options; none are read
  not_null_filter = Java::CascadingOperationFilter::FilterNotNull.new
  each(args, :filter => not_null_filter)
end
alias where_null filter_not_null
644
+
645
# Builds a pipe that rejects the tuples based on an expression. This is
# filter restricted to expressions: a <tt>:pattern</tt> option raises.
#
# The first unnamed argument, if provided, is a filtering expression (using
# the Janino syntax).
#
# The named options are:
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This
#   option has the same effect as providing it as first unnamed argument.
def reject(*args)
  # Non-destructive lookup: the options stay in args for filter
  options = args.extract_options
  pattern_given = options && options[:pattern]
  raise "Regex not allowed" if pattern_given

  filter(*args)
end
659
+
660
# Builds a pipe that includes just the tuples matching an expression, by
# handing the negated expression to filter. A <tt>:pattern</tt> option raises.
#
# The first unnamed argument, if provided, is a selection expression (using
# the Janino syntax).
#
# The named options are:
# * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This
#   option has the same effect as providing it as first unnamed argument.
#
# The caller's options hash is not modified.
def where(*args)
  # Non-destructive lookup: the options stay in args for filter
  options = args.extract_options
  raise "Regex not allowed" if options && options[:pattern]

  if options && options[:expression]
    # Negate in a copy: writing into the caller's hash would negate the
    # expression again every time that hash is reused.
    negated = options.merge(:expression => "!(#{options[:expression]})")
    positional = args.last.kind_of?(Hash) ? args[0...-1] : args
    args = positional + [negated]
  elsif args[0] && !args[0].kind_of?(Hash)
    # Only a positional expression is negated, never a lone options hash
    args[0] = "!(#{args[0]})"
  end

  filter(*args)
end
680
+
681
# Builds a pipe that evaluates the specified Janino expression and inserts
# the result in a new field of the tuple.
#
# The expression and its parameters may also be given as the first and
# second unnamed arguments.
#
# The named options are:
# * <tt>:from</tt> a string or array of strings. Specifies the input fields (all fields by default).
# * <tt>:expression</tt> a string. The Janino expression.
# * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
# * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields by default).
def eval_expression(*args)
  options = args.extract_options!

  target = options.delete(:into)
  source = options.delete(:from) || all_fields
  outgoing = options.delete(:output) || all_fields
  # Positional fallbacks are consumed in order: expression, then parameters
  options[:expression] ||= args.shift
  options[:parameters] ||= args.shift

  each(source, :function => expression_function(target, options), :output => outgoing)
end
699
+
700
# Intended to build a pipe returning distinct tuples based on the provided
# fields (all fields by default).
#
# Deliberately disabled: it always raises "Distinct is badly broken". The
# implementation that used to follow the raise could never run; it grouped
# by the requested fields (args[0] or all_fields) and then added a pass pipe.
def distinct(*args)
  raise "Distinct is badly broken"
end
710
+
711
# Builds a pipe that will unify (merge) pipes. The unnamed arguments are the
# names of the assemblies to unify; a trailing options hash is accepted and
# ignored.
# Tuples unified must share the same fields.
def union(*args)
  args.extract_options! # strip trailing options; none are read
  union_pipes(args)
end
718
+
719
# Builds a pipe applying a field_joiner function to the fields given as
# unnamed arguments.
#
# The named options are forwarded to field_joiner; <tt>:output</tt> also
# selects the outgoing fields (all fields by default).
def join_fields(*args)
  options = args.extract_options!
  outgoing = options[:output] || all_fields

  each(args, :function => field_joiner(options), :output => outgoing)
end
725
+ end
726
+ end