cascading.jruby 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/HACKING.md +15 -0
  2. data/History.txt +0 -0
  3. data/LICENSE.txt +165 -0
  4. data/README.md +7 -0
  5. data/Rakefile +45 -0
  6. data/bin/make_job +81 -0
  7. data/lib/cascading/assembly.rb +726 -0
  8. data/lib/cascading/base.rb +63 -0
  9. data/lib/cascading/cascade.rb +63 -0
  10. data/lib/cascading/cascading.rb +134 -0
  11. data/lib/cascading/cascading_exception.rb +30 -0
  12. data/lib/cascading/expr_stub.rb +33 -0
  13. data/lib/cascading/ext/array.rb +15 -0
  14. data/lib/cascading/flow.rb +168 -0
  15. data/lib/cascading/operations.rb +204 -0
  16. data/lib/cascading/scope.rb +160 -0
  17. data/lib/cascading.rb +63 -0
  18. data/samples/branch.rb +31 -0
  19. data/samples/cascading.rb +41 -0
  20. data/samples/copy.rb +18 -0
  21. data/samples/data/data2.txt +88799 -0
  22. data/samples/data/data_join1.txt +3 -0
  23. data/samples/data/data_join2.txt +3 -0
  24. data/samples/data/data_join3.txt +3 -0
  25. data/samples/join.rb +32 -0
  26. data/samples/logwordcount.rb +22 -0
  27. data/samples/project.rb +24 -0
  28. data/samples/rename.rb +21 -0
  29. data/samples/scorenames.rb +20 -0
  30. data/samples/splitter.rb +20 -0
  31. data/samples/union.rb +35 -0
  32. data/spec/cascading_spec.rb +100 -0
  33. data/spec/expr_spec.rb +10 -0
  34. data/spec/primary_key_spec.rb +119 -0
  35. data/spec/resource/join_input.txt +3 -0
  36. data/spec/resource/test_input.txt +4 -0
  37. data/spec/scope_spec.rb +174 -0
  38. data/spec/spec.opts +6 -0
  39. data/spec/spec_helper.rb +5 -0
  40. data/spec/spec_util.rb +188 -0
  41. data/src/cascading/jruby/Main.java +38 -0
  42. data/src/cascading/jruby/runner.rb +6 -0
  43. data/tags +238 -0
  44. data/tasks/ann.rake +80 -0
  45. data/tasks/ant.rake +11 -0
  46. data/tasks/bones.rake +20 -0
  47. data/tasks/gem.rake +206 -0
  48. data/tasks/git.rake +40 -0
  49. data/tasks/notes.rake +27 -0
  50. data/tasks/post_load.rake +34 -0
  51. data/tasks/rdoc.rake +50 -0
  52. data/tasks/rubyforge.rake +55 -0
  53. data/tasks/samples.rake +13 -0
  54. data/tasks/setup.rb +300 -0
  55. data/tasks/spec.rake +59 -0
  56. data/tasks/svn.rake +47 -0
  57. data/tasks/test.rake +42 -0
  58. data/test/data/data1.txt +14 -0
  59. data/test/data/data2.txt +14 -0
  60. data/test/test_assembly.rb +321 -0
  61. data/test/test_cascading.rb +49 -0
  62. data/test/test_flow.rb +15 -0
  63. metadata +137 -0
@@ -0,0 +1,726 @@
1
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
2
+ #
3
+ # This is free software. Please see the LICENSE and COPYING files for details.
4
+
5
+ require 'cascading/base'
6
+ require 'cascading/operations'
7
+ require 'cascading/ext/array'
8
+
9
+ module Cascading
10
+ class Assembly < Cascading::Node
11
+ include Operations
12
+
13
+ attr_accessor :tail_pipe, :head_pipe, :outgoing_scopes
14
+
15
# Builds an assembly named +name+ under +parent+.
#
# When +parent+ is another Assembly, the head pipe continues from the
# parent's tail pipe and the scope starts as a copy of the parent's scope.
# Otherwise (the parent is a Flow) a fresh pipe is created and an empty
# scope is registered unless one already exists under +name+.
# +outgoing_scopes+ is the name => scope table this assembly records into.
def initialize(name, parent, outgoing_scopes = {})
  super(name, parent)

  @every_applied = false
  @outgoing_scopes = outgoing_scopes

  branching = parent.kind_of?(Assembly)
  @head_pipe =
    if branching
      Java::CascadingPipe::Pipe.new(name, parent.tail_pipe)
    else # Parent is a Flow
      Java::CascadingPipe::Pipe.new(name)
    end

  if branching
    # Copy to allow destructive update of name
    @outgoing_scopes[name] = parent.scope.copy
    scope.scope.name = name
  else
    @outgoing_scopes[name] ||= Scope.empty_scope(name)
  end

  @tail_pipe = @head_pipe
end
31
+
32
# Walks up the parent chain and returns the first parent that is a Flow.
def parent_flow
  if parent.kind_of?(Flow)
    parent
  else
    parent.parent_flow
  end
end
36
+
37
# Returns the scope currently registered for this assembly's name.
def scope
  @outgoing_scopes[name]
end
40
+
41
# Prints this assembly's current scope to standard output.
def debug_scope
  puts("Current scope for '#{name}':\n #{scope}\n----------\n")
end
44
+
45
# Sets the primary key of the current scope to the given field names, or
# clears it when no names (or a leading nil) are given. The grouping primary
# key is set to the same value.
def primary(*args)
  # Strips a trailing options hash from args; no option is read here.
  args.extract_options!

  key_fields = (args.size > 0 && args[0] != nil) ? fields(args) : nil
  scope.primary_key_fields = key_fields
  scope.grouping_primary_key_fields = scope.primary_key_fields
end
54
+
55
# Appends a pipe of class +type+ built from +parameters+ and records that
# the most recent pipe is not an Every.
def make_each(type, *parameters)
  make_pipe(type, parameters)
  @every_applied = false
end
59
+
60
# Appends a pipe of class +type+ built from +parameters+, carrying over the
# current grouping key fields, and records that an Every has been applied.
# The flag is set only after make_pipe, which reads the previous value.
def make_every(type, *parameters)
  grouping_key_fields = scope.grouping_key_fields
  make_pipe(type, parameters, grouping_key_fields)
  @every_applied = true
end
64
+
65
# Reports the flag last set by make_each (false) or make_every (true).
def every_applied?
  @every_applied
end
68
+
69
# Evaluates the aggregation +block+ against this assembly, then applies
# +first+ to the non-key fields of every incoming scope whose primary key is
# contained in its grouping primary key, and finally rebinds field names if
# an Every was applied. Does nothing without a block.
#
# +group_fields+ is accepted for the callers' convenience but is not read.
def do_every_block_and_rename_fields(group_fields, incoming_scopes, &block)
  return unless block

  # TODO: this should really be instance evaled on an object
  # that only allows aggregation and buffer operations.
  instance_eval(&block)

  # First all non-primary key fields from each pipe if its primary key is a
  # subset of the grouping primary key
  per_scope_fields = incoming_scopes.map do |incoming|
    next unless incoming.primary_key_fields

    primary_key = incoming.primary_key_fields.to_a
    grouping_primary_key = incoming.grouping_primary_key_fields.to_a
    next unless (primary_key & grouping_primary_key) == primary_key

    difference_fields(incoming.values_fields, incoming.primary_key_fields).to_a
  end
  first_fields = per_scope_fields.compact.flatten
  # assert first_fields == first_fields.uniq

  # Do not first any fields explicitly aggregated over
  first_fields = first_fields - scope.grouping_fields.to_a
  unless first_fields.empty?
    first(*first_fields)
    puts "Firsting: #{first_fields.inspect} in assembly: #{@name}"
  end

  bind_names scope.grouping_fields.to_a if every_applied?
end
98
+
99
# Instantiates +type+ with +parameters+ as the new tail pipe and recomputes
# this assembly's outgoing scope from the incoming scopes and grouping key.
def make_pipe(type, parameters, grouping_key_fields = [], incoming_scopes = [scope])
  @tail_pipe = type.new(*parameters)
  @outgoing_scopes[name] = Scope.outgoing_scope(
    @tail_pipe, incoming_scopes, grouping_key_fields, every_applied?
  )
end
103
+
104
# Human-readable summary: the assembly name with its head and tail pipes.
def to_s
  "#{@name} : head pipe : #{@head_pipe} - tail pipe: #{@tail_pipe}"
end
107
+
108
# Builds a join (CoGroup) pipe. Requires a list of assembly names to join.
#
# Unnamed arguments are assembly names, looked up through the parent flow.
#
# The named options are:
# * <tt>:on</tt> (required) a field name or an array of field names used as
#   the join key for every assembly, or a hash mapping assembly name to its
#   key field(s). With a hash, the joined assemblies are the hash keys in
#   sorted order and replace any unnamed arguments.
# * <tt>:declared_fields</tt> the fields declared by the join; defaults to
#   the de-duplicated value fields of the incoming scopes.
# * <tt>:joiner</tt> :inner (default), :left, :right or :outer (symbol or
#   string), or an array of true/1/:inner and false/0/:outer entries used to
#   build a MixedJoin. Any other value is handed to CoGroup unchanged.
#
# An optional block is evaluated against this assembly after the CoGroup is
# built (see do_every_block_and_rename_fields).
#
# Raises if an assembly cannot be found, if :on is missing or of an
# unsupported type, or if a mixed joiner entry is invalid.
def join(*args, &block)
  options = args.extract_options!

  pipes, incoming_scopes = [], []
  args.each do |assembly_name|
    assembly = parent_flow.find_child(assembly_name)
    raise "Could not find assembly '#{assembly_name}' in join" unless assembly

    pipes << assembly.tail_pipe
    incoming_scopes << @outgoing_scopes[assembly.name]
  end

  group_fields_args = options.delete(:on)
  group_fields_args = [group_fields_args] if group_fields_args.kind_of?(String)
  # Without usable keys the key array below would be empty and the failure
  # would only surface later, so fail here with a clear message.
  unless group_fields_args.kind_of?(Array) || group_fields_args.kind_of?(Hash)
    raise "join requires an :on option (String, Array or Hash); got #{group_fields_args.inspect}"
  end

  group_fields = []
  if group_fields_args.kind_of?(Array)
    group_fields_names = group_fields_args
    pipes.size.times do
      group_fields << fields(group_fields_args)
    end
  else
    # Hash form: the assemblies come from the hash keys, not the arguments.
    pipes, incoming_scopes = [], []
    keys = group_fields_args.keys.sort
    # Array() also accepts a single String value (String#to_a is gone in
    # Ruby >= 1.9).
    group_fields_names = Array(group_fields_args[keys.first])
    keys.each do |assembly_name|
      assembly = parent_flow.find_child(assembly_name)
      raise "Could not find assembly '#{assembly_name}' in join" unless assembly

      pipes << assembly.tail_pipe
      incoming_scopes << @outgoing_scopes[assembly.name]
      group_fields << fields(group_fields_args[assembly_name])
    end
  end

  group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
  incoming_fields = incoming_scopes.map { |s| s.values_fields }
  declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
  joiner = options.delete(:joiner)

  if declared_fields
    case joiner
    when :inner, "inner", nil
      joiner = Java::CascadingPipeCogroup::InnerJoin.new
    when :left, "left"
      joiner = Java::CascadingPipeCogroup::LeftJoin.new
    when :right, "right"
      joiner = Java::CascadingPipeCogroup::RightJoin.new
    when :outer, "outer"
      joiner = Java::CascadingPipeCogroup::OuterJoin.new
    when Array
      flags = joiner.map do |t|
        case t
        when true, 1, :inner then true
        when false, 0, :outer then false
        else fail "invalid mixed joiner entry: #{t}"
        end
      end
      joiner = Java::CascadingPipeCogroup::MixedJoin.new(flags.to_java(:boolean))
    end
  end

  parameters = [pipes.to_java(Java::CascadingPipe::Pipe), group_fields, declared_fields, joiner].compact
  grouping_key_fields = group_fields[0] # Left key group wins
  make_pipe(Java::CascadingPipe::CoGroup, parameters, grouping_key_fields, incoming_scopes)
  do_every_block_and_rename_fields(group_fields_names, incoming_scopes, &block)
end
alias co_group join
179
+
180
# Same as join with the :joiner option forced to :inner.
def inner_join(*args, &block)
  opts = args.extract_options!
  opts[:joiner] = :inner
  join(*(args << opts), &block)
end

# Same as join with the :joiner option forced to :left.
def left_join(*args, &block)
  opts = args.extract_options!
  opts[:joiner] = :left
  join(*(args << opts), &block)
end

# Same as join with the :joiner option forced to :right.
def right_join(*args, &block)
  opts = args.extract_options!
  opts[:joiner] = :right
  join(*(args << opts), &block)
end

# Same as join with the :joiner option forced to :outer.
def outer_join(*args, &block)
  opts = args.extract_options!
  opts[:joiner] = :outer
  join(*(args << opts), &block)
end
207
+
208
# Builds a new branch: a child assembly named +name+ that shares this
# assembly's scope table. The required block is evaluated against the child,
# which is returned. Raises when no block is given.
def branch(name, &block)
  raise "Could not build branch '#{name}'; block required" unless block_given?

  child = Assembly.new(name, self, @outgoing_scopes)
  add_child(child)
  child.instance_eval(&block)
  child
end
216
+
217
# Builds a new _group_by_ pipe. The unnamed arguments are the fields to
# group on.
#
# The named options are:
# * <tt>:sort_by</tt> fields to sort on (defaults to the grouping fields).
# * <tt>:reverse</tt> passed to GroupBy after the sort fields when not nil.
#
# An optional block is evaluated after the pipe is built (see
# do_every_block_and_rename_fields).
def group_by(*args, &block)
  options = args.extract_options!

  grouping = fields(args)
  sorting = fields(options[:sort_by] || args)

  pipe_args = [@tail_pipe, grouping, sorting, options[:reverse]].compact
  make_pipe(Java::CascadingPipe::GroupBy, pipe_args, grouping)
  do_every_block_and_rename_fields(args, [scope], &block)
end
231
+
232
# Unifies several pipes sharing the same field structure.
# This actually creates a GroupBy pipe.
# It expects a list of assembly names as its first parameter.
#
# Raises if the list is empty or if a named assembly cannot be found in the
# parent flow (the same check join performs).
def union_pipes(*args)
  assembly_names = Array(args[0])
  raise "union requires at least one assembly name" if assembly_names.empty?

  pipes, incoming_scopes = [], []
  assembly_names.each do |assembly_name|
    assembly = parent_flow.find_child(assembly_name)
    raise "Could not find assembly '#{assembly_name}' in union" unless assembly

    pipes << assembly.tail_pipe
    incoming_scopes << @outgoing_scopes[assembly.name]
  end

  # Groups only on the 1st field (see line 186 of GroupBy.java)
  grouping_key_fields = fields(incoming_scopes.first.values_fields.get(0))
  make_pipe(Java::CascadingPipe::GroupBy, [pipes.to_java(Java::CascadingPipe::Pipe)], grouping_key_fields, incoming_scopes)
  # TODO: Shouldn't union_pipes accept an every block?
  #do_every_block_and_rename_fields(args, incoming_scopes, &block)
end
249
+
250
# Builds a basic _every_ pipe, and adds it to the current assembly.
#
# Unnamed arguments are the input fields. The named options are:
# * <tt>:aggregator</tt> or <tt>:buffer</tt> the operation to apply
#   (:aggregator wins when both are given).
# * <tt>:output</tt> the output field selector.
def every(*args)
  options = args.extract_options!

  input = fields(args)
  output = fields(options[:output])
  operation = options[:aggregator] || options[:buffer]

  make_every(Java::CascadingPipe::Every, *[@tail_pipe, input, operation, output].compact)
end
261
+
262
# Builds a basic _each_ pipe, and adds it to the current assembly.
#
# Unnamed arguments are the input fields. The named options are:
# * <tt>:filter</tt> or <tt>:function</tt> the operation to apply (:filter
#   wins when both are given).
# * <tt>:output</tt> the output field selector.
# --
# Example:
# each "line", :filter=>regex_splitter(["name", "val1", "val2", "id"],
# :pattern => /[.,]*\s+/),
# :output=>["id", "name", "val1", "val2"]
def each(*args)
  options = args.extract_options!

  input = fields(args)
  output = fields(options[:output])
  operation = options[:filter] || options[:function]

  make_each(Java::CascadingPipe::Each, *[@tail_pipe, input, operation, output].compact)
end
278
+
279
# Restricts the current assembly to the specified fields by running an
# Identity operation over them.
# --
# Example:
# project "field1", "field2"
def project(*args)
  kept = fields(args)
  identity = Java::CascadingOperation::Identity.new
  make_each(Java::CascadingPipe::Each, @tail_pipe, kept, identity)
end
288
+
289
# Removes the specified fields from the current assembly by projecting onto
# the remaining value fields of the current scope.
# --
# Example:
# discard "field1", "field2"
def discard(*args)
  remaining = difference_fields(scope.values_fields, fields(args))
  project(*remaining.to_a)
end
298
+
299
# Assigns new names to the incoming fields in positional order, using an
# Identity operation that declares the new names.
# --
# Example:
# bind_names "field1", "field2"
def bind_names(*new_names)
  renamer = Java::CascadingOperation::Identity.new(fields(new_names))
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, renamer)
end
308
+
309
# Renames fields according to the mapping provided; fields absent from the
# mapping keep their names. The primary key is renamed the same way.
# Raises when the mapping names a field that is not in the current scope.
# --
# Example:
# rename "old_name" => "new_name"
def rename(name_map)
  translate = lambda { |field| name_map[field] || field }

  current_names = scope.values_fields.to_a
  renamed = current_names.map { |field| translate.call(field) }
  invalid = name_map.keys.sort - current_names
  raise "invalid names: #{invalid.inspect}" unless invalid.empty?

  # Capture the key before the pipe below replaces the scope.
  renamed_key = scope.primary_key_fields.to_a.map { |field| translate.call(field) }

  identity = Java::CascadingOperation::Identity.new(fields(renamed))
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, identity)
  primary(*renamed_key)
end
327
+
328
# Coerces fields to new types. +type_map+ maps field name => type key, where
# the type key is looked up in JAVA_TYPE_MAP. Fields are processed in sorted
# name order through an Identity operation declaring the target types.
def cast(type_map)
  names = type_map.keys.sort
  type_keys = type_map.values_at(*names)
  java_types = JAVA_TYPE_MAP.values_at(*type_keys).to_java(java.lang.Class)

  cast_fields = fields(names)
  coercion = Java::CascadingOperation::Identity.new(cast_fields, java_types)
  make_each(Java::CascadingPipe::Each, @tail_pipe, cast_fields, coercion)
end
336
+
337
# Copies fields under new names while keeping all existing fields.
#
# The first unnamed argument is the source field(s) (defaults to all
# fields); the target name(s) come from the second unnamed argument, then
# the <tt>:into</tt> option, then all fields.
def copy(*args)
  options = args.extract_options!
  source = args[0] || all_fields
  target = args[1] || options[:into] || all_fields

  duplicator = Java::CascadingOperation::Identity.new(fields(target))
  make_each(Java::CascadingPipe::Each, @tail_pipe, fields(source), duplicator, Java::CascadingTuple::Fields::ALL)
end
344
+
345
# A pipe that does nothing: an Identity operation over all fields.
# Arguments are accepted and ignored.
def pass(*args)
  identity = Java::CascadingOperation::Identity.new
  make_each(Java::CascadingPipe::Each, @tail_pipe, all_fields, identity)
end
350
+
351
# Adds an Each pipe running the assertion given as first unnamed argument.
#
# The named options are:
# * <tt>:level</tt> the assertion level (defaults to AssertionLevel::STRICT).
def assert(*args)
  options = args.extract_options!
  assertion = args[0]
  level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
  make_each(Java::CascadingPipe::Each, @tail_pipe, level, assertion)
end
357
+
358
# Adds an Every pipe running the group assertion given as first unnamed
# argument.
#
# The named options are:
# * <tt>:level</tt> the assertion level (defaults to AssertionLevel::STRICT).
def assert_group(*args)
  options = args.extract_options!
  assertion = args[0]
  level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
  make_every(Java::CascadingPipe::Every, @tail_pipe, level, assertion)
end
364
+
365
# Builds a debugging pipe.
#
# Without arguments, it generates a simple debug pipe over all fields using
# Cascading's Debug operation.
#
# The named options are:
# * <tt>:print_fields</tt> a boolean handed to the Debug constructor
#   (defaults to true; an explicit false is honoured).
# * <tt>:tuple_interval</tt> value for the operation's print_tuple_every
#   setting (defaults to 1).
# * <tt>:fields_interval</tt> value for the operation's print_fields_every
#   setting (defaults to 10).
def debug(*args)
  options = args.extract_options!

  # `options[:print_fields] || true` could never yield false, so only fall
  # back to the default when the option is absent (nil).
  print_fields = options[:print_fields]
  print_fields = true if print_fields.nil?

  operation = Java::CascadingOperation::Debug.new(print_fields)
  operation.print_tuple_every = options[:tuple_interval] || 1
  operation.print_fields_every = options[:fields_interval] || 10
  each(all_fields, :filter => operation)
end
382
+
383
# Builds a pipe that asserts the size of the tuple is the size specified in
# parameter.
#
# The method accepts a single unnamed argument: a number indicating the
# expected size. Named options are forwarded to assert.
def assert_size_equals(*args)
  options = args.extract_options!
  expected_size = args[0]
  assert(Java::CascadingOperationAssertion::AssertSizeEquals.new(expected_size), options)
end
391
+
392
# Builds a pipe that asserts that none of the fields in the tuple are null.
# Named options are forwarded to assert.
def assert_not_null(*args)
  options = args.extract_options!
  assert(Java::CascadingOperationAssertion::AssertNotNull.new, options)
end
398
+
399
# Builds a group assertion using AssertGroupSizeEquals constructed from the
# first unnamed argument. Named options are forwarded to assert_group.
def assert_group_size_equals(*args)
  options = args.extract_options!
  expected_size = args[0]
  assert_group(Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(expected_size), options)
end
404
+
405
# Builds a series of every pipes for aggregation.
#
# Args can either be a list of fields to aggregate and an options hash or
# a hash that maps input field name to output field name (similar to
# insert) and an options hash. A mapping hash is applied in sorted order.
#
# Options include:
# * <tt>:sql</tt> a boolean indicating whether the operation should act like the SQL equivalent
#
# <tt>function</tt> is a symbol that is the method to call to construct the Cascading Aggregator.
# A warning is printed when there is nothing to aggregate.
def composite_aggregator(args, function)
  explicit_mapping = !args.empty? && args.first.kind_of?(Hash)
  # The mapping hash must be taken off before the options are extracted.
  field_map = args.shift.sort if explicit_mapping
  options = args.extract_options!
  field_map = args.zip(args) unless explicit_mapping

  field_map.each do |in_field, out_field|
    aggregator = send(function, out_field, options)
    every(in_field, :aggregator => aggregator, :output => all_fields)
  end

  if field_map.empty?
    puts "WARNING: composite aggregator '#{function.to_s.gsub('_function', '')}' invoked on 0 fields; will be ignored"
  end
end
429
+
430
# Composite aggregators: each hands its arguments to composite_aggregator
# together with the name of the method that builds the aggregator.
def min(*args)
  composite_aggregator(args, :min_function)
end

def max(*args)
  composite_aggregator(args, :max_function)
end

def first(*args)
  composite_aggregator(args, :first_function)
end

def last(*args)
  composite_aggregator(args, :last_function)
end

def average(*args)
  composite_aggregator(args, :average_function)
end
435
+
436
# Counts elements of a group. First unnamed parameter is the name of the
# output count field (defaults to 'count' if it is not provided). Named
# options are forwarded to count_function.
def count(*args)
  options = args.extract_options!
  output_name = args[0] || 'count'
  counter = count_function(output_name, options)
  every(last_grouping_fields, :aggregator => counter, :output => all_fields)
end
443
+
444
# Sums fields within a group.
#
# Fields given as unnamed arguments are summed into fields of the same
# name, in the given order. Alternatively the <tt>:mapping</tt> option takes
# a hash of input field => output field, applied in sorted order.
#
# The <tt>:type</tt> option is required and is looked up in JAVA_TYPE_MAP;
# raises "No type specified for sum" when it does not resolve.
def sum(*args)
  options = args.extract_options!
  type = JAVA_TYPE_MAP[options[:type]]
  raise "No type specified for sum" unless type

  mapping =
    if options[:mapping]
      options[:mapping].sort
    else
      args.zip(args)
    end

  mapping.each do |in_field, out_field|
    summer = sum_function(out_field, :type => type)
    every(in_field, :aggregator => summer, :output => all_fields)
  end
end
458
+
459
# Builds a _parse_ pipe. This pipe will parse the fields specified in input (first unnamed arguments),
# using a specified regex pattern.
#
# If provided, the unnamed arguments must be the fields to be parsed. If not provided, then all incoming
# fields are used.
#
# The named options are:
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
#
# The full options hash is also forwarded to regex_parser.
def parse(*args)
  options = args.extract_options!
  # args is always an Array, so `args || all_fields` could never fall back;
  # test for emptiness to honour the documented default.
  input = args.empty? ? all_fields : args
  pattern = options[:pattern]
  output = options[:output] || all_fields
  each(input, :filter => regex_parser(pattern, options), :output => output)
end
475
+
476
# Builds a pipe that splits a field into other fields, using a specified regular expression.
#
# The first unnamed argument is the field to be split.
# The second unnamed argument is an array of strings indicating the fields receiving the result of the split.
#
# The named options are:
# * <tt>:into</tt> the receiving fields; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields
#   (defaults to /[.,]*\s+/).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def split(*args)
  options = args.extract_options!
  targets = options[:into] || args[1]
  pattern = options[:pattern] || /[.,]*\s+/
  output = options[:output] || all_fields

  splitter = regex_splitter(targets, :pattern => pattern)
  each(args[0], :function => splitter, :output => output)
end
491
+
492
# Builds a pipe that splits a field into new rows, using a specified regular expression.
#
# The first unnamed argument is the field to be split.
# The second unnamed argument is the field receiving the result of the split.
#
# The named options are:
# * <tt>:into</tt> the receiving field; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields
#   (defaults to /[.,]*\s+/).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def split_rows(*args)
  options = args.extract_options!
  target = options[:into] || args[1]
  pattern = options[:pattern] || /[.,]*\s+/
  output = options[:output] || all_fields

  generator = regex_split_generator(target, :pattern => pattern)
  each(args[0], :function => generator, :output => output)
end
507
+
508
# Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression.
#
# The first unnamed argument is the field to be matched against.
# The second unnamed argument is the field receiving the result of the match.
#
# The named options are:
# * <tt>:into</tt> the receiving field; takes precedence over the second unnamed argument.
# * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for matching the argument fields
#   (defaults to /[\w]+/).
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def match_rows(*args)
  options = args.extract_options!
  target = options[:into] || args[1]
  pattern = options[:pattern] || /[\w]+/
  output = options[:output] || all_fields

  generator = regex_generator(target, :pattern => pattern)
  each(args[0], :function => generator, :output => output)
end
523
+
524
# Builds a pipe that parses the specified field as a date using the provided format string.
# The unnamed argument specifies the field to parse.
#
# The named options are:
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it is the input field name
#   suffixed with "_parsed".
# * <tt>:pattern</tt> a string. Specifies the date format (defaults to "yyyy/MM/dd").
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def parse_date(*args)
  options = args.extract_options!
  source = args[0]
  target = options[:into] || "#{source}_parsed"
  output = options[:output] || all_fields
  pattern = options[:pattern] || "yyyy/MM/dd"

  each(source, :function => date_parser(target, pattern), :output => output)
end
540
+
541
# Builds a pipe that formats a date using a specified format pattern.
#
# The unnamed argument specifies the field to format.
#
# The named options are:
# * <tt>:into</tt> a string. It specifies the receiving field. By default, it is the input field name
#   suffixed with "_formatted".
# * <tt>:pattern</tt> a string. Specifies the date format (defaults to "yyyy/MM/dd").
# * <tt>:timezone</tt> a string. Specifies the timezone; passed to date_formatter as given.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def format_date(*args)
  options = args.extract_options!
  source = args[0]
  target = options[:into] || "#{source}_formatted"
  pattern = options[:pattern] || "yyyy/MM/dd"
  output = options[:output] || all_fields

  formatter = date_formatter(target, pattern, options[:timezone])
  each(source, :function => formatter, :output => output)
end
559
+
560
# Builds a pipe that performs a query/replace based on a regular expression.
#
# The first unnamed argument specifies the input field.
#
# The named options are:
# * <tt>:pattern</tt> a string or regex. Specifies the pattern to look for in the input field. This non-optional argument
#   can also be specified as a second _unnamed_ argument.
# * <tt>:replacement</tt> a string. Specifies the replacement; can also be given as a third unnamed argument.
# * <tt>:into</tt> a string. The receiving field; defaults to the input field name suffixed with "_replaced".
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def replace(*args)
  options = args.extract_options!
  source = args[0]

  pattern = options[:pattern] || args[1]
  replacement = options[:replacement] || args[2]
  target = options[:into] || "#{source}_replaced"
  output = options[:output] || all_fields

  each(source, :function => regex_replace(target, pattern, replacement), :output => output)
end
579
+
580
# Builds a pipe that inserts values into the current tuple.
#
# The method takes a hash as parameter. This hash contains as keys the names of the fields to insert
# and as values, the values they must contain. For example:
#
# insert("who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d"))
#
# will insert two new fields: a field _who_ containing the string "Grégoire", and a field _when_ containing
# the formatted current date.
# A value that is an ExprStub is inserted through an expression function instead of as a constant.
# Fields are inserted one at a time in sorted name order, and all fields are output.
def insert(args)
  args.keys.sort.each do |field_name|
    value = args[field_name]

    function =
      if value.kind_of?(ExprStub)
        expression_function(field_name, :expression => value.expression,
                                        :parameters => value.types)
      else
        insert_function([field_name], :values => [value])
      end

    each all_fields, :function => function, :output => all_fields
  end
end
604
+
605
# Builds a pipe that filters the tuples based on an expression or a pattern (but not both!).
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:from</tt> the input fields (all fields by default).
# * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples. If this
#   option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based. This is incompatible with the _pattern_ option.
#
# When both are supplied the expression wins; when neither is supplied no pipe is added.
def filter(*args)
  options = args.extract_options!
  input = options.delete(:from) || all_fields
  expression = options.delete(:expression) || args.shift
  pattern = options.delete(:pattern)

  if expression
    stub = ExprStub.new(expression)
    expression_options = { :parameters => stub.types, :expression => stub.expression }
    each input, :filter => expression_filter(expression_options)
  elsif pattern
    # The remaining options are handed to regex_filter.
    each input, :filter => regex_filter(pattern, options)
  end
end
632
+
633
# Applies Cascading's FilterNull filter to the given fields. A trailing
# options hash is stripped and ignored. Also available as reject_null.
def filter_null(*args)
  args.extract_options!
  null_filter = Java::CascadingOperationFilter::FilterNull.new
  each(args, :filter => null_filter)
end
alias reject_null filter_null
638
+
639
# Applies Cascading's FilterNotNull filter to the given fields. A trailing
# options hash is stripped and ignored. Also available as where_null.
def filter_not_null(*args)
  args.extract_options!
  not_null_filter = Java::CascadingOperationFilter::FilterNotNull.new
  each(args, :filter => not_null_filter)
end
alias where_null filter_not_null
644
+
645
# Builds a pipe that rejects the tuples based on an expression.
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based.
#
# Raises "Regex not allowed" when a :pattern option is given; otherwise delegates to filter.
def reject(*args)
  options = args.extract_options
  pattern_given = options && options[:pattern]
  raise "Regex not allowed" if pattern_given

  filter(*args)
end
659
+
660
# Builds a pipe that includes just the tuples matching an expression.
#
# The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
#
# The named options are:
# * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This option has the
#   same effect as providing it as first unnamed argument. If this option is provided, then the filter is Janino
#   expression-based.
#
# The expression is wrapped in "!( ... )" and handed to filter. Raises
# "Regex not allowed" when a :pattern option is given. The caller's options
# hash is not modified.
def where(*args)
  # Use the destructive extractor like the rest of the class: it always
  # yields a hash, so no nil guard is needed on any of the lookups below.
  options = args.extract_options!
  raise "Regex not allowed" if options[:pattern]

  if options[:expression]
    # merge builds a new hash so the caller's hash is left untouched.
    options = options.merge(:expression => "!(#{options[:expression]})")
  elsif args[0]
    args[0] = "!(#{args[0]})"
  end

  filter(*(args + [options]))
end
680
+
681
# Builds a pipe that evaluates the specified Janino expression and inserts it in a new field in the tuple.
#
# The expression and its parameters may also be given as the first and second unnamed arguments.
#
# The named options are:
# * <tt>:from</tt> a string or array of strings. Specifies the input fields (all fields by default).
# * <tt>:expression</tt> a string. The Janino expression.
# * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
# * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
# * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
def eval_expression(*args)
  options = args.extract_options!

  target = options.delete(:into)
  input = options.delete(:from) || all_fields
  output = options.delete(:output) || all_fields
  # Positional fallbacks are consumed in order: expression, then parameters.
  options[:expression] ||= args.shift
  options[:parameters] ||= args.shift

  each(input, :function => expression_function(target, options), :output => output)
end
699
+
700
# Builds a pipe that returns distinct tuples based on the provided fields.
#
# The method accepts an optional unnamed argument specifying the fields to base the distinct on
# (all fields, by default).
#
# Currently disabled: it always raises before doing any work. The statements
# after the raise are the intended implementation and are never reached.
def distinct(*args)
  raise "Distinct is badly broken"
  distinct_fields = args[0] || all_fields
  group_by(*distinct_fields)
  pass
end
710
+
711
# Builds a pipe that will unify (merge) pipes. The method accepts the list of pipes as argument.
# Tuples unified must share the same fields. A trailing options hash is stripped and ignored.
def union(*args)
  args.extract_options!
  union_pipes(args)
end
718
+
719
# Applies a field_joiner function to the fields given as unnamed arguments.
#
# The named options are forwarded to field_joiner; <tt>:output</tt> also
# selects the outgoing fields (all fields will be output by default).
def join_fields(*args)
  options = args.extract_options!
  output = options[:output] || all_fields

  each(args, :function => field_joiner(options), :output => output)
end
725
+ end
726
+ end