cascading.jruby 0.0.10 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,116 +1,118 @@
1
1
  module Cascading
2
- # The Cascading::Operations module is deprecated. The original idea from long
3
- # ago is that it would be useful to mixin operator wrappers to places other
4
- # than Cascading::Assembly, but this is not true. Instead, put Eaches in
5
- # Cascading::Assembly, Everies in Cascading::Aggregations, and any more
6
- # generally useful utility code directly in the Cascading module
7
- # (cascading/cascading.rb).
8
- #
9
- # Further, the entire *args pattern should be deprecated as it leads to
10
- # functions that can only be understood by reading their code. Instead,
11
- # idiomatic Ruby (positional required params and a params hash for optional
12
- # args) should be used. See Cascading::Assembly#set_value for an example.
13
2
  module Operations
14
- def identity
15
- Java::CascadingOperation::Identity.new
16
- end
17
-
18
- def aggregator_function(args, aggregator_klass)
19
- options = args.extract_options!
20
- ignore = options[:ignore]
21
-
22
- parameters = [Cascading.fields(args), ignore].compact
23
- aggregator_klass.new(*parameters)
24
- end
25
-
26
- def first_function(*args)
27
- aggregator_function(args, Java::CascadingOperationAggregator::First)
28
- end
29
-
30
- def min_function(*args)
31
- aggregator_function(args, Java::CascadingOperationAggregator::Min)
32
- end
33
-
34
- def max_function(*args)
35
- aggregator_function(args, Java::CascadingOperationAggregator::Max)
36
- end
37
-
38
- def last_function(*args)
39
- aggregator_function(args, Java::CascadingOperationAggregator::Last)
40
- end
41
-
42
- def regex_parser(*args)
43
- options = args.extract_options!
44
-
45
- pattern = args[0].to_s
46
- fields = Cascading.fields(options[:fields])
47
- groups = options[:groups].to_java(:int) if options[:groups]
48
- parameters = [fields, pattern, groups].compact
49
-
50
- Java::CascadingOperationRegex::RegexParser.new(*parameters)
51
- end
52
-
53
- def regex_splitter(*args)
54
- options = args.extract_options!
55
-
56
- fields = Cascading.fields(args)
57
- pattern = options[:pattern].to_s
58
- parameters = [fields, pattern].compact
59
- Java::CascadingOperationRegex::RegexSplitter.new(*parameters)
60
- end
61
-
62
- def regex_split_generator(*args)
63
- options = args.extract_options!
64
-
65
- fields = Cascading.fields(args)
66
- pattern = options[:pattern].to_s
67
- parameters = [fields, pattern].compact
68
- Java::CascadingOperationRegex::RegexSplitGenerator.new(*parameters)
69
- end
70
-
71
- def regex_generator(*args)
72
- options = args.extract_options!
73
-
74
- fields = Cascading.fields(args)
75
- pattern = options[:pattern].to_s
76
- parameters = [fields, pattern].compact
77
- Java::CascadingOperationRegex::RegexGenerator.new(*parameters)
78
- end
79
-
80
- def expression_function(*args)
81
- options = args.extract_options!
82
-
83
- fields = Cascading.fields(args)
84
- expression = options[:expression].to_s
85
- parameters = options[:parameters]
86
- parameter_names = []
87
- parameter_types = []
88
- if parameters.is_a? ::Hash
89
- parameters.each do |name, type|
90
- parameter_names << name
91
- parameter_types << type
3
+ # Debugs the current assembly at runtime, printing every tuple and fields
4
+ # every 10 tuples by default.
5
+ #
6
+ # The named options are:
7
+ # [prefix] String to prefix prints with.
8
+ # [print_fields] Boolean controlling field printing, defaults to false.
9
+ # [tuple_interval] Integer specifying interval between printed tuples
10
+ # [fields_interval] Integer specifying interval between printing fields
11
+ #
12
+ # Example:
13
+ # debug :prefix => 'DEBUG', :print_fields => true, :fields_interval => 1000
14
+ def debug(options = {})
15
+ input_fields = options[:input] || all_fields
16
+ prefix = options[:prefix]
17
+ print_fields = options[:print_fields]
18
+
19
+ debug = Java::CascadingOperation::Debug.new(*[prefix, print_fields].compact)
20
+
21
+ debug.print_tuple_every = options[:tuple_interval] || 1
22
+ debug.print_fields_every = options[:fields_interval] || 10
23
+
24
+ each(input_fields, :filter => debug)
25
+ end
26
+
27
+ # Inserts new fields into the current assembly. Values may be constants or
28
+ # expressions (see Cascading::expr). Fields will be inserted in
29
+ # lexicographic order (not necessarily the order provided).
30
+ #
31
+ # Example:
32
+ # insert 'field1' => 'constant_string', 'field2' => 0, 'field3' => expr('fieldA:long + fieldB:long')
33
+ def insert(insert_map)
34
+ insert_map.keys.sort.each do |field_name|
35
+ value = insert_map[field_name]
36
+
37
+ if value.kind_of?(ExprStub)
38
+ value.validate_scope(scope)
39
+ names, types = value.names_and_types
40
+ each(
41
+ all_fields,
42
+ :function => Java::CascadingOperationExpression::ExpressionFunction.new(fields(field_name), value.expression, names, types),
43
+ :output => all_fields
44
+ )
45
+ else # value is a constant
46
+ each(
47
+ all_fields,
48
+ :function => Java::CascadingOperation::Insert.new(fields(field_name), to_java_comparable_array([value])),
49
+ :output => all_fields
50
+ )
92
51
  end
93
- parameter_names = parameter_names.to_java(java.lang.String)
94
- parameter_types = parameter_types.to_java(java.lang.Class)
95
-
96
- arguments = [fields, expression, parameter_names, parameter_types].compact
97
- elsif !parameters.nil?
98
- arguments = [fields, expression, parameters.java_class].compact
99
- else
100
- arguments = [fields, expression, java.lang.String.java_class].compact
101
52
  end
102
-
103
- Java::CascadingOperationExpression::ExpressionFunction.new(*arguments)
104
53
  end
105
54
 
106
- def insert_function(*args)
107
- options=args.extract_options!
108
- fields = Cascading.fields(args)
109
- values = options[:values]
110
-
111
- parameters = [fields, to_java_comparable_array(values)].compact
112
- Java::CascadingOperation::Insert.new(*parameters)
113
- end
55
+ # Ungroups, or unpivots, a tuple (see Cascading's {UnGroup}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/function/UnGroup.html]).
56
+ #
57
+ # You must provide exactly one of :value_selectors and :num_values.
58
+ #
59
+ # The named options are:
60
+ # [value_selectors] Array of field names to ungroup. Each field will be
61
+ # ungrouped into an output tuple along with the key fields
62
+ # in the order provided.
63
+ # [num_values] Integer specifying the number of fields to ungroup into each
64
+ # output tuple (excluding the key fields). All input fields
65
+ # will be ungrouped.
66
+ #
67
+ # Example:
68
+ # ungroup 'key', ['new_key', 'val'], :value_selectors => ['val1', 'val2', 'val3'], :output => ['new_key', 'val']
69
+ def ungroup(key, into_fields, options = {})
70
+ input_fields = options[:input] || all_fields
71
+ output = options[:output] || all_fields
72
+
73
+ raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
74
+ value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
75
+ num_values = options[:num_values] if options.has_key?(:num_values)
76
+
77
+ parameters = [fields(into_fields), fields(key), value_selectors, num_values].compact
78
+ each input_fields, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
79
+ end
80
+
81
+ # Inserts one of two values into the dataflow based upon the result of the
82
+ # supplied filter on the input_fields. This is primarily useful for
83
+ # creating indicators from filters. keep_value specifies the Java value to
84
+ # produce when the filter would keep the given input and remove_value
85
+ # specifies the Java value to produce when the filter would remove the given
86
+ # input.
87
+ #
88
+ # Example:
89
+ # set_value 'field1', Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, 'is_field1_null'
90
+ def set_value(input_fields, filter, keep_value, remove_value, into_field, options = {})
91
+ output = options[:output] || all_fields
92
+ each input_fields, :function => Java::CascadingOperationFunction::SetValue.new(fields(into_field), filter, keep_value, remove_value), :output => output
93
+ end
94
+
95
+ # Efficient way of inserting a null indicator for any field, even one that
96
+ # cannot be coerced to a string. This is accomplished using Cascading's
97
+ # FilterNull and SetValue operators rather than Janino. 1 is produced if
98
+ # the field is null and 0 otherwise.
99
+ #
100
+ # Example:
101
+ # null_indicator 'field1', 'is_field1_null'
102
+ def null_indicator(input_field, into_field, options = {})
103
+ set_value input_field, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, into_field, :output => options[:output]
104
+ end
105
+
106
+ # Given an input_field and a regex, returns an indicator that is 1 if the string
107
+ # contains at least 1 match and 0 otherwise.
108
+ #
109
+ # Example:
110
+ # regex_contains 'field1', /\w+\s+\w+/, 'does_field1_contain_pair'
111
+ def regex_contains(input_field, regex, into_field, options = {})
112
+ set_value input_field, Java::CascadingOperationRegex::RegexFilter.new(pattern.to_s), 1.to_java, 0.to_java, into_field, :output => options[:output]
113
+ end
114
+
115
+ private
114
116
 
115
117
  def to_java_comparable_array(arr)
116
118
  (arr.map do |v|
@@ -130,72 +132,5 @@ module Cascading
130
132
  java.lang.String.new(v.to_s)
131
133
  end
132
134
  end
133
-
134
- def expression_filter(*args)
135
- options = args.extract_options!
136
- expression = (args[0] || options[:expression]).to_s
137
- parameters = options[:parameters]
138
- parameter_names = []
139
- parameter_types = []
140
- if parameters.is_a? ::Hash
141
- parameters.each do |name, type|
142
- parameter_names << name
143
- parameter_types << type
144
- end
145
- parameter_names = parameter_names.to_java(java.lang.String)
146
- parameter_types = parameter_types.to_java(java.lang.Class)
147
-
148
- arguments = [expression, parameter_names, parameter_types].compact
149
- elsif !parameters.nil?
150
- arguments = [expression, parameters.java_class].compact
151
- else
152
- arguments = [expression, java.lang.String.java_class].compact
153
- end
154
-
155
- Java::CascadingOperationExpression::ExpressionFilter.new(*arguments)
156
- end
157
-
158
- def date_parser(field, format)
159
- fields = fields(field)
160
- Java::CascadingOperationText::DateParser.new(fields, format)
161
- end
162
-
163
- def date_formatter(fields, format, timezone=nil)
164
- fields = fields(fields)
165
- timezone = Java::JavaUtil::TimeZone.get_time_zone(timezone) if timezone
166
- arguments = [fields, format, timezone].compact
167
- Java::CascadingOperationText::DateFormatter.new(*arguments)
168
- end
169
-
170
- def regex_filter(*args)
171
- options = args.extract_options!
172
-
173
- pattern = args[0]
174
- remove_match = options[:remove_match]
175
- match_each_element = options[:match_each_element]
176
- parameters = [pattern.to_s, remove_match, match_each_element].compact
177
- Java::CascadingOperationRegex::RegexFilter.new(*parameters)
178
- end
179
-
180
- def regex_replace(*args)
181
- options = args.extract_options!
182
-
183
- fields = fields(args[0])
184
- pattern = args[1]
185
- replacement = args[2]
186
- replace_all = options[:replace_all]
187
-
188
- parameters = [fields, pattern.to_s, replacement.to_s, replace_all].compact
189
- Java::CascadingOperationRegex::RegexReplace.new(*parameters)
190
- end
191
-
192
- def field_joiner(*args)
193
- options = args.extract_options!
194
- delimiter = options[:delimiter] || ','
195
- fields = fields(options[:into])
196
-
197
- parameters = [fields, delimiter].compact
198
- Java::CascadingOperationText::FieldJoiner.new(*parameters)
199
- end
200
135
  end
201
136
  end
@@ -0,0 +1,133 @@
1
+ module Cascading
2
+ # Module of pipe assemblies that wrap operations defined in the Cascading
3
+ # cascading.operations.regex package. These are split out only to group
4
+ # similar functionality.
5
+ #
6
+ # All DSL regex pipes require an input_field, a regex, and either a single
7
+ # into_field or one or more into_fields. Requiring a single input field
8
+ # allows us to raise an exception early if the wrong input is specified and
9
+ # avoids the non-intuitive situation where the first of many fields is
10
+ # silently taken as in Cascading. Requiring a regex means you don't have to
11
+ # go looking for defaults in code. And into_field(s) means we can propagate
12
+ # field names through the dataflow.
13
+ #
14
+ # Mapping of DSL pipes into Cascading regex operations:
15
+ # parse:: {RegexParser}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexParser.html]
16
+ # split:: {RegexSplitter}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexSplitter.html]
17
+ # split\_rows:: {RegexSplitGenerator}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexSplitGenerator.html]
18
+ # match\_rows:: {RegexGenerator}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexGenerator.html]
19
+ # replace:: {RegexReplace}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexReplace.html]
20
+ module RegexOperations
21
+ # Parses the given input_field using the specified regular expression to
22
+ # produce one output per group in that expression.
23
+ #
24
+ # The named options are:
25
+ # [groups] Array of integers specifying which groups to capture if you want
26
+ # a subset of groups.
27
+ #
28
+ # Example:
29
+ # parse 'field1', /(\w+)\s+(\w+)/, ['out1', 'out2'], :groups => [1, 2]
30
+ def parse(input_field, regex, into_fields, options = {})
31
+ groups = options[:groups].to_java(:int) if options[:groups]
32
+ output = options[:output] || all_fields # Overrides Cascading default
33
+
34
+ input_field = fields(input_field)
35
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
36
+
37
+ parameters = [fields(into_fields), regex.to_s, groups].compact
38
+ each(
39
+ input_field,
40
+ :function => Java::CascadingOperationRegex::RegexParser.new(*parameters),
41
+ :output => output
42
+ )
43
+ end
44
+ alias regex_parser parse
45
+
46
+ # Splits the given input_field into multiple fields using the specified
47
+ # regular expression.
48
+ #
49
+ # Example:
50
+ # split 'line', /\s+/, ['out1', 'out2']
51
+ def split(input_field, regex, into_fields, options = {})
52
+ output = options[:output] || all_fields # Overrides Cascading default
53
+
54
+ input_field = fields(input_field)
55
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
56
+
57
+ each(
58
+ input_field,
59
+ :function => Java::CascadingOperationRegex::RegexSplitter.new(fields(into_fields), regex.to_s),
60
+ :output => output
61
+ )
62
+ end
63
+ alias regex_splitter split
64
+
65
+ # Splits the given input_field into new rows using the specified regular
66
+ # expression.
67
+ #
68
+ # Example:
69
+ # split_rows 'line', /\s+/, 'word'
70
+ def split_rows(input_field, regex, into_field, options = {})
71
+ output = options[:output] || all_fields # Overrides Cascading default
72
+
73
+ input_field = fields(input_field)
74
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
75
+ into_field = fields(into_field)
76
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
77
+
78
+ each(
79
+ input_field,
80
+ :function => Java::CascadingOperationRegex::RegexSplitGenerator.new(into_field, regex.to_s),
81
+ :output => output
82
+ )
83
+ end
84
+ alias regex_split_generator split_rows
85
+
86
+ # Emits a new row for each regex group matched in input_field using the
87
+ # specified regular expression.
88
+ #
89
+ # Example:
90
+ # match_rows 'line', /(\w+)\s+(\w+)/, 'word'
91
+ def match_rows(input_field, regex, into_field, options = {})
92
+ output = options[:output] || all_fields # Overrides Cascading default
93
+
94
+ input_field = fields(input_field)
95
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
96
+ into_field = fields(into_field)
97
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
98
+
99
+ each(
100
+ input_field,
101
+ :function => Java::CascadingOperationRegex::RegexGenerator.new(into_field, regex.to_s),
102
+ :output => output
103
+ )
104
+ end
105
+ alias regex_generator match_rows
106
+
107
+ # Performs a query/replace on the given input_field using the specified
108
+ # regular expression and replacement.
109
+ #
110
+ # The named options are:
111
+ # [replace_all] Boolean indicating if all matches should be replaced;
112
+ # defaults to true (the Cascading default).
113
+ #
114
+ # Example:
115
+ # replace 'line', /[.,]*\s+/, 'tab_separated_line', "\t"
116
+ def replace(input_field, regex, into_field, replacement, options = {})
117
+ output = options[:output] || all_fields # Overrides Cascading default
118
+
119
+ input_field = fields(input_field)
120
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
121
+ into_field = fields(into_field)
122
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
123
+
124
+ parameters = [into_field, regex.to_s, replacement.to_s, options[:replace_all]].compact
125
+ each(
126
+ input_field,
127
+ :function => Java::CascadingOperationRegex::RegexReplace.new(*parameters),
128
+ :output => output
129
+ )
130
+ end
131
+ alias regex_replace replace
132
+ end
133
+ end
@@ -1,23 +1,35 @@
1
1
  module Cascading
2
+ # Scope is a wrapper for the private Cascading c.f.p.Scope object used to
3
+ # connect the dataflow graph by resolving fields. cascading.jruby wraps this
4
+ # facility so that it may be used to propagate field names at composition
5
+ # time (not Cascading plan time) in the same way they will later be
6
+ # propagated by the planner.
2
7
  class Scope
3
8
  attr_accessor :scope
4
9
 
10
+ # Construct a Scope given the Cascading c.f.p.Scope to wrap.
5
11
  def initialize(scope)
6
12
  @scope = scope
7
13
  end
8
14
 
15
+ # Copy one Scope into another; relies upon the copy constructor of
16
+ # c.f.p.Scope.
9
17
  def copy
10
18
  Scope.new(Java::CascadingFlowPlanner::Scope.new(@scope))
11
19
  end
12
20
 
21
+ # Build a c.f.p.Scope for a Flow, which is empty except for its name.
13
22
  def self.flow_scope(name)
14
23
  Java::CascadingFlowPlanner::Scope.new(name)
15
24
  end
16
25
 
26
+ # Build an empty Scope, wrapping an empty c.f.p.Scope.
17
27
  def self.empty_scope(name)
18
28
  Scope.new(Java::CascadingFlowPlanner::Scope.new(name))
19
29
  end
20
30
 
31
+ # Build a Scope for a single source Tap. The flow_scope is propagated
32
+ # through this call into a new Scope.
21
33
  def self.source_scope(name, tap, flow_scope)
22
34
  incoming_scopes = java.util.HashSet.new
23
35
  incoming_scopes.add(flow_scope)
@@ -27,28 +39,30 @@ module Cascading
27
39
  Scope.new(java_scope)
28
40
  end
29
41
 
42
+ # Build a Scope for an arbitrary flow element. This is used to update the
43
+ # Scope at each stage in a pipe Assembly.
30
44
  def self.outgoing_scope(flow_element, incoming_scopes)
31
45
  java_scopes = incoming_scopes.compact.map{ |s| s.scope }
32
46
  Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)))
33
47
  end
34
48
 
49
+ # The values fields of the Scope, which indicate the fields in the current
50
+ # dataflow tuple.
35
51
  def values_fields
36
52
  @scope.out_values_fields
37
53
  end
38
54
 
55
+ # The grouping fields of the Scope, which indicate the keys of an
56
+ # group/cogroup.
39
57
  def grouping_fields
40
58
  @scope.out_grouping_fields
41
59
  end
42
60
 
43
- def scope_fields_to_s(accessor)
44
- begin
45
- fields = @scope.send(accessor)
46
- fields.nil? ? 'null' : fields.to_s
47
- rescue
48
- 'ERROR'
49
- end
50
- end
51
-
61
+ # Prints a detailed description of this Scope, including its type and
62
+ # various selectors, fields, and key fields. Data is bubbled up directly
63
+ # from the Cascading c.f.p.Scope. This output can be useful for debugging
64
+ # the propagation of fields through your job (see Flow#debug_scope and
65
+ # Assembly#debug_scope, which both rely upon this method).
52
66
  def to_s
53
67
  kind = 'Unknown'
54
68
  kind = 'Tap' if @scope.tap?
@@ -77,6 +91,15 @@ END
77
91
 
78
92
  private
79
93
 
94
+ def scope_fields_to_s(accessor)
95
+ begin
96
+ fields = @scope.send(accessor)
97
+ fields.nil? ? 'null' : fields.to_s
98
+ rescue Exception => e
99
+ 'ERROR'
100
+ end
101
+ end
102
+
80
103
  def self.outgoing_scope_for(flow_element, incoming_scopes)
81
104
  begin
82
105
  flow_element.outgoing_scope_for(incoming_scopes)