cascading.jruby 0.0.10 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,116 +1,118 @@
1
1
  module Cascading
2
- # The Cascading::Operations module is deprecated. The original idea from long
3
- # ago is that it would be useful to mixin operator wrappers to places other
4
- # than Cascading::Assembly, but this is not true. Instead, put Eaches in
5
- # Cascading::Assembly, Everies in Cascading::Aggregations, and any more
6
- # generally useful utility code directly in the Cascading module
7
- # (cascading/cascading.rb).
8
- #
9
- # Further, the entire *args pattern should be deprecated as it leads to
10
- # functions that can only be understood by reading their code. Instead,
11
- # idiomatic Ruby (positional required params and a params hash for optional
12
- # args) should be used. See Cascading::Assembly#set_value for an example.
13
2
  module Operations
14
- def identity
15
- Java::CascadingOperation::Identity.new
16
- end
17
-
18
- def aggregator_function(args, aggregator_klass)
19
- options = args.extract_options!
20
- ignore = options[:ignore]
21
-
22
- parameters = [Cascading.fields(args), ignore].compact
23
- aggregator_klass.new(*parameters)
24
- end
25
-
26
- def first_function(*args)
27
- aggregator_function(args, Java::CascadingOperationAggregator::First)
28
- end
29
-
30
- def min_function(*args)
31
- aggregator_function(args, Java::CascadingOperationAggregator::Min)
32
- end
33
-
34
- def max_function(*args)
35
- aggregator_function(args, Java::CascadingOperationAggregator::Max)
36
- end
37
-
38
- def last_function(*args)
39
- aggregator_function(args, Java::CascadingOperationAggregator::Last)
40
- end
41
-
42
- def regex_parser(*args)
43
- options = args.extract_options!
44
-
45
- pattern = args[0].to_s
46
- fields = Cascading.fields(options[:fields])
47
- groups = options[:groups].to_java(:int) if options[:groups]
48
- parameters = [fields, pattern, groups].compact
49
-
50
- Java::CascadingOperationRegex::RegexParser.new(*parameters)
51
- end
52
-
53
- def regex_splitter(*args)
54
- options = args.extract_options!
55
-
56
- fields = Cascading.fields(args)
57
- pattern = options[:pattern].to_s
58
- parameters = [fields, pattern].compact
59
- Java::CascadingOperationRegex::RegexSplitter.new(*parameters)
60
- end
61
-
62
- def regex_split_generator(*args)
63
- options = args.extract_options!
64
-
65
- fields = Cascading.fields(args)
66
- pattern = options[:pattern].to_s
67
- parameters = [fields, pattern].compact
68
- Java::CascadingOperationRegex::RegexSplitGenerator.new(*parameters)
69
- end
70
-
71
- def regex_generator(*args)
72
- options = args.extract_options!
73
-
74
- fields = Cascading.fields(args)
75
- pattern = options[:pattern].to_s
76
- parameters = [fields, pattern].compact
77
- Java::CascadingOperationRegex::RegexGenerator.new(*parameters)
78
- end
79
-
80
- def expression_function(*args)
81
- options = args.extract_options!
82
-
83
- fields = Cascading.fields(args)
84
- expression = options[:expression].to_s
85
- parameters = options[:parameters]
86
- parameter_names = []
87
- parameter_types = []
88
- if parameters.is_a? ::Hash
89
- parameters.each do |name, type|
90
- parameter_names << name
91
- parameter_types << type
3
+ # Debugs the current assembly at runtime, printing every tuple and fields
4
+ # every 10 tuples by default.
5
+ #
6
+ # The named options are:
7
+ # [prefix] String to prefix prints with.
8
+ # [print_fields] Boolean controlling field printing, defaults to false.
9
+ # [tuple_interval] Integer specifying interval between printed tuples, defaults to 1.
10
+ # [fields_interval] Integer specifying interval between printed fields, defaults to 10.
11
+ #
12
+ # Example:
13
+ # debug :prefix => 'DEBUG', :print_fields => true, :fields_interval => 1000
14
+ def debug(options = {})
15
+ input_fields = options[:input] || all_fields
16
+ prefix = options[:prefix]
17
+ print_fields = options[:print_fields]
18
+
19
+ debug = Java::CascadingOperation::Debug.new(*[prefix, print_fields].compact)
20
+
21
+ debug.print_tuple_every = options[:tuple_interval] || 1
22
+ debug.print_fields_every = options[:fields_interval] || 10
23
+
24
+ each(input_fields, :filter => debug)
25
+ end
26
+
27
+ # Inserts new fields into the current assembly. Values may be constants or
28
+ # expressions (see Cascading::expr). Fields will be inserted in
29
+ # lexicographic order (not necessarily the order provided).
30
+ #
31
+ # Example:
32
+ # insert 'field1' => 'constant_string', 'field2' => 0, 'field3' => expr('fieldA:long + fieldB:long')
33
+ def insert(insert_map)
34
+ insert_map.keys.sort.each do |field_name|
35
+ value = insert_map[field_name]
36
+
37
+ if value.kind_of?(ExprStub)
38
+ value.validate_scope(scope)
39
+ names, types = value.names_and_types
40
+ each(
41
+ all_fields,
42
+ :function => Java::CascadingOperationExpression::ExpressionFunction.new(fields(field_name), value.expression, names, types),
43
+ :output => all_fields
44
+ )
45
+ else # value is a constant
46
+ each(
47
+ all_fields,
48
+ :function => Java::CascadingOperation::Insert.new(fields(field_name), to_java_comparable_array([value])),
49
+ :output => all_fields
50
+ )
92
51
  end
93
- parameter_names = parameter_names.to_java(java.lang.String)
94
- parameter_types = parameter_types.to_java(java.lang.Class)
95
-
96
- arguments = [fields, expression, parameter_names, parameter_types].compact
97
- elsif !parameters.nil?
98
- arguments = [fields, expression, parameters.java_class].compact
99
- else
100
- arguments = [fields, expression, java.lang.String.java_class].compact
101
52
  end
102
-
103
- Java::CascadingOperationExpression::ExpressionFunction.new(*arguments)
104
53
  end
105
54
 
106
- def insert_function(*args)
107
- options=args.extract_options!
108
- fields = Cascading.fields(args)
109
- values = options[:values]
110
-
111
- parameters = [fields, to_java_comparable_array(values)].compact
112
- Java::CascadingOperation::Insert.new(*parameters)
113
- end
55
+ # Ungroups, or unpivots, a tuple (see Cascading's {UnGroup}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/function/UnGroup.html]).
56
+ #
57
+ # You must provide exactly one of :value_selectors and :num_values.
58
+ #
59
+ # The named options are:
60
+ # [value_selectors] Array of field names to ungroup. Each field will be
61
+ # ungrouped into an output tuple along with the key fields
62
+ # in the order provided.
63
+ # [num_values] Integer specifying the number of fields to ungroup into each
64
+ # output tuple (excluding the key fields). All input fields
65
+ # will be ungrouped.
66
+ #
67
+ # Example:
68
+ # ungroup 'key', ['new_key', 'val'], :value_selectors => ['val1', 'val2', 'val3'], :output => ['new_key', 'val']
69
+ def ungroup(key, into_fields, options = {})
70
+ input_fields = options[:input] || all_fields
71
+ output = options[:output] || all_fields
72
+
73
+ raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
74
+ value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
75
+ num_values = options[:num_values] if options.has_key?(:num_values)
76
+
77
+ parameters = [fields(into_fields), fields(key), value_selectors, num_values].compact
78
+ each input_fields, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
79
+ end
80
+
81
+ # Inserts one of two values into the dataflow based upon the result of the
82
+ # supplied filter on the input_fields. This is primarily useful for
83
+ # creating indicators from filters. keep_value specifies the Java value to
84
+ # produce when the filter would keep the given input and remove_value
85
+ # specifies the Java value to produce when the filter would remove the given
86
+ # input.
87
+ #
88
+ # Example:
89
+ # set_value 'field1', Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, 'is_field1_null'
90
+ def set_value(input_fields, filter, keep_value, remove_value, into_field, options = {})
91
+ output = options[:output] || all_fields
92
+ each input_fields, :function => Java::CascadingOperationFunction::SetValue.new(fields(into_field), filter, keep_value, remove_value), :output => output
93
+ end
94
+
95
+ # Efficient way of inserting a null indicator for any field, even one that
96
+ # cannot be coerced to a string. This is accomplished using Cascading's
97
+ # FilterNull and SetValue operators rather than Janino. 1 is produced if
98
+ # the field is null and 0 otherwise.
99
+ #
100
+ # Example:
101
+ # null_indicator 'field1', 'is_field1_null'
102
+ def null_indicator(input_field, into_field, options = {})
103
+ set_value input_field, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, into_field, :output => options[:output]
104
+ end
105
+
106
+ # Given an input_field and a regex, returns an indicator that is 1 if the string
107
+ # contains at least 1 match and 0 otherwise.
108
+ #
109
+ # Example:
110
+ # regex_contains 'field1', /\w+\s+\w+/, 'does_field1_contain_pair'
111
+ def regex_contains(input_field, regex, into_field, options = {})
112
+ set_value input_field, Java::CascadingOperationRegex::RegexFilter.new(pattern.to_s), 1.to_java, 0.to_java, into_field, :output => options[:output]
113
+ end
114
+
115
+ private
114
116
 
115
117
  def to_java_comparable_array(arr)
116
118
  (arr.map do |v|
@@ -130,72 +132,5 @@ module Cascading
130
132
  java.lang.String.new(v.to_s)
131
133
  end
132
134
  end
133
-
134
- def expression_filter(*args)
135
- options = args.extract_options!
136
- expression = (args[0] || options[:expression]).to_s
137
- parameters = options[:parameters]
138
- parameter_names = []
139
- parameter_types = []
140
- if parameters.is_a? ::Hash
141
- parameters.each do |name, type|
142
- parameter_names << name
143
- parameter_types << type
144
- end
145
- parameter_names = parameter_names.to_java(java.lang.String)
146
- parameter_types = parameter_types.to_java(java.lang.Class)
147
-
148
- arguments = [expression, parameter_names, parameter_types].compact
149
- elsif !parameters.nil?
150
- arguments = [expression, parameters.java_class].compact
151
- else
152
- arguments = [expression, java.lang.String.java_class].compact
153
- end
154
-
155
- Java::CascadingOperationExpression::ExpressionFilter.new(*arguments)
156
- end
157
-
158
- def date_parser(field, format)
159
- fields = fields(field)
160
- Java::CascadingOperationText::DateParser.new(fields, format)
161
- end
162
-
163
- def date_formatter(fields, format, timezone=nil)
164
- fields = fields(fields)
165
- timezone = Java::JavaUtil::TimeZone.get_time_zone(timezone) if timezone
166
- arguments = [fields, format, timezone].compact
167
- Java::CascadingOperationText::DateFormatter.new(*arguments)
168
- end
169
-
170
- def regex_filter(*args)
171
- options = args.extract_options!
172
-
173
- pattern = args[0]
174
- remove_match = options[:remove_match]
175
- match_each_element = options[:match_each_element]
176
- parameters = [pattern.to_s, remove_match, match_each_element].compact
177
- Java::CascadingOperationRegex::RegexFilter.new(*parameters)
178
- end
179
-
180
- def regex_replace(*args)
181
- options = args.extract_options!
182
-
183
- fields = fields(args[0])
184
- pattern = args[1]
185
- replacement = args[2]
186
- replace_all = options[:replace_all]
187
-
188
- parameters = [fields, pattern.to_s, replacement.to_s, replace_all].compact
189
- Java::CascadingOperationRegex::RegexReplace.new(*parameters)
190
- end
191
-
192
- def field_joiner(*args)
193
- options = args.extract_options!
194
- delimiter = options[:delimiter] || ','
195
- fields = fields(options[:into])
196
-
197
- parameters = [fields, delimiter].compact
198
- Java::CascadingOperationText::FieldJoiner.new(*parameters)
199
- end
200
135
  end
201
136
  end
@@ -0,0 +1,133 @@
1
+ module Cascading
2
+ # Module of pipe assemblies that wrap operations defined in the Cascading
3
+ # cascading.operation.regex package. These are split out only to group
4
+ # similar functionality.
5
+ #
6
+ # All DSL regex pipes require an input_field, a regex, and either a single
7
+ # into_field or one or more into_fields. Requiring a single input field
8
+ # allows us to raise an exception early if the wrong input is specified and
9
+ # avoids the non-intuitive situation where the first of many fields is
10
+ # silently taken as in Cascading. Requiring a regex means you don't have to
11
+ # go looking for defaults in code. And into_field(s) means we can propagate
12
+ # field names through the dataflow.
13
+ #
14
+ # Mapping of DSL pipes into Cascading regex operations:
15
+ # parse:: {RegexParser}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexParser.html]
16
+ # split:: {RegexSplitter}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexSplitter.html]
17
+ # split\_rows:: {RegexSplitGenerator}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexSplitGenerator.html]
18
+ # match\_rows:: {RegexGenerator}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexGenerator.html]
19
+ # replace:: {RegexReplace}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexReplace.html]
20
+ module RegexOperations
21
+ # Parses the given input_field using the specified regular expression to
22
+ # produce one output per group in that expression.
23
+ #
24
+ # The named options are:
25
+ # [groups] Array of integers specifying which groups to capture if you want
26
+ # a subset of groups.
27
+ #
28
+ # Example:
29
+ # parse 'field1', /(\w+)\s+(\w+)/, ['out1', 'out2'], :groups => [1, 2]
30
+ def parse(input_field, regex, into_fields, options = {})
31
+ groups = options[:groups].to_java(:int) if options[:groups]
32
+ output = options[:output] || all_fields # Overrides Cascading default
33
+
34
+ input_field = fields(input_field)
35
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
36
+
37
+ parameters = [fields(into_fields), regex.to_s, groups].compact
38
+ each(
39
+ input_field,
40
+ :function => Java::CascadingOperationRegex::RegexParser.new(*parameters),
41
+ :output => output
42
+ )
43
+ end
44
+ alias regex_parser parse
45
+
46
+ # Splits the given input_field into multiple fields using the specified
47
+ # regular expression.
48
+ #
49
+ # Example:
50
+ # split 'line', /\s+/, ['out1', 'out2']
51
+ def split(input_field, regex, into_fields, options = {})
52
+ output = options[:output] || all_fields # Overrides Cascading default
53
+
54
+ input_field = fields(input_field)
55
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
56
+
57
+ each(
58
+ input_field,
59
+ :function => Java::CascadingOperationRegex::RegexSplitter.new(fields(into_fields), regex.to_s),
60
+ :output => output
61
+ )
62
+ end
63
+ alias regex_splitter split
64
+
65
+ # Splits the given input_field into new rows using the specified regular
66
+ # expression.
67
+ #
68
+ # Example:
69
+ # split_rows 'line', /\s+/, 'word'
70
+ def split_rows(input_field, regex, into_field, options = {})
71
+ output = options[:output] || all_fields # Overrides Cascading default
72
+
73
+ input_field = fields(input_field)
74
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
75
+ into_field = fields(into_field)
76
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
77
+
78
+ each(
79
+ input_field,
80
+ :function => Java::CascadingOperationRegex::RegexSplitGenerator.new(into_field, regex.to_s),
81
+ :output => output
82
+ )
83
+ end
84
+ alias regex_split_generator split_rows
85
+
86
+ # Emits a new row for each regex group matched in input_field using the
87
+ # specified regular expression.
88
+ #
89
+ # Example:
90
+ # match_rows 'line', /(\w+)\s+(\w+)/, 'word'
91
+ def match_rows(input_field, regex, into_field, options = {})
92
+ output = options[:output] || all_fields # Overrides Cascading default
93
+
94
+ input_field = fields(input_field)
95
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
96
+ into_field = fields(into_field)
97
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
98
+
99
+ each(
100
+ input_field,
101
+ :function => Java::CascadingOperationRegex::RegexGenerator.new(into_field, regex.to_s),
102
+ :output => output
103
+ )
104
+ end
105
+ alias regex_generator match_rows
106
+
107
+ # Performs a query/replace on the given input_field using the specified
108
+ # regular expression and replacement.
109
+ #
110
+ # The named options are:
111
+ # [replace_all] Boolean indicating if all matches should be replaced;
112
+ # defaults to true (the Cascading default).
113
+ #
114
+ # Example:
115
+ # replace 'line', /[.,]*\s+/, 'tab_separated_line', "\t"
116
+ def replace(input_field, regex, into_field, replacement, options = {})
117
+ output = options[:output] || all_fields # Overrides Cascading default
118
+
119
+ input_field = fields(input_field)
120
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
121
+ into_field = fields(into_field)
122
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
123
+
124
+ parameters = [into_field, regex.to_s, replacement.to_s, options[:replace_all]].compact
125
+ each(
126
+ input_field,
127
+ :function => Java::CascadingOperationRegex::RegexReplace.new(*parameters),
128
+ :output => output
129
+ )
130
+ end
131
+ alias regex_replace replace
132
+ end
133
+ end
@@ -1,23 +1,35 @@
1
1
  module Cascading
2
+ # Scope is a wrapper for the private Cascading c.f.p.Scope object used to
3
+ # connect the dataflow graph by resolving fields. cascading.jruby wraps this
4
+ # facility so that it may be used to propagate field names at composition
5
+ # time (not Cascading plan time) in the same way they will later be
6
+ # propagated by the planner.
2
7
  class Scope
3
8
  attr_accessor :scope
4
9
 
10
+ # Construct a Scope given the Cascading c.f.p.Scope to wrap.
5
11
  def initialize(scope)
6
12
  @scope = scope
7
13
  end
8
14
 
15
+ # Copy one Scope into another; relies upon the copy constructor of
16
+ # c.f.p.Scope.
9
17
  def copy
10
18
  Scope.new(Java::CascadingFlowPlanner::Scope.new(@scope))
11
19
  end
12
20
 
21
+ # Build a c.f.p.Scope for a Flow, which is empty except for its name.
13
22
  def self.flow_scope(name)
14
23
  Java::CascadingFlowPlanner::Scope.new(name)
15
24
  end
16
25
 
26
+ # Build an empty Scope, wrapping an empty c.f.p.Scope.
17
27
  def self.empty_scope(name)
18
28
  Scope.new(Java::CascadingFlowPlanner::Scope.new(name))
19
29
  end
20
30
 
31
+ # Build a Scope for a single source Tap. The flow_scope is propagated
32
+ # through this call into a new Scope.
21
33
  def self.source_scope(name, tap, flow_scope)
22
34
  incoming_scopes = java.util.HashSet.new
23
35
  incoming_scopes.add(flow_scope)
@@ -27,28 +39,30 @@ module Cascading
27
39
  Scope.new(java_scope)
28
40
  end
29
41
 
42
+ # Build a Scope for an arbitrary flow element. This is used to update the
43
+ # Scope at each stage in a pipe Assembly.
30
44
  def self.outgoing_scope(flow_element, incoming_scopes)
31
45
  java_scopes = incoming_scopes.compact.map{ |s| s.scope }
32
46
  Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)))
33
47
  end
34
48
 
49
+ # The values fields of the Scope, which indicate the fields in the current
50
+ # dataflow tuple.
35
51
  def values_fields
36
52
  @scope.out_values_fields
37
53
  end
38
54
 
55
+ # The grouping fields of the Scope, which indicate the keys of an
56
+ # group/cogroup.
39
57
  def grouping_fields
40
58
  @scope.out_grouping_fields
41
59
  end
42
60
 
43
- def scope_fields_to_s(accessor)
44
- begin
45
- fields = @scope.send(accessor)
46
- fields.nil? ? 'null' : fields.to_s
47
- rescue
48
- 'ERROR'
49
- end
50
- end
51
-
61
+ # Prints a detailed description of this Scope, including its type and
62
+ # various selectors, fields, and key fields. Data is bubbled up directly
63
+ # from the Cascading c.f.p.Scope. This output can be useful for debugging
64
+ # the propagation of fields through your job (see Flow#debug_scope and
65
+ # Assembly#debug_scope, which both rely upon this method).
52
66
  def to_s
53
67
  kind = 'Unknown'
54
68
  kind = 'Tap' if @scope.tap?
@@ -77,6 +91,15 @@ END
77
91
 
78
92
  private
79
93
 
94
+ def scope_fields_to_s(accessor)
95
+ begin
96
+ fields = @scope.send(accessor)
97
+ fields.nil? ? 'null' : fields.to_s
98
+ rescue Exception => e
99
+ 'ERROR'
100
+ end
101
+ end
102
+
80
103
  def self.outgoing_scope_for(flow_element, incoming_scopes)
81
104
  begin
82
105
  flow_element.outgoing_scope_for(incoming_scopes)