cascading.jruby 0.0.10 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,22 @@
1
1
  module Cascading
2
+ # A Node is a Cascade, Flow, or Assembly, all of which are composite
3
+ # structures that describe the hierarchical structure of your job. A Cascade
4
+ # may contain many Flows and a Flow and Assembly may contain many Assemblies
5
+ # (branches in the case of the Assembly). Nodes are named, contain parent
6
+ # and child pointers, and keep track of their children both by name and by
7
+ # insertion order.
8
+ #
9
+ # Nodes must be uniquely named within the scope of their parent so that they
10
+ # can be unambiguously looked up for connecting pipes within a flow. However, we
11
+ # only ensure that children are uniquely named upon insertion; full
12
+ # uniqueness isn't required until Node#find_child is called (this allows for
13
+ # name reuse in a few limited circumstances that was important when migrating
14
+ # the Etsy workload to enforce these constraints).
2
15
  class Node
3
16
  attr_accessor :name, :parent, :children, :child_names, :last_child
4
17
 
18
+ # A Node requires a name and a parent when it is constructed. Children are
19
+ # added later with Node#add_child.
5
20
  def initialize(name, parent)
6
21
  @name = name
7
22
  @parent = parent
@@ -23,10 +38,15 @@ module Cascading
23
38
  node
24
39
  end
25
40
 
41
+ # The qualified name of a node is formed from the name of all nodes in the
42
+ # path from the root to that node.
26
43
  def qualified_name
27
44
  parent ? "#{parent.qualified_name}.#{name}" : name
28
45
  end
29
46
 
47
+ # Produces a textual description of this Node. This method is overridden
48
+ # by all classes inheriting Node, so it serves mainly as a template for
49
+ # describing a node with children.
30
50
  def describe(offset = '')
31
51
  "#{offset}#{name}:node\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
32
52
  end
@@ -44,6 +64,8 @@ module Cascading
44
64
  all_children_with_name.first
45
65
  end
46
66
 
67
+ # Returns the root Node, the topmost parent of the hierarchy (typically a
68
+ # Cascade or Flow).
47
69
  def root
48
70
  return self unless parent
49
71
  parent.root
@@ -2,6 +2,13 @@ require 'cascading/base'
2
2
  require 'yaml'
3
3
 
4
4
  module Cascading
5
+ # A Cascade wraps a c.c.Cascade. A Cascade is composed of Flows, which are
6
+ # constructed using the Cascade#flow method within the block passed to the
7
+ # Cascading::cascade constructor. Many flows may be nested within a Cascade.
8
+ #
9
+ # Note that you are not required to use a Cascade to wrap your job. Instead,
10
+ # you could start with a top-level Flow, which you might prefer if you have
11
+ # no need of a c.c.Cascade's make-like semantics wrt sinks.
5
12
  class Cascade < Cascading::Node
6
13
  extend Registerable
7
14
 
@@ -10,46 +17,72 @@ module Cascading
10
17
  # Do not use this constructor directly; instead, use Cascading::cascade to
11
18
  # build cascades.
12
19
  #
13
- # Builds a cascade given the specified name. Optionally accepts
14
- # :properties which will be used as the default properties for all child
15
- # flows. Properties must be a Ruby Hash with string keys and values and
16
- # will be copied before being passed into each flow in the cascade. See
17
- # Cascading::Flow#initialize for details on how flows handle properties.
18
- # Optionally accepts a :mode which will be used as the default mode for all
19
- # child flows. See Cascading::Mode.parse for details.
20
- def initialize(name, params = {})
21
- @properties = params[:properties] || {}
22
- @mode = params[:mode]
20
+ # Builds a Cascade given a name.
21
+ #
22
+ # The named options are:
23
+ # [properties] Properties hash which will be used as the default properties
24
+ # for all child flows. Properties must be a Ruby Hash with
25
+ # string keys and values and will be copied before being
26
+ # passed into each flow in the cascade. See Flow#initialize
27
+ # for details on how flows handle properties.
28
+ # [mode] Mode which will be used as the default mode for all child flows.
29
+ # See Mode.parse for details.
30
+ def initialize(name, options = {})
31
+ @properties = options[:properties] || {}
32
+ @mode = options[:mode]
23
33
  super(name, nil) # A Cascade cannot have a parent
24
34
  self.class.add(name, self)
25
35
  end
26
36
 
27
- # Builds a child flow given a name and block. Optionally accepts
28
- # :properties which will override the default properties stroed in this
29
- # cascade. Optionally accepts a :mode, which will override the default
30
- # mode stored in this cascade.
31
- def flow(name, params = {}, &block)
37
+ # Builds a child Flow in this Cascade given a name and block.
38
+ #
39
+ # The named options are:
40
+ # [properties] Properties hash which will override the default properties
41
+ # stored in this cascade.
42
+ # [mode] Mode which will override the default mode stored in this cascade.
43
+ #
44
+ # Example:
45
+ # cascade 'wordcount', :mode => :local do
46
+ # flow 'first_step' do
47
+ # ...
48
+ # end
49
+ #
50
+ # flow 'second_step' do
51
+ # ...
52
+ # end
53
+ # end
54
+ def flow(name, options = {}, &block)
32
55
  raise "Could not build flow '#{name}'; block required" unless block_given?
33
56
 
34
- params[:properties] ||= properties.dup
35
- params[:mode] ||= mode
57
+ options[:properties] ||= properties.dup
58
+ options[:mode] ||= mode
36
59
 
37
- flow = Flow.new(name, self, params)
60
+ flow = Flow.new(name, self, options)
38
61
  add_child(flow)
39
62
  flow.instance_eval(&block)
40
63
  flow
41
64
  end
42
65
 
66
+ # Produces a textual description of this Cascade. The description details
67
+ # the structure of the Cascade, the sources and sinks of each Flow, and the
68
+ # input and output fields of each Assembly. The offset parameter allows
69
+ # for this describe to be nested within a calling context, which lets us
70
+ # indent the structural hierarchy of a job.
43
71
  def describe(offset = '')
44
72
  "#{offset}#{name}:cascade\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
45
73
  end
46
74
 
75
+ # Writes out the DOT file describing the structure of this Cascade.
76
+ #
77
+ # NOTE: will be moved to Job in a later version and will also be present on Flow
47
78
  def draw(dir)
48
79
  @children.each do |name, flow|
49
80
  flow.connect.writeDOT("#{dir}/#{name}.dot")
50
81
  end
51
82
  end
52
83
 
84
+ # Builds a map, keyed by flow name, of the sink metadata for each child
85
+ # flow. Currently, this contains only the field names of each sink.
53
86
  def sink_metadata
54
87
  @children.inject({}) do |sink_fields, (name, flow)|
55
88
  sink_fields[name] = flow.sink_metadata
@@ -57,12 +90,16 @@ module Cascading
57
90
  end
58
91
  end
59
92
 
93
+ # Writes the mapping produced by Cascade#sink_metadata to a file at the
94
+ # given path in YAML.
60
95
  def write_sink_metadata(file_name)
61
96
  File.open(file_name, 'w') do |file|
62
97
  YAML.dump(sink_metadata, file)
63
98
  end
64
99
  end
65
100
 
101
+ # Connects this Cascade, producing a c.c.Cascade, which is then completed,
102
+ # executing it. Child flows are connected, so no parameters are required.
66
103
  def complete
67
104
  begin
68
105
  Java::CascadingCascade::CascadeConnector.new.connect(name, make_flows(@children)).complete
@@ -1,6 +1,33 @@
1
+ require 'cascading/cascade'
2
+ require 'cascading/flow'
1
3
  require 'cascading/expr_stub'
2
4
 
5
+ # The Cascading module contains all of the cascading.jruby DSL. Inserting the
6
+ # following into your script:
7
+ # require 'rubygems'
8
+ # require 'cascading'
9
+ # includes this module at the top level, making all of its features available.
10
+ #
11
+ # To build a dataflow like the one in the README.md or
12
+ # {samples}[http://github.com/mrwalker/cascading.jruby/tree/master/samples],
13
+ # start by looking at Cascade or Flow. These are the
14
+ # highest level structures you'll use to put together your job.
15
+ #
16
+ # Within a flow, you'll connect sources to sinks by way of Assembly, which
17
+ # refers to "pipe assemblies" from Cascading. Within an Assembly, you'll use
18
+ # functions and filters (see Operations, IdentityOperations, RegexOperations,
19
+ # FilterOperations, and TextOperations) as well as Assembly#group_by,
20
+ # Assembly#union, and Assembly#join. You can provide those last pipes with a
21
+ # block that can select operations from Aggregations.
22
+ #
23
+ # Finally, you'll want to address the execution of your job, whether it be
24
+ # locally testing or running remotely on a Hadoop cluster. See the Mode class
25
+ # for the available modes, and parameterize your script such that it can operate
26
+ # in Cascading local mode locally and in Hadoop mode when run in a jar produced
27
+ # with {Jading}[http://github.com/mrwalker/jading].
3
28
  module Cascading
29
+ # Mapping that defines a convenient syntax for specifying Java classes, used
30
+ # in Janino expressions and elsewhere.
4
31
  JAVA_TYPE_MAP = {
5
32
  :int => java.lang.Integer.java_class, :long => java.lang.Long.java_class,
6
33
  :bool => java.lang.Boolean.java_class, :double => java.lang.Double.java_class,
@@ -24,44 +51,84 @@ module Cascading
24
51
  # directly building their own cascades and flows so that jading can send them
25
52
  # default properties.
26
53
 
27
- # Builds a top-level cascade given a name and a block. Optionally accepts a
28
- # :mode, as explained in Cascading::Cascade#initialize.
29
- def cascade(name, params = {}, &block)
54
+ # Builds a top-level Cascade given a name and a block.
55
+ #
56
+ # The named options are:
57
+ # [properties] See Cascade#initialize
58
+ # [mode] See Cascade#initialize
59
+ #
60
+ # Example:
61
+ # cascade 'wordcount', :mode => :local do
62
+ # flow 'first_step' do
63
+ # ...
64
+ # end
65
+ #
66
+ # flow 'second_step' do
67
+ # ...
68
+ # end
69
+ # end
70
+ def cascade(name, options = {}, &block)
30
71
  raise "Could not build cascade '#{name}'; block required" unless block_given?
31
- raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if params[:properties]
72
+ raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if options[:properties]
32
73
 
33
- params[:properties] = $jobconf_properties.dup if $jobconf_properties
74
+ options[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties
34
75
 
35
- cascade = Cascade.new(name, params)
76
+ cascade = Cascade.new(name, options)
36
77
  cascade.instance_eval(&block)
37
78
  cascade
38
79
  end
39
80
 
40
- # Builds a top-level flow given a name and block for applications built of
41
- # flows with no cascades. Optionally accepts a :mode, as explained in
42
- # Cascading::Flow#initialize.
43
- def flow(name, params = {}, &block)
81
+ # Builds a top-level Flow given a name and block for applications built of
82
+ # flows with no cascades.
83
+ #
84
+ # The named options are:
85
+ # [properties] See Flow#initialize
86
+ # [mode] See Flow#initialize
87
+ #
88
+ # Example:
89
+ # flow 'wordcount', :mode => :local do
90
+ # assembly 'first_step' do
91
+ # ...
92
+ # end
93
+ #
94
+ # assembly 'second_step' do
95
+ # ...
96
+ # end
97
+ # end
98
+ def flow(name, options = {}, &block)
44
99
  raise "Could not build flow '#{name}'; block required" unless block_given?
45
- raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if params[:properties]
100
+ raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if options[:properties]
46
101
 
47
- params[:properties] = $jobconf_properties.dup if $jobconf_properties
102
+ options[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties
48
103
 
49
- flow = Flow.new(name, nil, params)
104
+ flow = Flow.new(name, nil, options)
50
105
  flow.instance_eval(&block)
51
106
  flow
52
107
  end
53
108
 
109
+ # Produces a textual description of all Cascades in the global registry. The
110
+ # description details the structure of the Cascades, the sources and sinks of
111
+ # each Flow, and the input and output fields of each Assembly.
112
+ #
113
+ # NOTE: will be moved to Job in later version
54
114
  def describe
55
115
  Cascade.all.map{ |cascade| cascade.describe }.join("\n")
56
116
  end
57
117
  alias desc describe
58
118
 
59
119
  # See ExprStub.expr
60
- def expr(expression, params = {})
61
- ExprStub.expr(expression, params)
120
+ def expr(expression, options = {})
121
+ ExprStub.expr(expression, options)
62
122
  end
63
123
 
64
- # Creates a cascading.tuple.Fields instance from a string or an array of strings.
124
+ # Utility method for creating Cascading c.t.Fields from a field name (string)
125
+ # or list of field names (array of strings). If the input fields is already a
126
+ # c.t.Fields or nil, it is passed through. This allows for flexible use of
127
+ # the method at multiple layers in the DSL.
128
+ #
129
+ # Example:
130
+ # cascading_fields = fields(['first', 'second', 'third'])
131
+ # # cascading_fields.to_a == ['first', 'second', 'third']
65
132
  def fields(fields)
66
133
  if fields.nil?
67
134
  return nil
@@ -76,27 +143,45 @@ module Cascading
76
143
  return Java::CascadingTuple::Fields.new([fields].flatten.map{ |f| f.kind_of?(Fixnum) ? java.lang.Integer.new(f) : f }.to_java(java.lang.Comparable))
77
144
  end
78
145
 
146
+ # Convenience method wrapping c.t.Fields::ALL
79
147
  def all_fields
80
148
  Java::CascadingTuple::Fields::ALL
81
149
  end
82
150
 
83
- def union_fields(*fields)
84
- fields(fields.inject([]){ |acc, arr| acc | arr.to_a })
85
- end
86
-
87
- def difference_fields(*fields)
88
- fields(fields[1..-1].inject(fields.first.to_a){ |acc, arr| acc - arr.to_a })
151
+ # Convenience method wrapping c.t.Fields::VALUES
152
+ def last_grouping_fields
153
+ Java::CascadingTuple::Fields::VALUES
89
154
  end
90
155
 
91
- def copy_fields(fields)
92
- fields.select(all_fields)
156
+ # Computes fields formed by removing remove_fields from base_fields. Operates
157
+ # only on named fields, not positional fields.
158
+ #
159
+ # Example:
160
+ # base_fields = fields(['a', 'b', 'c'])
161
+ # remove_fields = fields(['b'])
162
+ # result_fields = difference_fields(base_fields, remove_fields)
163
+ # # result_fields.to_a == ['a', 'c']
164
+ def difference_fields(base_fields, remove_fields)
165
+ fields(base_fields.to_a - remove_fields.to_a)
93
166
  end
94
167
 
168
+ # Combines fields deduplicating them with trailing underscores as necessary.
169
+ # This is used in joins to avoid requiring the caller to make fields unique before
170
+ # they are joined.
95
171
  def dedup_fields(*fields)
96
172
  raise 'Can only be applied to declarators' unless fields.all?{ |f| f.is_declarator? }
97
173
  fields(dedup_field_names(*fields.map{ |f| f.to_a }))
98
174
  end
99
175
 
176
+ # Helper used by dedup_fields that operates on arrays of field names rather
177
+ # than fields objects.
178
+ #
179
+ # Example:
180
+ # left_names = ['a', 'b']
181
+ # mid_names = ['a', 'c']
182
+ # right_names = ['a', 'd']
183
+ # deduped_names = dedup_field_names(left_names, mid_names, right_names)
184
+ # # deduped_names == ['a', 'b', 'a_', 'c', 'a__', 'd']
100
185
  def dedup_field_names(*names)
101
186
  names.inject([]) do |acc, arr|
102
187
  acc + arr.map{ |e| search_field_name(acc, e) }
@@ -106,30 +191,22 @@ module Cascading
106
191
  def search_field_name(names, candidate)
107
192
  names.include?(candidate) ? search_field_name(names, "#{candidate}_") : candidate
108
193
  end
109
-
110
- def last_grouping_fields
111
- Java::CascadingTuple::Fields::VALUES
112
- end
113
-
114
- def results_fields
115
- Java::CascadingTuple::Fields::RESULTS
116
- end
194
+ private :search_field_name
117
195
 
118
196
  # Creates a TextLine scheme (can be used in both Cascading local and hadoop
119
- # modes). Positional args are used if <tt>:source_fields</tt> is not
120
- # provided.
197
+ # modes). Positional args are used if :source_fields is not provided.
121
198
  #
122
199
  # The named options are:
123
- # * <tt>:source_fields</tt> a string or array of strings. Specifies the
124
- # fields to be read from a source with this scheme. Defaults to ['offset', 'line'].
125
- # * <tt>:sink_fields</tt> a string or array of strings. Specifies the fields
126
- # to be written to a sink with this scheme. Defaults to all_fields.
127
- # * <tt>:compression</tt> a symbol, either <tt>:enable</tt> or
128
- # <tt>:disable</tt>, that governs the TextLine scheme's compression. Defaults
129
- # to the default TextLine compression (only applies to c.s.h.TextLine).
130
- def text_line_scheme(*args)
131
- options = args.extract_options!
132
- source_fields = fields(options[:source_fields] || (args.empty? ? ['offset', 'line'] : args))
200
+ # [source_fields] Fields to be read from a source with this scheme. Defaults
201
+ # to ['offset', 'line'].
202
+ # [sink_fields] Fields to be written to a sink with this scheme. Defaults to
203
+ # all_fields.
204
+ # [compression] A symbol, either :enable or :disable, that
205
+ # governs the TextLine scheme's compression. Defaults to the
206
+ # default TextLine compression (only applies to c.s.h.TextLine).
207
+ def text_line_scheme(*args_with_options)
208
+ options, source_fields = args_with_options.extract_options!, args_with_options
209
+ source_fields = fields(options[:source_fields] || (source_fields.empty? ? ['offset', 'line'] : source_fields))
133
210
  sink_fields = fields(options[:sink_fields]) || all_fields
134
211
  sink_compression = case options[:compression]
135
212
  when :enable then Java::CascadingSchemeHadoop::TextLine::Compress::ENABLE
@@ -153,17 +230,30 @@ module Cascading
153
230
  }
154
231
  end
155
232
 
233
+ # Convenience access to MultiTap.multi_source_tap. This constructor is more
234
+ # "DSL-like" because it allows you to pass taps directly as actual args rather
235
+ # than in an array:
236
+ # multi_source_tap tap1, tap2, tap3, ..., tapn
237
+ #
238
+ # See MultiTap.multi_source_tap for more details.
156
239
  def multi_source_tap(*taps)
157
240
  MultiTap.multi_source_tap(taps)
158
241
  end
159
242
 
243
+ # Convenience access to MultiTap.multi_sink_tap. This constructor is more
244
+ # "DSL-like" because it allows you to pass taps directly as actual args rather
245
+ # than in an array:
246
+ # multi_sink_tap tap1, tap2, tap3, ..., tapn
247
+ #
248
+ # See MultiTap.multi_sink_tap for more details.
160
249
  def multi_sink_tap(*taps)
161
250
  MultiTap.multi_sink_tap(taps)
162
251
  end
163
252
 
164
- # Creates a Cascading::Tap given a path and optional :scheme and :sink_mode.
165
- def tap(path, params = {})
166
- Tap.new(path, params)
253
+ # Convenience constructor for a Tap, that accepts the same options as that
254
+ # class' constructor. See Tap for more details.
255
+ def tap(path, options = {})
256
+ Tap.new(path, options)
167
257
  end
168
258
 
169
259
  # Constructs properties to be passed to Flow#complete or Cascade#complete
@@ -3,15 +3,15 @@ module Cascading
3
3
  attr_accessor :expression, :types, :input_expression
4
4
 
5
5
  # ExprStub requires a Janino expression decorated with field types. For
6
- # example: '"Found: " + (x:int + y:int) + " " + z:string'. Type names are
7
- # defined in Cascading::JAVA_TYPE_MAP.
6
+ # example:
7
+ # expr('"Found: " + (x:int + y:int) + " " + z:string')
8
+ # Type names are defined in Cascading::JAVA_TYPE_MAP.
8
9
  def initialize(expression)
9
10
  @input_expression = expression
10
11
  @expression = expression.dup
11
12
  @types = {}
12
13
 
13
14
  # Simple regexp based parser for types
14
-
15
15
  JAVA_TYPE_MAP.each do |sym, klass|
16
16
  @expression.gsub!(/[A-Za-z0-9_]+:#{sym.to_s}/) do |match|
17
17
  name = match.split(/:/).first.gsub(/\s+/, "")
@@ -21,21 +21,38 @@ module Cascading
21
21
  end
22
22
  end
23
23
 
24
+ # Extract Java names and types from @types hash. Cascading constructors
25
+ # often require two separate Java Arrays in this fashion.
26
+ def names_and_types
27
+ names, types = split_hash(@types)
28
+ [names.to_java(java.lang.String), types.to_java(java.lang.Class)]
29
+ end
30
+
31
+ # Prints the original input expression.
24
32
  def to_s
25
33
  @input_expression
26
34
  end
27
35
 
28
36
  # Convenience constructor for an ExprStub that optionally performs
29
37
  # validation. Takes a string to use as a Janino expression and an optional
30
- # params hash. By default, the param :validate is set to true (performs
31
- # expression validation using default actual argument values) and the param
32
- # :validate_with is set to {} (which doesn't override any of the default
33
- # actual argument values used for validation).
34
- def self.expr(expression, params = {})
35
- params = { :validate => true, :validate_with => {} }.merge(params)
38
+ # options hash.
39
+ #
40
+ # The named options are:
41
+ # [validate] A boolean indicating whether expression validation using
42
+ # default actual argument values should be performed. Defaults
43
+ # to true.
44
+ # [validate_with] A hash mapping field names (or symbols) to the value that
45
+ # should be used for validation. Strings default to nil,
46
+ # so if you have previously filtered nulls you might use a
47
+ # marker value like 'nulls_filtered'. Defaults to {}.
48
+ #
49
+ # Example:
50
+ # insert 'x_eq_y' => expr('x:string.equals(y:string)', :validate_with => { :x => 'nulls_filtered' })
51
+ def self.expr(expression, options = {})
52
+ options = { :validate => true, :validate_with => {} }.merge(options)
36
53
  expr_stub = expression.kind_of?(ExprStub) ? expression : ExprStub.new(expression).compile
37
- expr_stub.validate(params[:validate_with]) if params[:validate]
38
- puts "Expression validation is disabled for '#{expression}'" unless params[:validate]
54
+ expr_stub.validate(options[:validate_with]) if options[:validate]
55
+ puts "Expression validation is disabled for '#{expression}'" unless options[:validate]
39
56
  expr_stub
40
57
  end
41
58
 
@@ -68,6 +85,9 @@ module Cascading
68
85
  self.eval(test_values.merge(actual_args))
69
86
  end
70
87
 
88
+ # Given a scope, validates that the fields required by this ExprStub are
89
+ # available in the values fields of the scope. Returns those values fields
90
+ # which are unused in the expression.
71
91
  def validate_scope(scope)
72
92
  validate_fields(scope.values_fields.to_a)
73
93
  end
@@ -113,12 +133,6 @@ module Cascading
113
133
  end
114
134
  end
115
135
 
116
- # Extract Java names and types from @types hash
117
- def names_and_types
118
- names, types = split_hash(@types)
119
- [names.to_java(java.lang.String), types.to_java(java.lang.Class)]
120
- end
121
-
122
136
  # Makes best effort to convert Ruby numbers into the Java numeric type
123
137
  # exepcted by a Janino expression. However, if the conversion fails, it
124
138
  # returns the original value so that the exception thrown will be from