cascading.jruby 0.0.10 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,22 @@
1
1
  module Cascading
2
+ # A Node is a Cascade, Flow, or Assembly, all of which are composite
3
+ # structures that describe the hierarchical structure of your job. A Cascade
4
+ # may contain many Flows and a Flow and Assembly may contain many Assemblies
5
+ # (branches in the case of the Assembly). Nodes are named, contain parent
6
+ # and child pointers, and keep track of their children both by name and by
7
+ # insertion order.
8
+ #
9
+ # Nodes must be uniquely named within the scope of their parent so that they
10
+ # can be unambiguously looked up for connecting pipes within a flow. However, we
11
+ # only ensure that children are uniquely named upon insertion; full
12
+ # uniqueness isn't required until Node#find_child is called (this allows for
13
+ # name reuse in a few limited circumstances that was important when migrating
14
+ # the Etsy workload to enforce these constraints).
2
15
  class Node
3
16
  attr_accessor :name, :parent, :children, :child_names, :last_child
4
17
 
18
+ # A Node requires a name and a parent when it is constructed. Children are
19
+ # added later with Node#add_child.
5
20
  def initialize(name, parent)
6
21
  @name = name
7
22
  @parent = parent
@@ -23,10 +38,15 @@ module Cascading
23
38
  node
24
39
  end
25
40
 
41
+ # The qualified name of a node is formed from the name of all nodes in the
42
+ # path from the root to that node.
26
43
  def qualified_name
27
44
  parent ? "#{parent.qualified_name}.#{name}" : name
28
45
  end
29
46
 
47
+ # Produces a textual description of this Node. This method is overridden
48
+ # by all classes inheriting Node, so it serves mainly as a template for
49
+ # describing a node with children.
30
50
  def describe(offset = '')
31
51
  "#{offset}#{name}:node\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
32
52
  end
@@ -44,6 +64,8 @@ module Cascading
44
64
  all_children_with_name.first
45
65
  end
46
66
 
67
+ # Returns the root Node, the topmost parent of the hierarchy (typically a
68
+ # Cascade or Flow).
47
69
  def root
48
70
  return self unless parent
49
71
  parent.root
@@ -2,6 +2,13 @@ require 'cascading/base'
2
2
  require 'yaml'
3
3
 
4
4
  module Cascading
5
+ # A Cascade wraps a c.c.Cascade. A Cascade is composed of Flows, which are
6
+ # constructed using the Cascade#flow method within the block passed to the
7
+ # Cascading::cascade constructor. Many flows may be nested within a Cascade.
8
+ #
9
+ # Note that you are not required to use a Cascade to wrap your job. Instead,
10
+ # you could start with a top-level Flow, which you might prefer if you have
11
+ # no need of a c.c.Cascade's make-like semantics wrt sinks.
5
12
  class Cascade < Cascading::Node
6
13
  extend Registerable
7
14
 
@@ -10,46 +17,72 @@ module Cascading
10
17
  # Do not use this constructor directly; instead, use Cascading::cascade to
11
18
  # build cascades.
12
19
  #
13
- # Builds a cascade given the specified name. Optionally accepts
14
- # :properties which will be used as the default properties for all child
15
- # flows. Properties must be a Ruby Hash with string keys and values and
16
- # will be copied before being passed into each flow in the cascade. See
17
- # Cascading::Flow#initialize for details on how flows handle properties.
18
- # Optionally accepts a :mode which will be used as the default mode for all
19
- # child flows. See Cascading::Mode.parse for details.
20
- def initialize(name, params = {})
21
- @properties = params[:properties] || {}
22
- @mode = params[:mode]
20
+ # Builds a Cascade given a name.
21
+ #
22
+ # The named options are:
23
+ # [properties] Properties hash which will be used as the default properties
24
+ # for all child flows. Properties must be a Ruby Hash with
25
+ # string keys and values and will be copied before being
26
+ # passed into each flow in the cascade. See Flow#initialize
27
+ # for details on how flows handle properties.
28
+ # [mode] Mode which will be used as the default mode for all child flows.
29
+ # See Mode.parse for details.
30
+ def initialize(name, options = {})
31
+ @properties = options[:properties] || {}
32
+ @mode = options[:mode]
23
33
  super(name, nil) # A Cascade cannot have a parent
24
34
  self.class.add(name, self)
25
35
  end
26
36
 
27
- # Builds a child flow given a name and block. Optionally accepts
28
- # :properties which will override the default properties stroed in this
29
- # cascade. Optionally accepts a :mode, which will override the default
30
- # mode stored in this cascade.
31
- def flow(name, params = {}, &block)
37
+ # Builds a child Flow in this Cascade given a name and block.
38
+ #
39
+ # The named options are:
40
+ # [properties] Properties hash which will override the default properties
41
+ # stored in this cascade.
42
+ # [mode] Mode which will override the default mode stored in this cascade.
43
+ #
44
+ # Example:
45
+ # cascade 'wordcount', :mode => :local do
46
+ # flow 'first_step' do
47
+ # ...
48
+ # end
49
+ #
50
+ # flow 'second_step' do
51
+ # ...
52
+ # end
53
+ # end
54
+ def flow(name, options = {}, &block)
32
55
  raise "Could not build flow '#{name}'; block required" unless block_given?
33
56
 
34
- params[:properties] ||= properties.dup
35
- params[:mode] ||= mode
57
+ options[:properties] ||= properties.dup
58
+ options[:mode] ||= mode
36
59
 
37
- flow = Flow.new(name, self, params)
60
+ flow = Flow.new(name, self, options)
38
61
  add_child(flow)
39
62
  flow.instance_eval(&block)
40
63
  flow
41
64
  end
42
65
 
66
+ # Produces a textual description of this Cascade. The description details
67
+ # the structure of the Cascade, the sources and sinks of each Flow, and the
68
+ # input and output fields of each Assembly. The offset parameter allows
69
+ # for this describe to be nested within a calling context, which lets us
70
+ # indent the structural hierarchy of a job.
43
71
  def describe(offset = '')
44
72
  "#{offset}#{name}:cascade\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
45
73
  end
46
74
 
75
+ # Writes out the DOT file describing the structure of this Cascade.
76
+ #
77
+ # NOTE: will be moved to Job in a later version and will also be present on Flow
47
78
  def draw(dir)
48
79
  @children.each do |name, flow|
49
80
  flow.connect.writeDOT("#{dir}/#{name}.dot")
50
81
  end
51
82
  end
52
83
 
84
+ # Builds a map, keyed by flow name, of the sink metadata for each child
85
+ # flow. Currently, this contains only the field names of each sink.
53
86
  def sink_metadata
54
87
  @children.inject({}) do |sink_fields, (name, flow)|
55
88
  sink_fields[name] = flow.sink_metadata
@@ -57,12 +90,16 @@ module Cascading
57
90
  end
58
91
  end
59
92
 
93
+ # Writes the mapping produced by Cascade#sink_metadata to a file at the
94
+ # given path in YAML.
60
95
  def write_sink_metadata(file_name)
61
96
  File.open(file_name, 'w') do |file|
62
97
  YAML.dump(sink_metadata, file)
63
98
  end
64
99
  end
65
100
 
101
+ # Connects this Cascade, producing a c.c.Cascade, which is then completed,
102
+ # executing it. Child flows are connected, so no parameters are required.
66
103
  def complete
67
104
  begin
68
105
  Java::CascadingCascade::CascadeConnector.new.connect(name, make_flows(@children)).complete
@@ -1,6 +1,33 @@
1
+ require 'cascading/cascade'
2
+ require 'cascading/flow'
1
3
  require 'cascading/expr_stub'
2
4
 
5
+ # The Cascading module contains all of the cascading.jruby DSL. Inserting the
6
+ # following into your script:
7
+ # require 'rubygems'
8
+ # require 'cascading'
9
+ # includes this module at the top level, making all of its features available.
10
+ #
11
+ # To build a dataflow like the one in the README.md or
12
+ # {samples}[http://github.com/mrwalker/cascading.jruby/tree/master/samples],
13
+ # start by looking at Cascade or Flow. These are the
14
+ # highest level structures you'll use to put together your job.
15
+ #
16
+ # Within a flow, you'll connect sources to sinks by way of Assembly, which
17
+ # refers to "pipe assemblies" from Cascading. Within an Assembly, you'll use
18
+ # functions and filters (see Operations, IdentityOperations, RegexOperations,
19
+ # FilterOperations, and TextOperations) as well as Assembly#group_by,
20
+ # Assembly#union, and Assembly#join. You can provide those last pipes with a
21
+ # block that can select operations from Aggregations.
22
+ #
23
+ # Finally, you'll want to address the execution of your job, whether it be
24
+ # locally testing or running remotely on a Hadoop cluster. See the Mode class
25
+ # for the available modes, and parameterize your script such that it can operate
26
+ # in Cascading local mode locally and in Hadoop mode when run in a jar produced
27
+ # with {Jading}[http://github.com/mrwalker/jading].
3
28
  module Cascading
29
+ # Mapping that defines a convenient syntax for specifying Java classes, used
30
+ # in Janino expressions and elsewhere.
4
31
  JAVA_TYPE_MAP = {
5
32
  :int => java.lang.Integer.java_class, :long => java.lang.Long.java_class,
6
33
  :bool => java.lang.Boolean.java_class, :double => java.lang.Double.java_class,
@@ -24,44 +51,84 @@ module Cascading
24
51
  # directly building their own cascades and flows so that jading can send them
25
52
  # default properties.
26
53
 
27
- # Builds a top-level cascade given a name and a block. Optionally accepts a
28
- # :mode, as explained in Cascading::Cascade#initialize.
29
- def cascade(name, params = {}, &block)
54
+ # Builds a top-level Cascade given a name and a block.
55
+ #
56
+ # The named options are:
57
+ # [properties] See Cascade#initialize
58
+ # [mode] See Cascade#initialize
59
+ #
60
+ # Example:
61
+ # cascade 'wordcount', :mode => :local do
62
+ # flow 'first_step' do
63
+ # ...
64
+ # end
65
+ #
66
+ # flow 'second_step' do
67
+ # ...
68
+ # end
69
+ # end
70
+ def cascade(name, options = {}, &block)
30
71
  raise "Could not build cascade '#{name}'; block required" unless block_given?
31
- raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if params[:properties]
72
+ raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if options[:properties]
32
73
 
33
- params[:properties] = $jobconf_properties.dup if $jobconf_properties
74
+ options[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties
34
75
 
35
- cascade = Cascade.new(name, params)
76
+ cascade = Cascade.new(name, options)
36
77
  cascade.instance_eval(&block)
37
78
  cascade
38
79
  end
39
80
 
40
- # Builds a top-level flow given a name and block for applications built of
41
- # flows with no cascades. Optionally accepts a :mode, as explained in
42
- # Cascading::Flow#initialize.
43
- def flow(name, params = {}, &block)
81
+ # Builds a top-level Flow given a name and block for applications built of
82
+ # flows with no cascades.
83
+ #
84
+ # The named options are:
85
+ # [properties] See Flow#initialize
86
+ # [mode] See Flow#initialize
87
+ #
88
+ # Example:
89
+ # flow 'wordcount', :mode => :local do
90
+ # assembly 'first_step' do
91
+ # ...
92
+ # end
93
+ #
94
+ # assembly 'second_step' do
95
+ # ...
96
+ # end
97
+ # end
98
+ def flow(name, options = {}, &block)
44
99
  raise "Could not build flow '#{name}'; block required" unless block_given?
45
- raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if params[:properties]
100
+ raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if options[:properties]
46
101
 
47
- params[:properties] = $jobconf_properties.dup if $jobconf_properties
102
+ options[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties
48
103
 
49
- flow = Flow.new(name, nil, params)
104
+ flow = Flow.new(name, nil, options)
50
105
  flow.instance_eval(&block)
51
106
  flow
52
107
  end
53
108
 
109
+ # Produces a textual description of all Cascades in the global registry. The
110
+ # description details the structure of the Cascades, the sources and sinks of
111
+ # each Flow, and the input and output fields of each Assembly.
112
+ #
113
+ # NOTE: will be moved to Job in later version
54
114
  def describe
55
115
  Cascade.all.map{ |cascade| cascade.describe }.join("\n")
56
116
  end
57
117
  alias desc describe
58
118
 
59
119
  # See ExprStub.expr
60
- def expr(expression, params = {})
61
- ExprStub.expr(expression, params)
120
+ def expr(expression, options = {})
121
+ ExprStub.expr(expression, options)
62
122
  end
63
123
 
64
- # Creates a cascading.tuple.Fields instance from a string or an array of strings.
124
+ # Utility method for creating Cascading c.t.Fields from a field name (string)
125
+ # or list of field names (array of strings). If the input fields is already a
126
+ # c.t.Fields or nil, it is passed through. This allows for flexible use of
127
+ # the method at multiple layers in the DSL.
128
+ #
129
+ # Example:
130
+ # cascading_fields = fields(['first', 'second', 'third'])
131
+ # # cascading_fields.to_a == ['first', 'second', 'third']
65
132
  def fields(fields)
66
133
  if fields.nil?
67
134
  return nil
@@ -76,27 +143,45 @@ module Cascading
76
143
  return Java::CascadingTuple::Fields.new([fields].flatten.map{ |f| f.kind_of?(Fixnum) ? java.lang.Integer.new(f) : f }.to_java(java.lang.Comparable))
77
144
  end
78
145
 
146
+ # Convenience method wrapping c.t.Fields::ALL
79
147
  def all_fields
80
148
  Java::CascadingTuple::Fields::ALL
81
149
  end
82
150
 
83
- def union_fields(*fields)
84
- fields(fields.inject([]){ |acc, arr| acc | arr.to_a })
85
- end
86
-
87
- def difference_fields(*fields)
88
- fields(fields[1..-1].inject(fields.first.to_a){ |acc, arr| acc - arr.to_a })
151
+ # Convenience method wrapping c.t.Fields::VALUES
152
+ def last_grouping_fields
153
+ Java::CascadingTuple::Fields::VALUES
89
154
  end
90
155
 
91
- def copy_fields(fields)
92
- fields.select(all_fields)
156
+ # Computes fields formed by removing remove_fields from base_fields. Operates
157
+ # only on named fields, not positional fields.
158
+ #
159
+ # Example:
160
+ # base_fields = fields(['a', 'b', 'c'])
161
+ # remove_fields = fields(['b'])
162
+ # result_fields = difference_fields(base_fields, remove_fields)
163
+ # # result_fields.to_a == ['a', 'c']
164
+ def difference_fields(base_fields, remove_fields)
165
+ fields(base_fields.to_a - remove_fields.to_a)
93
166
  end
94
167
 
168
+ # Combines fields deduplicating them with trailing underscores as necessary.
169
+ # This is used in joins to avoid requiring the caller to unique fields before
170
+ # they are joined.
95
171
  def dedup_fields(*fields)
96
172
  raise 'Can only be applied to declarators' unless fields.all?{ |f| f.is_declarator? }
97
173
  fields(dedup_field_names(*fields.map{ |f| f.to_a }))
98
174
  end
99
175
 
176
+ # Helper used by dedup_fields that operates on arrays of field names rather
177
+ # than fields objects.
178
+ #
179
+ # Example:
180
+ # left_names = ['a', 'b']
181
+ # mid_names = ['a', 'c']
182
+ # right_names = ['a', 'd']
183
+ # deduped_names = dedup_field_names(left_names, mid_names, right_names)
184
+ # # deduped_names == ['a', 'b', 'a_', 'c', 'a__', 'd']
100
185
  def dedup_field_names(*names)
101
186
  names.inject([]) do |acc, arr|
102
187
  acc + arr.map{ |e| search_field_name(acc, e) }
@@ -106,30 +191,22 @@ module Cascading
106
191
  def search_field_name(names, candidate)
107
192
  names.include?(candidate) ? search_field_name(names, "#{candidate}_") : candidate
108
193
  end
109
-
110
- def last_grouping_fields
111
- Java::CascadingTuple::Fields::VALUES
112
- end
113
-
114
- def results_fields
115
- Java::CascadingTuple::Fields::RESULTS
116
- end
194
+ private :search_field_name
117
195
 
118
196
  # Creates a TextLine scheme (can be used in both Cascading local and hadoop
119
- # modes). Positional args are used if <tt>:source_fields</tt> is not
120
- # provided.
197
+ # modes). Positional args are used if :source_fields is not provided.
121
198
  #
122
199
  # The named options are:
123
- # * <tt>:source_fields</tt> a string or array of strings. Specifies the
124
- # fields to be read from a source with this scheme. Defaults to ['offset', 'line'].
125
- # * <tt>:sink_fields</tt> a string or array of strings. Specifies the fields
126
- # to be written to a sink with this scheme. Defaults to all_fields.
127
- # * <tt>:compression</tt> a symbol, either <tt>:enable</tt> or
128
- # <tt>:disable</tt>, that governs the TextLine scheme's compression. Defaults
129
- # to the default TextLine compression (only applies to c.s.h.TextLine).
130
- def text_line_scheme(*args)
131
- options = args.extract_options!
132
- source_fields = fields(options[:source_fields] || (args.empty? ? ['offset', 'line'] : args))
200
+ # [source_fields] Fields to be read from a source with this scheme. Defaults
201
+ # to ['offset', 'line'].
202
+ # [sink_fields] Fields to be written to a sink with this scheme. Defaults to
203
+ # all_fields.
204
+ # [compression] A symbol, either :enable or :disable, that
205
+ # governs the TextLine scheme's compression. Defaults to the
206
+ # default TextLine compression (only applies to c.s.h.TextLine).
207
+ def text_line_scheme(*args_with_options)
208
+ options, source_fields = args_with_options.extract_options!, args_with_options
209
+ source_fields = fields(options[:source_fields] || (source_fields.empty? ? ['offset', 'line'] : source_fields))
133
210
  sink_fields = fields(options[:sink_fields]) || all_fields
134
211
  sink_compression = case options[:compression]
135
212
  when :enable then Java::CascadingSchemeHadoop::TextLine::Compress::ENABLE
@@ -153,17 +230,30 @@ module Cascading
153
230
  }
154
231
  end
155
232
 
233
+ # Convenience access to MultiTap.multi_source_tap. This constructor is more
234
+ # "DSL-like" because it allows you to pass taps directly as actual args rather
235
+ # than in an array:
236
+ # multi_source_tap tap1, tap2, tap3, ..., tapn
237
+ #
238
+ # See MultiTap.multi_source_tap for more details.
156
239
  def multi_source_tap(*taps)
157
240
  MultiTap.multi_source_tap(taps)
158
241
  end
159
242
 
243
+ # Convenience access to MultiTap.multi_sink_tap. This constructor is more
244
+ # "DSL-like" because it allows you to pass taps directly as actual args rather
245
+ # than in an array:
246
+ # multi_sink_tap tap1, tap2, tap3, ..., tapn
247
+ #
248
+ # See MultiTap.multi_sink_tap for more details.
160
249
  def multi_sink_tap(*taps)
161
250
  MultiTap.multi_sink_tap(taps)
162
251
  end
163
252
 
164
- # Creates a Cascading::Tap given a path and optional :scheme and :sink_mode.
165
- def tap(path, params = {})
166
- Tap.new(path, params)
253
+ # Convenience constructor for a Tap, that accepts the same options as that
254
+ # class' constructor. See Tap for more details.
255
+ def tap(path, options = {})
256
+ Tap.new(path, options)
167
257
  end
168
258
 
169
259
  # Constructs properties to be passed to Flow#complete or Cascade#complete
@@ -3,15 +3,15 @@ module Cascading
3
3
  attr_accessor :expression, :types, :input_expression
4
4
 
5
5
  # ExprStub requires a Janino expression decorated with field types. For
6
- # example: '"Found: " + (x:int + y:int) + " " + z:string'. Type names are
7
- # defined in Cascading::JAVA_TYPE_MAP.
6
+ # example:
7
+ # expr('"Found: " + (x:int + y:int) + " " + z:string')
8
+ # Type names are defined in Cascading::JAVA_TYPE_MAP.
8
9
  def initialize(expression)
9
10
  @input_expression = expression
10
11
  @expression = expression.dup
11
12
  @types = {}
12
13
 
13
14
  # Simple regexp based parser for types
14
-
15
15
  JAVA_TYPE_MAP.each do |sym, klass|
16
16
  @expression.gsub!(/[A-Za-z0-9_]+:#{sym.to_s}/) do |match|
17
17
  name = match.split(/:/).first.gsub(/\s+/, "")
@@ -21,21 +21,38 @@ module Cascading
21
21
  end
22
22
  end
23
23
 
24
+ # Extract Java names and types from @types hash. Cascading constructors
25
+ # often require two separate Java Arrays in this fashion.
26
+ def names_and_types
27
+ names, types = split_hash(@types)
28
+ [names.to_java(java.lang.String), types.to_java(java.lang.Class)]
29
+ end
30
+
31
+ # Prints the original input expression.
24
32
  def to_s
25
33
  @input_expression
26
34
  end
27
35
 
28
36
  # Convenience constructor for an ExprStub that optionally performs
29
37
  # validation. Takes a string to use as a Janino expression and an optional
30
- # params hash. By default, the param :validate is set to true (performs
31
- # expression validation using default actual argument values) and the param
32
- # :validate_with is set to {} (which doesn't override any of the default
33
- # actual argument values used for validation).
34
- def self.expr(expression, params = {})
35
- params = { :validate => true, :validate_with => {} }.merge(params)
38
+ # options hash.
39
+ #
40
+ # The named options are:
41
+ # [validate] A boolean indicating whether expression validation using
42
+ # default actual argument values should be performed. Defaults
43
+ # to true.
44
+ # [validate_with] A hash mapping field names (or symbols) to the value that
45
+ # should be used for validation. Strings default to nil,
46
+ # so if you have previously filtered nulls you might use a
47
+ # marker value like 'nulls_filtered'. Defaults to {}.
48
+ #
49
+ # Example:
50
+ # insert 'x_eq_y' => expr('x:string.equals(y:string)', :validate_with => { :x => 'nulls_filtered' })
51
+ def self.expr(expression, options = {})
52
+ options = { :validate => true, :validate_with => {} }.merge(options)
36
53
  expr_stub = expression.kind_of?(ExprStub) ? expression : ExprStub.new(expression).compile
37
- expr_stub.validate(params[:validate_with]) if params[:validate]
38
- puts "Expression validation is disabled for '#{expression}'" unless params[:validate]
54
+ expr_stub.validate(options[:validate_with]) if options[:validate]
55
+ puts "Expression validation is disabled for '#{expression}'" unless options[:validate]
39
56
  expr_stub
40
57
  end
41
58
 
@@ -68,6 +85,9 @@ module Cascading
68
85
  self.eval(test_values.merge(actual_args))
69
86
  end
70
87
 
88
+ # Given a scope, validates that the fields required by this ExprStub are
89
+ # available in the values fields of the scope. Returns those values fields
90
+ # which are unused in the expression.
71
91
  def validate_scope(scope)
72
92
  validate_fields(scope.values_fields.to_a)
73
93
  end
@@ -113,12 +133,6 @@ module Cascading
113
133
  end
114
134
  end
115
135
 
116
- # Extract Java names and types from @types hash
117
- def names_and_types
118
- names, types = split_hash(@types)
119
- [names.to_java(java.lang.String), types.to_java(java.lang.Class)]
120
- end
121
-
122
136
  # Makes best effort to convert Ruby numbers into the Java numeric type
123
137
  # exepcted by a Janino expression. However, if the conversion fails, it
124
138
  # returns the original value so that the exception thrown will be from