cascading.jruby 0.0.10 → 1.0.0

@@ -1,8 +1,25 @@
+ # Extensions to Arrays in support of variable length lists of field names. This
+ # is not pretty, but supports DSL features like:
+ #   group_by 'field1', 'field2', :sort_by => 'field3' do
+ #     ...
+ #   end
+ #
+ # The most obvious limitation of the approach is that function definitions of
+ # the form f(*args_with_options) are not self-documenting. To compensate for
+ # this, documentation of all arguments and optional parameters must be provided
+ # on the DSL method.
  class Array
+   # Use this extension to extract the optional parameters from a
+   # *args_with_options argument. So if you have a function:
+   #   def f(*args_with_options)
+   # you can destructively process the args_with_options as follows:
+   #   options, just_args = args_with_options.extract_options!, args_with_options
    def extract_options!
      last.is_a?(::Hash) ? pop : {}
    end

+   # Non-destructive form of Array#extract_options!
    def extract_options
      last.is_a?(::Hash) ? last : {}
    end
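
To make the destructive extraction concrete, here is a minimal standalone sketch of how a *args_with_options method can use this extension (the group_by body below is hypothetical and only illustrates the calling convention):

    class Array
      def extract_options!
        last.is_a?(::Hash) ? pop : {}
      end
    end

    def group_by(*args_with_options)
      options, fields = args_with_options.extract_options!, args_with_options
      puts "group on #{fields.inspect}, sort by #{options[:sort_by].inspect}"
    end

    group_by 'field1', 'field2', :sort_by => 'field3'
    # prints: group on ["field1", "field2"], sort by "field3"
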
@@ -0,0 +1,101 @@
+ module Cascading
+   # Module of filtering operations. Unlike some of the other functional
+   # operations modules, this one does not just wrap operations defined by
+   # Cascading in cascading.operation.filter. Instead, it provides some useful
+   # high-level DSL pipes which map many Cascading operations into a smaller
+   # number of DSL statements.
+   #
+   # Still, some are direct wrappers:
+   # filter\_null:: {FilterNull}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/filter/FilterNull.html]
+   # filter\_not\_null:: {FilterNotNull}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/filter/FilterNotNull.html]
+   module FilterOperations
+     # Filters the current assembly based on an expression or regex, but not both.
+     #
+     # The named options are:
+     # [expression] A Janino expression used to filter. Has access to all :input
+     #              fields.
+     # [validate] Boolean passed to Cascading#expr to enable or disable
+     #            expression validation. Defaults to true.
+     # [validate_with] Hash mapping field names to actual arguments used by
+     #                 Cascading#expr for expression validation. Defaults to {}.
+     # [regex] A regular expression used to filter.
+     # [remove_match] Boolean indicating if regex matches should be removed or
+     #                kept. Defaults to false, which is a bit counterintuitive.
+     # [match_each_element] Boolean indicating if the regex should match the
+     #                      entire incoming tuple (joined with tabs) or each
+     #                      field individually. Defaults to false.
+     #
+     # Examples:
+     #   filter :input => 'field1', :regex => /\t/, :remove_match => true
+     #   filter :expression => 'field1:long > 0 && "".equals(field2:string)'
+     def filter(options = {})
+       input_fields = options[:input] || all_fields
+       expression = options[:expression]
+       regex = options[:regex]
+
+       if expression
+         validate = options.has_key?(:validate) ? options[:validate] : true
+         validate_with = options[:validate_with] || {}
+
+         stub = expr(expression, { :validate => validate, :validate_with => validate_with })
+         stub.validate_scope(scope)
+
+         names, types = stub.names_and_types
+         each input_fields, :filter => Java::CascadingOperationExpression::ExpressionFilter.new(
+           stub.expression,
+           names,
+           types
+         )
+       elsif regex
+         parameters = [regex.to_s, options[:remove_match], options[:match_each_element]].compact
+         each input_fields, :filter => Java::CascadingOperationRegex::RegexFilter.new(*parameters)
+       else
+         raise 'filter requires one of :expression or :regex'
+       end
+     end
+
+     # Rejects tuples from the current assembly based on a Janino expression.
+     # This is just a wrapper for FilterOperations.filter.
+     #
+     # Example:
+     #   reject 'field1:long > 0 && "".equals(field2:string)'
+     def reject(expression, options = {})
+       options[:expression] = expression
+       filter(options)
+     end
+
+     # Keeps tuples from the current assembly based on a Janino expression. This
+     # is a wrapper for FilterOperations.filter.
+     #
+     # Note that this is accomplished by inverting the given expression, and a
+     # best-effort attempt is made to support import statements prior to the
+     # expression. If this support should break, simply negate your expression
+     # and use FilterOperations.reject.
+     #
+     # Example:
+     #   where 'field1:long > 0 && "".equals(field2:string)'
+     def where(expression, options = {})
+       _, imports, expr = expression.match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
+       options[:expression] = "#{imports}!(#{expr})"
+       filter(options)
+     end
+
+     # Rejects tuples from the current assembly if any input field is null.
+     #
+     # Example:
+     #   filter_null 'field1', 'field2'
+     def filter_null(*input_fields)
+       each(input_fields, :filter => Java::CascadingOperationFilter::FilterNull.new)
+     end
+     alias reject_null filter_null
+
+     # Rejects tuples from the current assembly if any input field is not null.
+     #
+     # Example:
+     #   filter_not_null 'field1', 'field2'
+     def filter_not_null(*input_fields)
+       each(input_fields, :filter => Java::CascadingOperationFilter::FilterNotNull.new)
+     end
+     alias where_null filter_not_null
+   end
+ end
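
Since these methods operate on the current assembly, they are used inside an assembly block. A structural sketch, not runnable as-is: the flow, source, sink, and field names are illustrative and omitted taps would be required in practice.

    flow 'cleanup' do
      assembly 'events' do
        filter_null 'field1', 'field2'       # drop tuples where either field is null
        where 'field1:long > 0'              # keep tuples matching the expression
        reject '"".equals(field2:string)'    # drop tuples matching the expression
        filter :input => 'field1', :regex => /\t/, :remove_match => true
      end
    end
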
@@ -1,6 +1,10 @@
  require 'cascading/assembly'

  module Cascading
+   # A Flow wraps a c.f.Flow. A Flow is composed of Assemblies, which are
+   # constructed using the Flow#assembly method within the block passed to the
+   # Cascading::flow or Cascade#flow constructor. Many Assemblies may be nested
+   # within a Flow.
    class Flow < Cascading::Node
      extend Registerable

@@ -10,23 +14,46 @@ module Cascading
      # Do not use this constructor directly. Instead, use Cascading::flow to
      # build top-level flows and Cascade#flow to build flows within a Cascade.
      #
-     # Builds a flow given a name and a parent node (a cascade or nil).
-     # Optionally accepts :properties which allows external configuration of
-     # this flow. The flow will side-effect the properties during composition,
-     # then pass the modified properties along to the FlowConnector for
-     # execution. See Cascading::Cascade#initialize for details on how
-     # properties are propagated through cascades. Optionally accepts a :mode
-     # which will determine the execution mode of this flow. See
-     # Cascading::Mode.parse for details.
-     def initialize(name, parent, params = {})
+     # Builds a Flow given a name and a parent node (a Cascade or nil).
+     #
+     # The named options are:
+     # [properties] Properties hash which allows external configuration of this
+     #              flow. The flow will side-effect the properties during
+     #              composition, then pass the modified properties along to the
+     #              FlowConnector for execution. See Cascade#initialize for
+     #              details on how properties are propagated through cascades.
+     # [mode] Mode which will determine the execution mode of this flow. See
+     #        Mode.parse for details.
+     def initialize(name, parent, options = {})
        @sources, @sinks, @incoming_scopes, @outgoing_scopes, @listeners = {}, {}, {}, {}, []
-       @properties = params[:properties] || {}
-       @mode = Mode.parse(params[:mode])
+       @properties = options[:properties] || {}
+       @mode = Mode.parse(options[:mode])
        @flow_scope = Scope.flow_scope(name)
        super(name, parent)
        self.class.add(name, self)
      end

+     # Builds a child Assembly in this Flow given a name and block.
+     #
+     # An assembly's name is quite important as it will determine:
+     # * The sources from which it will read, if any
+     # * The name to be used in joins or unions downstream
+     # * The name to be used to sink the output of the assembly downstream
+     #
+     # Many assemblies may be built within a flow. The Assembly#branch method
+     # is used for creating nested assemblies and produces objects of the same
+     # type as this constructor.
+     #
+     # Example:
+     #   flow 'wordcount', :mode => :local do
+     #     assembly 'first_step' do
+     #       ...
+     #     end
+     #
+     #     assembly 'second_step' do
+     #       ...
+     #     end
+     #   end
      def assembly(name, &block)
        raise "Could not build assembly '#{name}'; block required" unless block_given?
        assembly = Assembly.new(name, self, @outgoing_scopes)
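
To tie the constructor options to the public entry point, here is a structural sketch of a top-level flow. The property name and value are illustrative, and sources, sinks, and operations are omitted, so this is not runnable on its own.

    Cascading::flow 'wordcount', :mode => :local, :properties => { 'mapred.reduce.tasks' => '1' } do
      assembly 'first_step' do
        # operations for the first step
      end

      assembly 'second_step' do
        # operations for the second step
      end
    end
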
@@ -49,6 +76,11 @@ module Cascading
        sinks[name] = tap
      end

+     # Produces a textual description of this Flow. The description details the
+     # structure of the Flow, its sources and sinks, and the input and output
+     # fields of each Assembly. The offset parameter allows for this describe
+     # to be nested within a calling context, which lets us indent the
+     # structural hierarchy of a job.
      def describe(offset = '')
        description = "#{offset}#{name}:flow\n"
        description += "#{sources.keys.map{ |source| "#{offset} #{source}:source :: #{incoming_scopes[source].values_fields.to_a.inspect}" }.join("\n")}\n"
@@ -57,18 +89,28 @@ module Cascading
        description
      end

+     # Accesses the outgoing scope of this Flow at the point at which it is
+     # called or, if a name is given, the outgoing scope of the named child.
+     # This is useful for grabbing the values_fields at any point in the
+     # construction of the Flow. See Scope for details.
      def scope(name = nil)
        raise 'Must specify name if no children have been defined yet' unless name || last_child
        name ||= last_child.name
        @outgoing_scopes[name]
      end

+     # Prints information about the scope of this Flow at the point at which it
+     # is called or, if a name is given, about the scope of the named child.
+     # This allows you to trace the propagation of field names through your job
+     # and is handy for debugging. See Scope for details.
      def debug_scope(name = nil)
        scope = scope(name)
        name ||= last_child.name
        puts "Scope for '#{name}':\n #{scope}"
      end

+     # Builds a map, keyed by sink name, of the sink metadata for each sink.
+     # Currently, this contains only the field names of each sink.
      def sink_metadata
        @sinks.keys.inject({}) do |sink_metadata, sink_name|
          raise "Cannot sink undefined assembly '#{sink_name}'" unless @outgoing_scopes[sink_name]
@@ -79,7 +121,16 @@ module Cascading
        end
      end

-     # TODO: support all codecs, support list of codecs
+     # Property modifier that sets the codec and type of the compression for all
+     # sinks in this flow. Currently only supports o.a.h.i.c.DefaultCodec and
+     # o.a.h.i.c.GzipCodec, and the NONE, RECORD, or BLOCK compression types
+     # defined in o.a.h.i.SequenceFile.
+     #
+     # codec may be a symbol like :default or :gzip and type may be a symbol
+     # like :none, :record, or :block.
+     #
+     # Example:
+     #   compress_output :default, :block
      def compress_output(codec, type)
        properties['mapred.output.compress'] = 'true'
        properties['mapred.output.compression.codec'] = case codec
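
For reference, a sketch of what this modifier boils down to. The first two property names are visible in the diff; the compression-type key and the codec class name are assumptions based on the standard Hadoop configuration keys and the classes named in the doc comment.

    compress_output :gzip, :block
    # roughly equivalent to:
    # properties['mapred.output.compress']          = 'true'
    # properties['mapred.output.compression.codec'] = 'org.apache.hadoop.io.compress.GzipCodec'
    # properties['mapred.output.compression.type']  = 'BLOCK'   # assumed key and value
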
@@ -95,22 +146,28 @@ module Cascading
        end
      end

+     # Sets the cascading.spill.list.threshold property in this flow's
+     # properties. See c.t.c.SpillableProps for details.
      def set_spill_threshold(threshold)
-       properties['cascading.cogroup.spill.threshold'] = threshold.to_s
+       properties['cascading.spill.list.threshold'] = threshold.to_s
      end

+     # Adds the given path to the mapred.cache.files list property.
      def add_file_to_distributed_cache(file)
        add_to_distributed_cache(file, "mapred.cache.files")
      end

+     # Adds the given path to the mapred.cache.archives list property.
      def add_archive_to_distributed_cache(file)
        add_to_distributed_cache(file, "mapred.cache.archives")
      end

+     # Appends a FlowListener to the list of listeners for this flow.
      def add_listener(listener)
        @listeners << listener
      end

+     # Handles locating a file cached from S3 on local disk. TODO: remove
      def emr_local_path_for_distributed_cache_file(file)
        # NOTE this needs to be *appended* to the property mapred.local.dir
        if file =~ /^s3n?:\/\//
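
A brief usage sketch of these property modifiers inside a flow; the paths and threshold value are illustrative.

    set_spill_threshold 100_000                                   # cascading.spill.list.threshold
    add_file_to_distributed_cache 's3n://my-bucket/lookup.tsv'    # appends to mapred.cache.files
    add_archive_to_distributed_cache 's3n://my-bucket/deps.zip'   # appends to mapred.cache.archives
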
@@ -122,16 +179,9 @@ module Cascading
        end
      end

-     def add_to_distributed_cache(file, property)
-       v = properties[property]
-
-       if v
-         properties[property] = [v.split(/,/), file].flatten.join(",")
-       else
-         properties[property] = file
-       end
-     end
-
+     # Connects this Flow, producing a c.f.Flow without completing it (the Flow
+     # is not executed). This method is used by Cascade to connect its child
+     # Flows. To connect and complete a Flow, see Flow#complete.
      def connect
        puts "Connecting flow '#{name}' with properties:"
        properties.keys.sort.each do |key|
@@ -141,6 +191,7 @@ module Cascading
        # FIXME: why do I have to do this in 2.0 wip-255?
        Java::CascadingProperty::AppProps.setApplicationName(properties, name)
        Java::CascadingProperty::AppProps.setApplicationVersion(properties, '0.0.0')
+       Java::CascadingProperty::AppProps.setApplicationJarClass(properties, Java::CascadingFlow::Flow.java_class)

        sources = make_tap_parameter(@sources, :head_pipe)
        sinks = make_tap_parameter(@sinks, :tail_pipe)
@@ -148,6 +199,9 @@ module Cascading
        mode.connect_flow(properties, name, sources, sinks, pipes)
      end

+     # Completes this Flow after connecting it. This results in execution of
+     # the c.f.Flow built from this Flow. Use this method when executing a
+     # top-level Flow.
      def complete
        begin
          flow = connect
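
A sketch of the difference between the two entry points, assuming the Cascading::flow DSL call returns the Flow it builds (that return value is an assumption here; assemblies, sources, and sinks are omitted).

    my_flow = Cascading::flow 'wordcount', :mode => :local do
      # assemblies, sources, and sinks go here
    end

    cascading_flow = my_flow.connect   # builds the underlying c.f.Flow but does not run it
    my_flow.complete                   # connects and then executes the flow
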
@@ -160,6 +214,16 @@ module Cascading

      private

+     def add_to_distributed_cache(file, property)
+       v = properties[property]
+
+       if v
+         properties[property] = [v.split(/,/), file].flatten.join(",")
+       else
+         properties[property] = file
+       end
+     end
+
      def make_tap_parameter(taps, pipe_accessor)
        taps.inject({}) do |map, (name, tap)|
          assembly = find_child(name)
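
The now-private helper simply accumulates a comma-separated list. A sketch of the resulting property values, via the public wrapper (paths are illustrative):

    add_file_to_distributed_cache 's3n://bucket/a.tsv'
    # properties['mapred.cache.files'] == 's3n://bucket/a.tsv'
    add_file_to_distributed_cache 's3n://bucket/b.tsv'
    # properties['mapred.cache.files'] == 's3n://bucket/a.tsv,s3n://bucket/b.tsv'
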
@@ -0,0 +1,82 @@
+ module Cascading
+   # Module of pipe assemblies that wrap the Cascading Identity operation. These
+   # are split out only to group similar functionality.
+   module IdentityOperations
+     # Restricts the current assembly to the specified fields in the order in
+     # which they are specified (can be used to reorder fields).
+     #
+     # Example:
+     #   project 'field1', 'field2'
+     def project(*input_fields)
+       each fields(input_fields), :function => Java::CascadingOperation::Identity.new
+     end
+
+     # Removes the specified fields from the current assembly.
+     #
+     # Example:
+     #   discard 'field1', 'field2'
+     def discard(*input_fields)
+       discard_fields = fields(input_fields)
+       keep_fields = difference_fields(scope.values_fields, discard_fields)
+       project(*keep_fields.to_a)
+     end
+
+     # Renames fields according to the mapping provided, preserving the original
+     # field order. Throws an exception if non-existent fields are specified.
+     #
+     # Example:
+     #   rename 'field1' => 'fieldA', 'field2' => 'fieldB'
+     #
+     # Produces: ['fieldA', 'fieldB'], assuming those were the only 2 input
+     # fields.
+     def rename(name_map)
+       original_fields = scope.values_fields.to_a
+       invalid = name_map.keys - original_fields
+       raise "Invalid field names in rename: #{invalid.inspect}" unless invalid.empty?
+
+       renamed_fields = original_fields.map{ |name| name_map[name] || name }
+
+       each original_fields, :function => Java::CascadingOperation::Identity.new(fields(renamed_fields))
+     end
+
+     # Coerces fields to the Java type selected from Cascading::JAVA_TYPE_MAP.
+     #
+     # Example:
+     #   cast 'field1' => :int, 'field2' => :double
+     def cast(type_map)
+       input_fields = type_map.keys.sort
+       types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*input_fields))
+       input_fields = fields(input_fields)
+       types = types.to_java(java.lang.Class)
+       each input_fields, :function => Java::CascadingOperation::Identity.new(input_fields, types)
+     end
+
+     # A field copy (not a pipe copy). Renames fields according to name_map,
+     # appending them to the fields in the assembly in the same order as the
+     # original fields from which they are copied. Throws an exception if
+     # non-existent fields are specified.
+     #
+     # Example:
+     #   copy 'field1' => 'fieldA', 'field2' => 'fieldB'
+     #
+     # Produces: ['field1', 'field2', 'fieldA', 'fieldB'], assuming those were
+     # the only input fields.
+     def copy(name_map)
+       original_fields = scope.values_fields.to_a
+       invalid = name_map.keys - original_fields
+       raise "Invalid field names in copy: #{invalid.inspect}" unless invalid.empty?
+
+       # Original fields in name_map in their original order
+       input_fields = original_fields - (original_fields - name_map.keys)
+       into_fields = name_map.values_at(*input_fields)
+
+       each input_fields, :function => Java::CascadingOperation::Identity.new(fields(into_fields)), :output => all_fields
+     end
+
+     # A pipe copy (not a field copy). Can be used within a branch to copy a
+     # pipe.
+     def pass
+       each all_fields, :function => Java::CascadingOperation::Identity.new
+     end
+   end
+ end
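
Putting a few of these together inside an assembly, with the resulting field lists noted in comments; the assembly and field names are illustrative and each annotation follows from the doc comments above.

    assembly 'normalize' do
      # incoming fields: ['field1', 'field2', 'junk']
      discard 'junk'                          # ['field1', 'field2']
      rename 'field1' => 'id'                 # ['id', 'field2']
      copy 'field2' => 'field2_copy'          # ['id', 'field2', 'field2_copy']
      project 'id', 'field2_copy'             # ['id', 'field2_copy']
    end
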
@@ -1,21 +1,25 @@
  module Cascading
-   # A Cascading::Mode encapsulates the idea of the execution mode for your
-   # flows. The default is Hadoop mode, but you can request that your code run
-   # in Cascading local mode. If you subsequently use a tap or a scheme that
-   # has no local implementation, the mode will be converted back to Hadoop
-   # mode.
+   # A Mode encapsulates the idea of the execution mode for your flows. The
+   # default is Hadoop mode, but you can request that your code run in Cascading
+   # local mode. If you subsequently use a tap or a scheme that has no local
+   # implementation, the mode will be converted back to Hadoop mode.
    class Mode
      attr_reader :local

-     # Hadoop mode is the default. You must explicitly request Cascading local
-     # mode with values 'local' or :local.
+     # Parses a specification of which mode, Cascading local mode or Hadoop mode,
+     # to execute in. Defaults to Hadoop mode. You may explicitly request
+     # Cascading local mode with values 'local' or :local. If you pass a Mode
+     # object to this method, it will be passed through.
      def self.parse(mode)
        case mode
+       when Mode then mode
        when 'local', :local then Mode.new(true)
        else Mode.new(false)
        end
      end

+     # Constructs a Mode given a flag indicating if it should be Cascading local
+     # mode.
      def initialize(local)
        @local = local
      end
@@ -34,9 +38,9 @@ module Cascading
      end

      # Builds a c.f.Flow given properties, name, sources, sinks, and pipes from
-     # a Cascading::Flow. The current mode is adjusted based on the taps and
-     # schemes of the sources and sinks, then the correct taps are selected
-     # before building the flow.
+     # a Flow. The current mode is adjusted based on the taps and schemes of
+     # the sources and sinks, then the correct taps are selected before building
+     # the flow.
      def connect_flow(properties, name, sources, sinks, pipes)
        update_local_mode(sources, sinks)
        sources = select_taps(sources)
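
The parse behavior visible in the case statement, summarized as a sketch:

    Mode.parse(:local).local    # => true   (Cascading local mode)
    Mode.parse('local').local   # => true
    Mode.parse(nil).local       # => false  (Hadoop mode is the default)

    m = Mode.new(true)
    Mode.parse(m).equal?(m)     # => true   (Mode instances pass through unchanged)
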