cascading.jruby 0.0.10 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,25 @@
1
+ # Extensions to Arrays in support of variable length lists of field names. This
2
+ # is not pretty, but supports DSL features like:
3
+ # group_by 'field1', 'field2', :sort_by => 'field3' do
4
+ # ...
5
+ # end
6
+ #
7
+ # The most obvious limitation of the approach is that function definitions of
8
+ # the form f(*args_with_options) are not self-documenting. To compensate for
9
+ # this, documentation of all arguments and optional parameters must be provided
10
+ # on the DSL method.
1
11
  class Array
12
+ # Use this extension to extract the optional parameters from a
13
+ # *args_with_options argument.
14
+ # So if you have a function:
15
+ # def f(*args_with_options)
16
+ # You can destructively process the args_with_options as follows:
17
+ # options, just_args = args_with_options.extract_options!, args_with_options
2
18
  def extract_options!
3
19
  last.is_a?(::Hash) ? pop : {}
4
20
  end
5
21
 
22
+ # Non-destructive form of Array#extract_options!
6
23
  def extract_options
7
24
  last.is_a?(::Hash) ? last : {}
8
25
  end
@@ -0,0 +1,101 @@
1
module Cascading
  # Module of filtering operations. Unlike some of the other functional
  # operations modules, this one does not just wrap operations defined by
  # Cascading in cascading.operation.filter. Instead, it provides some useful
  # high-level DSL pipes which map many Cascading operations into a smaller
  # number of DSL statements.
  #
  # Still, some are direct wrappers:
  # filter\_null:: {FilterNull}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/filter/FilterNull.html]
  # filter\_not\_null:: {FilterNotNull}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/filter/FilterNotNull.html]
  module FilterOperations
    # Filter the current assembly based on an expression or regex. If both
    # are supplied, :expression takes precedence and :regex is ignored.
    #
    # The named options are:
    # [input] Fields handed to the filter. Defaults to all_fields.
    # [expression] A Janino expression used to filter. Has access to all :input
    #              fields.
    # [validate] Boolean passed to Cascading#expr to enable or disable
    #            expression validation. Defaults to true.
    # [validate_with] Hash mapping field names to actual arguments used by
    #                 Cascading#expr for expression validation. Defaults to {}.
    # [regex] A regular expression used to filter.
    # [remove_match] Boolean indicating if regex matches should be removed or
    #                kept. Defaults to false, which is a bit counterintuitive.
    # [match_each_element] Boolean indicating if regex should match entire
    #                      incoming tuple (joined with tabs) or each field
    #                      individually. Defaults to false.
    #
    # Raises a RuntimeError if neither :expression nor :regex is given.
    #
    # Example:
    #     filter :input => 'field1', :regex => /\t/, :remove_match => true
    #     filter :expression => 'field1:long > 0 && "".equals(field2:string)'
    def filter(options = {})
      input_fields = options[:input] || all_fields
      expression = options[:expression]
      regex = options[:regex]

      if expression
        # fetch keeps an explicit :validate => false, unlike `|| true`.
        validate = options.fetch(:validate, true)
        validate_with = options[:validate_with] || {}

        stub = expr(expression, { :validate => validate, :validate_with => validate_with })
        stub.validate_scope(scope)

        names, types = stub.names_and_types
        each input_fields, :filter => Java::CascadingOperationExpression::ExpressionFilter.new(
          stub.expression,
          names,
          types
        )
      elsif regex
        parameters = regex_filter_parameters(regex, options)
        each input_fields, :filter => Java::CascadingOperationRegex::RegexFilter.new(*parameters)
      else
        raise 'filter requires one of :expression or :regex'
      end
    end

    # Rejects tuples from the current assembly based on a Janino expression.
    # This is just a wrapper for FilterOperations.filter. The options Hash
    # passed by the caller is not modified.
    #
    # Example:
    #     reject 'field1:long > 0 && "".equals(field2:string)'
    def reject(expression, options = {})
      filter(options.merge(:expression => expression))
    end

    # Keeps tuples from the current assembly based on a Janino expression. This
    # is a wrapper for FilterOperations.filter. The options Hash passed by the
    # caller is not modified.
    #
    # Note that this is accomplished by inverting the given expression, and a
    # best attempt is made to support import statements prior to the
    # expression (the expression itself may span several lines). If this
    # support should break, simply negate your expression and use
    # FilterOperations.reject.
    #
    # Example:
    #     where 'field1:long > 0 && "".equals(field2:string)'
    def where(expression, options = {})
      # \A/\z with /m consume the whole string; the previous ^/$ anchors only
      # matched the first line and silently dropped the rest of a multi-line
      # expression.
      _, imports, body = expression.match(/\A((?:\s*import\b[^;]*;\s*)*)(.*)\z/m).to_a
      filter(options.merge(:expression => "#{imports}!(#{body})"))
    end

    # Rejects tuples from the current assembly if any input field is null.
    #
    # Example:
    #     filter_null 'field1', 'field2'
    def filter_null(*input_fields)
      each(input_fields, :filter => Java::CascadingOperationFilter::FilterNull.new)
    end
    alias reject_null filter_null

    # Rejects tuples from the current assembly if any input field is not null.
    #
    # Example:
    #     filter_not_null 'field1', 'field2'
    def filter_not_null(*input_fields)
      each(input_fields, :filter => Java::CascadingOperationFilter::FilterNotNull.new)
    end
    alias where_null filter_not_null

    private

    # Builds the positional constructor arguments for RegexFilter. The
    # arguments are positional (pattern, remove_match, match_each_element), so
    # when only :match_each_element is supplied, :remove_match is filled with
    # its documented default of false rather than being dropped.
    def regex_filter_parameters(regex, options)
      remove_match = options[:remove_match]
      match_each_element = options[:match_each_element]

      parameters = [regex.to_s]
      if !match_each_element.nil?
        parameters << (remove_match.nil? ? false : remove_match)
        parameters << match_each_element
      elsif !remove_match.nil?
        parameters << remove_match
      end
      parameters
    end
  end
end
@@ -1,6 +1,10 @@
1
1
  require 'cascading/assembly'
2
2
 
3
3
  module Cascading
4
+ # A Flow wraps a c.f.Flow. A Flow is composed of Assemblies, which are
5
+ # constructed using the Flow#assembly method within the block passed to the
6
+ # Cascading::flow or Cascade#flow constructor. Many Assemblies may be nested
7
+ # within a Flow.
4
8
  class Flow < Cascading::Node
5
9
  extend Registerable
6
10
 
@@ -10,23 +14,46 @@ module Cascading
10
14
  # Do not use this constructor directly. Instead, use Cascading::flow to
11
15
  # build top-level flows and Cascade#flow to build flows within a Cascade.
12
16
  #
13
- # Builds a flow given a name and a parent node (a cascade or nil).
14
- # Optionally accepts :properties which allows external configuration of
15
- # this flow. The flow will side-effect the properties during composition,
16
- # then pass the modified properties along to the FlowConnector for
17
- # execution. See Cascading::Cascade#initialize for details on how
18
- # properties are propagated through cascades. Optionally accepts a :mode
19
- # which will determine the execution mode of this flow. See
20
- # Cascading::Mode.parse for details.
21
- def initialize(name, parent, params = {})
17
+ # Builds a Flow given a name and a parent node (a Cascade or nil).
18
+ #
19
+ # The named options are:
20
+ # [properties] Properties hash which allows external configuration of this
21
+ # flow. The flow will side-effect the properties during
22
+ # composition, then pass the modified properties along to the
23
+ # FlowConnector for execution. See Cascade#initialize for
24
+ # details on how properties are propagated through cascades.
25
+ # [mode] Mode which will determine the execution mode of this flow. See
26
+ # Mode.parse for details.
27
def initialize(name, parent, options = {})
  # Per-flow registries, each starting out empty.
  @sources = {}
  @sinks = {}
  @incoming_scopes = {}
  @outgoing_scopes = {}
  @listeners = []

  # External configuration: :properties falls back to an empty Hash and
  # :mode is interpreted by Mode.parse.
  @properties = options[:properties] || {}
  @mode = Mode.parse(options[:mode])

  @flow_scope = Scope.flow_scope(name)
  super(name, parent)

  # Record this flow under its name via the class-level add.
  self.class.add(name, self)
end
29
35
 
36
+ # Builds a child Assembly in this Flow given a name and block.
37
+ #
38
+ # An assembly's name is quite important as it will determine:
39
+ # * The sources from which it will read, if any
40
+ # * The name to be used in joins or unions downstream
41
+ # * The name to be used to sink the output of the assembly downstream
42
+ #
43
+ # Many assemblies may be built within a flow. The Assembly#branch method
44
+ # is used for creating nested assemblies and produces objects of the same
45
+ # type as this constructor.
46
+ #
47
+ # Example:
48
+ # flow 'wordcount', :mode => :local do
49
+ # assembly 'first_step' do
50
+ # ...
51
+ # end
52
+ #
53
+ # assembly 'second_step' do
54
+ # ...
55
+ # end
56
+ # end
30
57
  def assembly(name, &block)
31
58
  raise "Could not build assembly '#{name}'; block required" unless block_given?
32
59
  assembly = Assembly.new(name, self, @outgoing_scopes)
@@ -49,6 +76,11 @@ module Cascading
49
76
  sinks[name] = tap
50
77
  end
51
78
 
79
+ # Produces a textual description of this Flow. The description details the
80
+ # structure of the Flow, its sources and sinks, and the input and output
81
+ # fields of each Assembly. The offset parameter allows for this describe
82
+ # to be nested within a calling context, which lets us indent the
83
+ # structural hierarchy of a job.
52
84
  def describe(offset = '')
53
85
  description = "#{offset}#{name}:flow\n"
54
86
  description += "#{sources.keys.map{ |source| "#{offset} #{source}:source :: #{incoming_scopes[source].values_fields.to_a.inspect}" }.join("\n")}\n"
@@ -57,18 +89,28 @@ module Cascading
57
89
  description
58
90
  end
59
91
 
92
+ # Accesses the outgoing scope of this Flow at the point at which it is
93
+ # called by default, or for the child specified by the given name, if
94
+ # specified. This is useful for grabbing the values_fields at any point in
95
+ # the construction of the Flow. See Scope for details.
60
96
  def scope(name = nil)
61
97
  raise 'Must specify name if no children have been defined yet' unless name || last_child
62
98
  name ||= last_child.name
63
99
  @outgoing_scopes[name]
64
100
  end
65
101
 
102
+ # Prints information about the scope of this Flow at the point at which it
103
+ # is called by default, or for the child specified by the given name, if
104
+ # specified. This allows you to trace the propagation of field names
105
+ # through your job and is handy for debugging. See Scope for details.
66
106
  def debug_scope(name = nil)
67
107
  scope = scope(name)
68
108
  name ||= last_child.name
69
109
  puts "Scope for '#{name}':\n #{scope}"
70
110
  end
71
111
 
112
+ # Builds a map, keyed by sink name, of the sink metadata for each sink.
113
+ # Currently, this contains only the field names of each sink.
72
114
  def sink_metadata
73
115
  @sinks.keys.inject({}) do |sink_metadata, sink_name|
74
116
  raise "Cannot sink undefined assembly '#{sink_name}'" unless @outgoing_scopes[sink_name]
@@ -79,7 +121,16 @@ module Cascading
79
121
  end
80
122
  end
81
123
 
82
- # TODO: support all codecs, support list of codecs
124
+ # Property modifier that sets the codec and type of the compression for all
125
+ # sinks in this flow. Currently only supports o.a.h.i.c.DefaultCodec and
126
+ # o.a.h.i.c.GzipCodec, and the NONE, RECORD, or BLOCK compression
127
+ # types defined in o.a.h.i.SequenceFile.
128
+ #
129
+ # codec may be symbols like :default or :gzip and type may be symbols like
130
+ # :none, :record, or :block.
131
+ #
132
+ # Example:
133
+ # compress_output :default, :block
83
134
  def compress_output(codec, type)
84
135
  properties['mapred.output.compress'] = 'true'
85
136
  properties['mapred.output.compression.codec'] = case codec
@@ -95,22 +146,28 @@ module Cascading
95
146
  end
96
147
  end
97
148
 
149
+ # Set the cascading.spill.list.threshold property in this flow's
150
+ # properties. See c.t.c.SpillableProps for details.
98
151
  def set_spill_threshold(threshold)
99
- properties['cascading.cogroup.spill.threshold'] = threshold.to_s
152
+ properties['cascading.spill.list.threshold'] = threshold.to_s
100
153
  end
101
154
 
155
+ # Adds the given path to the mapred.cache.files list property.
102
156
  def add_file_to_distributed_cache(file)
103
157
  add_to_distributed_cache(file, "mapred.cache.files")
104
158
  end
105
159
 
160
+ # Adds the given path to the mapred.cache.archives list property.
106
161
  def add_archive_to_distributed_cache(file)
107
162
  add_to_distributed_cache(file, "mapred.cache.archives")
108
163
  end
109
164
 
165
+ # Appends a FlowListener to the list of listeners for this flow.
110
166
  def add_listener(listener)
111
167
  @listeners << listener
112
168
  end
113
169
 
170
+ # Handles locating a file cached from S3 on local disk. TODO: remove
114
171
  def emr_local_path_for_distributed_cache_file(file)
115
172
  # NOTE this needs to be *appended* to the property mapred.local.dir
116
173
  if file =~ /^s3n?:\/\//
@@ -122,16 +179,9 @@ module Cascading
122
179
  end
123
180
  end
124
181
 
125
- def add_to_distributed_cache(file, property)
126
- v = properties[property]
127
-
128
- if v
129
- properties[property] = [v.split(/,/), file].flatten.join(",")
130
- else
131
- properties[property] = file
132
- end
133
- end
134
-
182
+ # Connects this Flow, producing a c.f.Flow without completing it (the Flow
183
+ # is not executed). This method is used by Cascade to connect its child
184
+ # Flows. To connect and complete a Flow, see Flow#complete.
135
185
  def connect
136
186
  puts "Connecting flow '#{name}' with properties:"
137
187
  properties.keys.sort.each do |key|
@@ -141,6 +191,7 @@ module Cascading
141
191
  # FIXME: why do I have to do this in 2.0 wip-255?
142
192
  Java::CascadingProperty::AppProps.setApplicationName(properties, name)
143
193
  Java::CascadingProperty::AppProps.setApplicationVersion(properties, '0.0.0')
194
+ Java::CascadingProperty::AppProps.setApplicationJarClass(properties, Java::CascadingFlow::Flow.java_class)
144
195
 
145
196
  sources = make_tap_parameter(@sources, :head_pipe)
146
197
  sinks = make_tap_parameter(@sinks, :tail_pipe)
@@ -148,6 +199,9 @@ module Cascading
148
199
  mode.connect_flow(properties, name, sources, sinks, pipes)
149
200
  end
150
201
 
202
+ # Completes this Flow after connecting it. This results in execution of
203
+ # the c.f.Flow built from this Flow. Use this method when executing a
204
+ # top-level Flow.
151
205
  def complete
152
206
  begin
153
207
  flow = connect
@@ -160,6 +214,16 @@ module Cascading
160
214
 
161
215
  private
162
216
 
217
def add_to_distributed_cache(file, property)
  current = properties[property]

  # The property holds a comma-separated list: append to it when it already
  # has a value, otherwise start the list with this entry.
  properties[property] =
    if current
      (current.split(/,/) << file).flatten.join(",")
    else
      file
    end
end
226
+
163
227
  def make_tap_parameter(taps, pipe_accessor)
164
228
  taps.inject({}) do |map, (name, tap)|
165
229
  assembly = find_child(name)
@@ -0,0 +1,82 @@
1
module Cascading
  # Module of pipe assemblies that wrap the Cascading Identity operation. These
  # are split out only to group similar functionality.
  module IdentityOperations
    # Restricts the current assembly to the specified fields in the order in
    # which they are specified (can be used to reorder fields).
    #
    # Example:
    #     project 'field1', 'field2'
    def project(*input_fields)
      each fields(input_fields), :function => Java::CascadingOperation::Identity.new
    end

    # Removes the specified fields from the current assembly.
    #
    # Example:
    #     discard 'field1', 'field2'
    def discard(*input_fields)
      discard_fields = fields(input_fields)
      keep_fields = difference_fields(scope.values_fields, discard_fields)
      project(*keep_fields.to_a)
    end

    # Renames fields according to the mapping provided, preserving the original
    # field order. Raises a RuntimeError if non-existent fields are specified.
    #
    # Example:
    #     rename 'field1' => 'fieldA', 'field2' => 'fieldB'
    #
    # Produces: ['fieldA', 'fieldB'], assuming those were the only 2 input
    # fields.
    def rename(name_map)
      original_fields = scope.values_fields.to_a
      invalid = name_map.keys - original_fields
      raise "Invalid field names in rename: #{invalid.inspect}" unless invalid.empty?

      # Fields without an entry in name_map keep their current name.
      renamed_fields = original_fields.map { |name| name_map[name] || name }

      each original_fields, :function => Java::CascadingOperation::Identity.new(fields(renamed_fields))
    end

    # Coerces fields to the Java type selected from Cascading::JAVA_TYPE_MAP.
    # Raises a RuntimeError naming any requested type that has no entry in
    # JAVA_TYPE_MAP, rather than handing a nil class to Identity.
    #
    # Example:
    #     cast 'field1' => :int, 'field2' => :double
    def cast(type_map)
      # Sort the field names so fields and types are paired in a stable order.
      input_names = type_map.keys.sort
      type_names = type_map.values_at(*input_names)
      types = JAVA_TYPE_MAP.values_at(*type_names)

      unknown = []
      type_names.each_with_index { |type_name, index| unknown << type_name if types[index].nil? }
      raise "Invalid types in cast: #{unknown.uniq.inspect}" unless unknown.empty?

      input_fields = fields(input_names)
      types = types.to_java(java.lang.Class)
      each input_fields, :function => Java::CascadingOperation::Identity.new(input_fields, types)
    end

    # A field copy (not a pipe copy). Renames fields according to name_map,
    # appending them to the fields in the assembly in the same order as the
    # original fields from which they are copied. Raises a RuntimeError if
    # non-existent fields are specified.
    #
    # Example:
    #     copy 'field1' => 'fieldA', 'field2' => 'fieldB'
    #
    # Produces: ['field1', 'field2', 'fieldA', 'fieldB'], assuming those were
    # the only input fields.
    def copy(name_map)
      original_fields = scope.values_fields.to_a
      invalid = name_map.keys - original_fields
      raise "Invalid field names in copy: #{invalid.inspect}" unless invalid.empty?

      # Original fields in name_map in their original order
      copied_names = name_map.keys
      input_fields = original_fields.select { |name| copied_names.include?(name) }
      into_fields = name_map.values_at(*input_fields)

      each input_fields, :function => Java::CascadingOperation::Identity.new(fields(into_fields)), :output => all_fields
    end

    # A pipe copy (not a field copy). Can be used within a branch to copy a
    # pipe.
    def pass
      each all_fields, :function => Java::CascadingOperation::Identity.new
    end
  end
end
@@ -1,21 +1,25 @@
1
1
  module Cascading
2
- # A Cascading::Mode encapsulates the idea of the execution mode for your
3
- # flows. The default is Hadoop mode, but you can request that your code run
4
- # in Cascading local mode. If you subsequently use a tap or a scheme that
5
- # has no local implementation, the mode will be converted back to Hadoop
6
- # mode.
2
+ # A Mode encapsulates the idea of the execution mode for your flows. The
3
+ # default is Hadoop mode, but you can request that your code run in Cascading
4
+ # local mode. If you subsequently use a tap or a scheme that has no local
5
+ # implementation, the mode will be converted back to Hadoop mode.
7
6
  class Mode
8
7
  attr_reader :local
9
8
 
10
- # Hadoop mode is the default. You must explicitly request Cascading local
11
- # mode with values 'local' or :local.
9
+ # Parses a specification of which mode, Cascading local mode or Hadoop mode,
10
+ # to execute in. Defaults to Hadoop mode. You may explicitly request
11
+ # Cascading local mode with values 'local' or :local. If you pass a Mode
12
+ # object to this method, it will be passed through.
12
13
  def self.parse(mode)
13
14
  case mode
15
+ when Mode then mode
14
16
  when 'local', :local then Mode.new(true)
15
17
  else Mode.new(false)
16
18
  end
17
19
  end
18
20
 
21
+ # Constructs a Mode given a flag indicating if it should be Cascading local
22
+ # mode.
19
23
  def initialize(local)
20
24
  @local = local
21
25
  end
@@ -34,9 +38,9 @@ module Cascading
34
38
  end
35
39
 
36
40
  # Builds a c.f.Flow given properties, name, sources, sinks, and pipes from
37
- # a Cascading::Flow. The current mode is adjusted based on the taps and
38
- # schemes of the sources and sinks, then the correct taps are selected
39
- # before building the flow.
41
+ # a Flow. The current mode is adjusted based on the taps and schemes of
42
+ # the sources and sinks, then the correct taps are selected before building
43
+ # the flow.
40
44
  def connect_flow(properties, name, sources, sinks, pipes)
41
45
  update_local_mode(sources, sinks)
42
46
  sources = select_taps(sources)