cascading.jruby 0.0.10 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +13 -160
- data/README.md +35 -0
- data/lib/cascading.rb +8 -41
- data/lib/cascading/aggregations.rb +216 -71
- data/lib/cascading/assembly.rb +409 -606
- data/lib/cascading/base.rb +22 -0
- data/lib/cascading/cascade.rb +55 -18
- data/lib/cascading/cascading.rb +137 -47
- data/lib/cascading/expr_stub.rb +31 -17
- data/lib/cascading/ext/array.rb +17 -0
- data/lib/cascading/filter_operations.rb +101 -0
- data/lib/cascading/flow.rb +87 -23
- data/lib/cascading/identity_operations.rb +82 -0
- data/lib/cascading/mode.rb +14 -10
- data/lib/cascading/operations.rb +109 -174
- data/lib/cascading/regex_operations.rb +133 -0
- data/lib/cascading/scope.rb +32 -9
- data/lib/cascading/sub_assembly.rb +8 -5
- data/lib/cascading/tap.rb +41 -17
- data/lib/cascading/text_operations.rb +67 -0
- data/test/mock_assemblies.rb +55 -0
- data/test/test_assembly.rb +23 -25
- data/test/test_local_execution.rb +7 -7
- data/test/test_operations.rb +0 -10
- metadata +76 -74
- data/History.txt +0 -58
    
        data/lib/cascading/assembly.rb
    CHANGED
    
    | @@ -1,15 +1,50 @@ | |
| 1 1 | 
             
            require 'cascading/base'
         | 
| 2 2 | 
             
            require 'cascading/operations'
         | 
| 3 | 
            +
            require 'cascading/identity_operations'
         | 
| 4 | 
            +
            require 'cascading/filter_operations'
         | 
| 5 | 
            +
            require 'cascading/regex_operations'
         | 
| 6 | 
            +
            require 'cascading/text_operations'
         | 
| 3 7 | 
             
            require 'cascading/aggregations'
         | 
| 4 8 | 
             
            require 'cascading/sub_assembly'
         | 
| 5 9 | 
             
            require 'cascading/ext/array'
         | 
| 6 10 |  | 
| 7 11 | 
             
            module Cascading
         | 
| 12 | 
            +
              # An Assembly is a sequence of Cascading pipes (Each, GroupBy, CoGroup,
         | 
| 13 | 
            +
              # Every, and SubAssembly).  This class will serve as your primary mechanism
         | 
| 14 | 
            +
              # for doing work within a flow and contains all the functions and filters you
         | 
| 15 | 
            +
              # will apply to a pipe (Eaches), as well as group_by, union, and join.  For
         | 
| 16 | 
            +
              # aggregators and buffers, please see Aggregations.
         | 
| 17 | 
            +
              #
         | 
| 18 | 
            +
              # Function and filter DSL rules:
         | 
| 19 | 
            +
              # * Use positional arguments for required parameters
         | 
| 20 | 
            +
              # * Use options = {} for optional parameters
         | 
| 21 | 
            +
              # * Use *args sparingly, specifically when you need to accept a varying length list of fields
         | 
| 22 | 
            +
              # * If you require both a varying length list of fields and optional parameters, then see the Array#extract_options! extension
         | 
| 23 | 
            +
              # * If you choose to name a required parameter, add it to options = {} and throw an exception if the caller does not provide it
         | 
| 24 | 
            +
              # * If you have a required parameter that may be provided by any one of a set of option names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example)
         | 
| 25 | 
            +
              #
         | 
| 26 | 
            +
              # Function and filter DSL standard optional parameter names:
         | 
| 27 | 
            +
              # [input] c.p.Each argument selector
         | 
| 28 | 
            +
              # [into] c.o.Operation field declaration
         | 
| 29 | 
            +
              # [output] c.p.Each output selector
         | 
| 30 | 
            +
              #
         | 
| 31 | 
            +
              # A note on aliases: when a DSL method uniquely wraps a single Cascading
         | 
| 32 | 
            +
              # operation, we attempt to provide an alias that matches the Cascading
         | 
| 33 | 
            +
              # operation.  However, Cascading operations are often nouns rather than verbs,
         | 
| 34 | 
            +
              # and the latter are preferable for a dataflow DSL.
         | 
| 8 35 | 
             
              class Assembly < Cascading::Node
         | 
| 9 | 
            -
                include Operations
         | 
| 10 | 
            -
             | 
| 11 36 | 
             
                attr_reader :head_pipe, :tail_pipe
         | 
| 12 37 |  | 
| 38 | 
            +
                # Do not use this constructor directly; instead, use Flow#assembly or
         | 
| 39 | 
            +
                # Assembly#branch to build assemblies.
         | 
| 40 | 
            +
                #
         | 
| 41 | 
            +
                # Builds an Assembly given a name, parent, and optional outgoing_scopes
         | 
| 42 | 
            +
                # (necessary only for branching).
         | 
| 43 | 
            +
                #
         | 
| 44 | 
            +
                # An assembly's name is quite important as it will determine:
         | 
| 45 | 
            +
                # * The sources from which it will read, if any
         | 
| 46 | 
            +
                # * The name to be used in joins or unions downstream
         | 
| 47 | 
            +
                # * The name to be used to sink the output of the assembly downstream
         | 
| 13 48 | 
             
                def initialize(name, parent, outgoing_scopes = {})
         | 
| 14 49 | 
             
                  super(name, parent)
         | 
| 15 50 |  | 
| @@ -27,6 +62,11 @@ module Cascading | |
| 27 62 | 
             
                  @incoming_scopes = [scope]
         | 
| 28 63 | 
             
                end
         | 
| 29 64 |  | 
| 65 | 
            +
                # Produces a textual description of this Assembly.  The description details
         | 
| 66 | 
            +
                # the structure of the Assembly, its input and output fields and any
         | 
| 67 | 
            +
                # children (branches).  The offset parameter allows for this describe to be
         | 
| 68 | 
            +
                # nested within a calling context, which lets us indent the structural
         | 
| 69 | 
            +
                # hierarchy of a job.
         | 
| 30 70 | 
             
                def describe(offset = '')
         | 
| 31 71 | 
             
                  incoming_scopes_desc = "#{@incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}"
         | 
| 32 72 | 
             
                  incoming_scopes_desc = "(#{incoming_scopes_desc})" unless @incoming_scopes.size == 1
         | 
| @@ -35,199 +75,231 @@ module Cascading | |
| 35 75 | 
             
                  description
         | 
| 36 76 | 
             
                end
         | 
| 37 77 |  | 
| 78 | 
            +
                # Rather than the immediate parent, this method returns the parent flow of
         | 
| 79 | 
            +
                # this Assembly.  If this is a branch, we must traverse the parents of
         | 
| 80 | 
            +
                # parent assemblies.
         | 
| 38 81 | 
             
                def parent_flow
         | 
| 39 82 | 
             
                  return parent if parent.kind_of?(Flow)
         | 
| 40 83 | 
             
                  parent.parent_flow
         | 
| 41 84 | 
             
                end
         | 
| 42 85 |  | 
| 86 | 
            +
                # Accesses the outgoing scope of this Assembly at the point at which it is
         | 
| 87 | 
            +
                # called.  This is useful for grabbing the values_fields at any point in
         | 
| 88 | 
            +
                # the construction of the Assembly.  See Scope for details.
         | 
| 43 89 | 
             
                def scope
         | 
| 44 90 | 
             
                  @outgoing_scopes[name]
         | 
| 45 91 | 
             
                end
         | 
| 46 92 |  | 
| 93 | 
            +
                # Prints information about the scope of this Assembly at the point at which
         | 
| 94 | 
            +
                # it is called.  This allows you to trace the propagation of field names
         | 
| 95 | 
            +
                # through your job and is handy for debugging.  See Scope for details.
         | 
| 47 96 | 
             
                def debug_scope
         | 
| 48 97 | 
             
                  puts "Current scope for '#{name}':\n  #{scope}\n----------\n"
         | 
| 49 98 | 
             
                end
         | 
| 50 99 |  | 
| 51 | 
            -
                 | 
| 52 | 
            -
             | 
| 53 | 
            -
                  @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
         | 
| 54 | 
            -
             | 
| 55 | 
            -
                  tail_pipe
         | 
| 56 | 
            -
                end
         | 
| 57 | 
            -
                private :make_pipe
         | 
| 58 | 
            -
             | 
| 59 | 
            -
                def populate_incoming_scopes(assembly_names, group_fields_args = {})
         | 
| 60 | 
            -
                  # NOTE: this overrides the existing incoming_scopes, which changes the
         | 
| 61 | 
            -
                  # way describe will function on this assembly
         | 
| 62 | 
            -
                  pipes, @incoming_scopes, group_fields = [], [], []
         | 
| 63 | 
            -
                  assembly_names.each do |assembly_name|
         | 
| 64 | 
            -
                    assembly = parent_flow.find_child(assembly_name)
         | 
| 65 | 
            -
                    raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
         | 
| 66 | 
            -
             | 
| 67 | 
            -
                    pipes << assembly.tail_pipe
         | 
| 68 | 
            -
                    @incoming_scopes << assembly.scope
         | 
| 69 | 
            -
                    group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
         | 
| 70 | 
            -
                  end
         | 
| 71 | 
            -
                  [pipes, group_fields]
         | 
| 72 | 
            -
                end
         | 
| 73 | 
            -
                private :populate_incoming_scopes
         | 
| 74 | 
            -
             | 
| 75 | 
            -
                def apply_aggregations(group, incoming_scopes, &block)
         | 
| 76 | 
            -
                  aggregations = Aggregations.new(self, group, incoming_scopes)
         | 
| 77 | 
            -
                  aggregations.instance_eval(&block) if block_given?
         | 
| 78 | 
            -
             | 
| 79 | 
            -
                  # Sorting of any type means that we cannot use the AggregateBy optimization
         | 
| 80 | 
            -
                  if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
         | 
| 81 | 
            -
                    grouping_fields = group.key_selectors.values.first
         | 
| 82 | 
            -
                    group.key_selectors.values.each do |key_fields|
         | 
| 83 | 
            -
                      raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
         | 
| 84 | 
            -
                    end
         | 
| 85 | 
            -
             | 
| 86 | 
            -
                    aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
         | 
| 87 | 
            -
                      name,
         | 
| 88 | 
            -
                      group.previous,
         | 
| 89 | 
            -
                      grouping_fields,
         | 
| 90 | 
            -
                      aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
         | 
| 91 | 
            -
                    ), group.previous, incoming_scopes)
         | 
| 92 | 
            -
             | 
| 93 | 
            -
                    aggregate_by
         | 
| 94 | 
            -
                  else
         | 
| 95 | 
            -
                    aggregations.finalize if block_given?
         | 
| 96 | 
            -
                    @tail_pipe = aggregations.tail_pipe
         | 
| 97 | 
            -
                    @outgoing_scopes[name] = aggregations.scope
         | 
| 98 | 
            -
             | 
| 99 | 
            -
                    group
         | 
| 100 | 
            -
                  end
         | 
| 101 | 
            -
                end
         | 
| 102 | 
            -
                private :apply_aggregations
         | 
| 103 | 
            -
             | 
| 100 | 
            +
                # Prints detail about this Assembly including its name, head pipe, and tail
         | 
| 101 | 
            +
                # pipe.
         | 
| 104 102 | 
             
                def to_s
         | 
| 105 103 | 
             
                  "#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}"
         | 
| 106 104 | 
             
                end
         | 
| 107 105 |  | 
| 108 | 
            -
                def prepare_join(*args, &block)
         | 
| 109 | 
            -
                  options = args.extract_options!
         | 
| 110 | 
            -
             | 
| 111 | 
            -
                  pipes, _ = populate_incoming_scopes(args)
         | 
| 112 | 
            -
             | 
| 113 | 
            -
                  group_fields_args = options[:on]
         | 
| 114 | 
            -
                  raise 'join requires :on parameter' unless group_fields_args
         | 
| 115 | 
            -
             | 
| 116 | 
            -
                  if group_fields_args.kind_of?(String)
         | 
| 117 | 
            -
                    group_fields_args = [group_fields_args]
         | 
| 118 | 
            -
                  end
         | 
| 119 | 
            -
             | 
| 120 | 
            -
                  group_fields = []
         | 
| 121 | 
            -
                  if group_fields_args.kind_of?(Array)
         | 
| 122 | 
            -
                    pipes.size.times do
         | 
| 123 | 
            -
                      group_fields << fields(group_fields_args)
         | 
| 124 | 
            -
                    end
         | 
| 125 | 
            -
                  elsif group_fields_args.kind_of?(Hash)
         | 
| 126 | 
            -
                    pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
         | 
| 127 | 
            -
                  else
         | 
| 128 | 
            -
                    raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
         | 
| 129 | 
            -
                  end
         | 
| 130 | 
            -
             | 
| 131 | 
            -
                  raise 'join requires non-empty :on parameter' if group_fields_args.empty?
         | 
| 132 | 
            -
                  group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
         | 
| 133 | 
            -
                  incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
         | 
| 134 | 
            -
                  declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
         | 
| 135 | 
            -
                  joiner = options[:joiner]
         | 
| 136 | 
            -
                  is_hash_join = options[:hash] || false
         | 
| 137 | 
            -
             | 
| 138 | 
            -
                  case joiner
         | 
| 139 | 
            -
                  when :inner, 'inner', nil
         | 
| 140 | 
            -
                    joiner = Java::CascadingPipeJoiner::InnerJoin.new
         | 
| 141 | 
            -
                  when :left,  'left'
         | 
| 142 | 
            -
                    joiner = Java::CascadingPipeJoiner::LeftJoin.new
         | 
| 143 | 
            -
                  when :right, 'right'
         | 
| 144 | 
            -
                    joiner = Java::CascadingPipeJoiner::RightJoin.new
         | 
| 145 | 
            -
                  when :outer, 'outer'
         | 
| 146 | 
            -
                    joiner = Java::CascadingPipeJoiner::OuterJoin.new
         | 
| 147 | 
            -
                  when Array
         | 
| 148 | 
            -
                    joiner = joiner.map do |t|
         | 
| 149 | 
            -
                      case t
         | 
| 150 | 
            -
                      when true,  1, :inner then true
         | 
| 151 | 
            -
                      when false, 0, :outer then false
         | 
| 152 | 
            -
                      else fail "invalid mixed joiner entry: #{t}"
         | 
| 153 | 
            -
                      end
         | 
| 154 | 
            -
                    end
         | 
| 155 | 
            -
                    joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
         | 
| 156 | 
            -
                  end
         | 
| 157 | 
            -
             | 
| 158 | 
            -
                  if is_hash_join
         | 
| 159 | 
            -
                    raise ArgumentError, "hash joins don't support aggregations" if block_given?
         | 
| 160 | 
            -
                    parameters = [
         | 
| 161 | 
            -
                      pipes.to_java(Java::CascadingPipe::Pipe),
         | 
| 162 | 
            -
                      group_fields,
         | 
| 163 | 
            -
                      declared_fields,
         | 
| 164 | 
            -
                      joiner
         | 
| 165 | 
            -
                    ]
         | 
| 166 | 
            -
                    group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
         | 
| 167 | 
            -
                  else
         | 
| 168 | 
            -
                    result_group_fields = dedup_fields(*group_fields)
         | 
| 169 | 
            -
                    parameters = [
         | 
| 170 | 
            -
                      pipes.to_java(Java::CascadingPipe::Pipe),
         | 
| 171 | 
            -
                      group_fields,
         | 
| 172 | 
            -
                      declared_fields,
         | 
| 173 | 
            -
                      result_group_fields,
         | 
| 174 | 
            -
                      joiner
         | 
| 175 | 
            -
                    ]
         | 
| 176 | 
            -
                    group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
         | 
| 177 | 
            -
                  end
         | 
| 178 | 
            -
                  apply_aggregations(group_assembly, @incoming_scopes, &block)
         | 
| 179 | 
            -
                end
         | 
| 180 | 
            -
                private :prepare_join
         | 
| 181 | 
            -
             | 
| 182 106 | 
             
                # Builds a HashJoin pipe. This should be used carefully, as the right side
         | 
| 183 | 
            -
                # of the join is accumulated entirely in memory. Requires a list of | 
| 184 | 
            -
                # names to join and :on to specify the join_fields.
         | 
| 185 | 
            -
                 | 
| 186 | 
            -
             | 
| 107 | 
            +
                # of the join is accumulated entirely in memory. Requires a list of
         | 
| 108 | 
            +
                # assembly names to join and :on to specify the join_fields.  Note that a
         | 
| 109 | 
            +
                # hash_join "takes over" the Assembly in which it is built, so it is
         | 
| 110 | 
            +
                # typically the first statement within the block of the assembly or branch.
         | 
| 111 | 
            +
                # Additionally, a hash join does not accept a block for aggregations like
         | 
| 112 | 
            +
                # other joins; this restriction is enforced here, but comes directly from
         | 
| 113 | 
            +
                # Cascading.
         | 
| 114 | 
            +
                #
         | 
| 115 | 
            +
                # The named options are:
         | 
| 116 | 
            +
                # [on] The keys of the join, an array of strings if they are the same in
         | 
| 117 | 
            +
                #      all inputs, or a hash mapping assembly names to key names if they
         | 
| 118 | 
            +
                #      differ across inputs.
         | 
| 119 | 
            +
                # [declared_fields] By default, a deduplicated array of incoming field
         | 
| 120 | 
            +
                #                   names (see Cascading::dedup_fields).  Specifies the
         | 
| 121 | 
            +
                #                   names of the fields that will be available to
         | 
| 122 | 
            +
                #                   aggregations or post-join if no aggregations are
         | 
| 123 | 
            +
                #                   specified.
         | 
| 124 | 
            +
                # [joiner] A specification of the c.p.j.Joiner to use.  Values like :inner
         | 
| 125 | 
            +
                #          and 'inner', :right and 'right' are accepted, as well as an
         | 
| 126 | 
            +
                #          array specifying mixed joins.  Typically, this is not provided,
         | 
| 127 | 
            +
                #          but one of the higher level join methods on Assembly is used
         | 
| 128 | 
            +
                #          directly (like Assembly#inner_join or Assembly#right_join).
         | 
| 129 | 
            +
                #
         | 
| 130 | 
            +
                # Example:
         | 
| 131 | 
            +
                #     assembly 'join_left_right' do
         | 
| 132 | 
            +
                #       hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner
         | 
| 133 | 
            +
                #     end
         | 
| 134 | 
            +
                def hash_join(*args_with_options)
         | 
| 135 | 
            +
                  raise ArgumentError, "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored" if block_given?
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                  options, assembly_names = args_with_options.extract_options!, args_with_options
         | 
| 187 138 | 
             
                  options[:hash] = true
         | 
| 188 | 
            -
                   | 
| 189 | 
            -
                  prepare_join(*args, &block)
         | 
| 139 | 
            +
                  prepare_join(assembly_names, options)
         | 
| 190 140 | 
             
                end
         | 
| 191 141 |  | 
| 192 142 | 
             
                # Builds a join (CoGroup) pipe. Requires a list of assembly names to join
         | 
| 193 | 
            -
                # and :on to specify the group_fields.
         | 
| 194 | 
            -
                 | 
| 195 | 
            -
             | 
| 143 | 
            +
                # and :on to specify the group_fields.  Note that a join "takes over" the
         | 
| 144 | 
            +
                # Assembly in which it is built, so it is typically the first statement
         | 
| 145 | 
            +
                # within the block of the assembly or branch.  The block passed to this
         | 
| 146 | 
            +
                # method will be evaluated in the context of Aggregations, not Assembly.
         | 
| 147 | 
            +
                #
         | 
| 148 | 
            +
                # The named options are:
         | 
| 149 | 
            +
                # [on] The keys of the join, an array of strings if they are the same in
         | 
| 150 | 
            +
                #      all inputs, or a hash mapping assembly names to key names if they
         | 
| 151 | 
            +
                #      differ across inputs.
         | 
| 152 | 
            +
                # [declared_fields] By default, a deduplicated array of incoming field
         | 
| 153 | 
            +
                #                   names (see Cascading::dedup_fields).  Specifies the
         | 
| 154 | 
            +
                #                   names of the fields that will be available to
         | 
| 155 | 
            +
                #                   aggregations or post-join if no aggregations are
         | 
| 156 | 
            +
                #                   specified.
         | 
| 157 | 
            +
                # [joiner] A specification of the c.p.j.Joiner to use.  Values like :inner
         | 
| 158 | 
            +
                #          and 'inner', :right and 'right' are accepted, as well as an
         | 
| 159 | 
            +
                #          array specifying mixed joins.  Typically, this is not provided,
         | 
| 160 | 
            +
                #          but one of the higher level join methods on Assembly is used
         | 
| 161 | 
            +
                #          directly (like Assembly#inner_join or Assembly#right_join).
         | 
| 162 | 
            +
                #
         | 
| 163 | 
            +
                # Example:
         | 
| 164 | 
            +
                #     assembly 'join_left_right' do
         | 
| 165 | 
            +
                #       join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do
         | 
| 166 | 
            +
                #         sum 'val1', 'val2', :type => :long
         | 
| 167 | 
            +
                #       end
         | 
| 168 | 
            +
                #     end
         | 
| 169 | 
            +
                def join(*args_with_options, &block)
         | 
| 170 | 
            +
                  options, assembly_names = args_with_options.extract_options!, args_with_options
         | 
| 196 171 | 
             
                  options[:hash] = false
         | 
| 197 | 
            -
                   | 
| 198 | 
            -
                  prepare_join(*args, &block)
         | 
| 172 | 
            +
                  prepare_join(assembly_names, options, &block)
         | 
| 199 173 | 
             
                end
         | 
| 200 174 | 
             
                alias co_group join
         | 
| 201 175 |  | 
| 202 | 
            -
                 | 
| 203 | 
            -
             | 
| 176 | 
            +
                # Builds an inner join (CoGroup) pipe. Requires a list of assembly names to
         | 
| 177 | 
            +
                # join and :on to specify the group_fields.
         | 
| 178 | 
            +
                #
         | 
| 179 | 
            +
                # The named options are:
         | 
| 180 | 
            +
                # [on] The keys of the join, an array of strings if they are the same in
         | 
| 181 | 
            +
                #      all inputs, or a hash mapping assembly names to key names if they
         | 
| 182 | 
            +
                #      differ across inputs.
         | 
| 183 | 
            +
                # [declared_fields] By default, a deduplicated array of incoming field
         | 
| 184 | 
            +
                #                   names (see Cascading::dedup_fields).  Specifies the
         | 
| 185 | 
            +
                #                   names of the fields that will be available to
         | 
| 186 | 
            +
                #                   aggregations or post-join if no aggregations are
         | 
| 187 | 
            +
                #                   specified.
         | 
| 188 | 
            +
                #
         | 
| 189 | 
            +
                # Example:
         | 
| 190 | 
            +
                #     assembly 'join_left_right' do
         | 
| 191 | 
            +
                #       inner_join 'left', 'right', :on => ['key1', 'key2'] do
         | 
| 192 | 
            +
                #         sum 'val1', 'val2', :type => :long
         | 
| 193 | 
            +
                #       end
         | 
| 194 | 
            +
                #     end
         | 
| 195 | 
            +
                def inner_join(*args_with_options, &block)
         | 
| 196 | 
            +
                  options = args_with_options.extract_options!
         | 
| 204 197 | 
             
                  options[:joiner] = :inner
         | 
| 205 | 
            -
                   | 
| 206 | 
            -
                  join(* | 
| 198 | 
            +
                  args_with_options << options
         | 
| 199 | 
            +
                  join(*args_with_options, &block)
         | 
| 207 200 | 
             
                end
         | 
| 208 201 |  | 
| 209 | 
            -
                 | 
| 210 | 
            -
             | 
| 202 | 
            +
                # Builds a left join (CoGroup) pipe. Requires a list of assembly names to
         | 
| 203 | 
            +
                # join and :on to specify the group_fields.
         | 
| 204 | 
            +
                #
         | 
| 205 | 
            +
                # The named options are:
         | 
| 206 | 
            +
                # [on] The keys of the join, an array of strings if they are the same in
         | 
| 207 | 
            +
                #      all inputs, or a hash mapping assembly names to key names if they
         | 
| 208 | 
            +
                #      differ across inputs.
         | 
| 209 | 
            +
                # [declared_fields] By default, a deduplicated array of incoming field
         | 
| 210 | 
            +
                #                   names (see Cascading::dedup_fields).  Specifies the
         | 
| 211 | 
            +
                #                   names of the fields that will be available to
         | 
| 212 | 
            +
                #                   aggregations or post-join if no aggregations are
         | 
| 213 | 
            +
                #                   specified.
         | 
| 214 | 
            +
                #
         | 
| 215 | 
            +
                # Example:
         | 
| 216 | 
            +
                #     assembly 'join_left_right' do
         | 
| 217 | 
            +
                #       left_join 'left', 'right', :on => ['key1', 'key2'] do
         | 
| 218 | 
            +
                #         sum 'val1', 'val2', :type => :long
         | 
| 219 | 
            +
                #       end
         | 
| 220 | 
            +
                #     end
         | 
| 221 | 
            +
                def left_join(*args_with_options, &block)
         | 
| 222 | 
            +
                  options = args_with_options.extract_options!
         | 
| 211 223 | 
             
                  options[:joiner] = :left
         | 
| 212 | 
            -
                   | 
| 213 | 
            -
                  join(* | 
| 224 | 
            +
                  args_with_options << options
         | 
| 225 | 
            +
                  join(*args_with_options, &block)
         | 
| 214 226 | 
             
                end
         | 
| 215 227 |  | 
| 216 | 
            -
                 | 
| 217 | 
            -
             | 
| 228 | 
            +
                # Builds a right join (CoGroup) pipe. Requires a list of assembly names to
         | 
| 229 | 
            +
                # join and :on to specify the group_fields.
         | 
| 230 | 
            +
                #
         | 
| 231 | 
            +
                # The named options are:
         | 
| 232 | 
            +
                # [on] The keys of the join, an array of strings if they are the same in
         | 
| 233 | 
            +
                #      all inputs, or a hash mapping assembly names to key names if they
         | 
| 234 | 
            +
                #      differ across inputs.
         | 
| 235 | 
            +
                # [declared_fields] By default, a deduplicated array of incoming field
         | 
| 236 | 
            +
                #                   names (see Cascading::dedup_fields).  Specifies the
         | 
| 237 | 
            +
                #                   names of the fields that will be available to
         | 
| 238 | 
            +
                #                   aggregations or post-join if no aggregations are
         | 
| 239 | 
            +
                #                   specified.
         | 
| 240 | 
            +
                #
         | 
| 241 | 
            +
                # Example:
         | 
| 242 | 
            +
                #     assembly 'join_left_right' do
         | 
| 243 | 
            +
                #       right_join 'left', 'right', :on => ['key1', 'key2'] do
         | 
| 244 | 
            +
                #         sum 'val1', 'val2', :type => :long
         | 
| 245 | 
            +
                #       end
         | 
| 246 | 
            +
                #     end
         | 
| 247 | 
            +
                def right_join(*args_with_options, &block)
         | 
| 248 | 
            +
                  options = args_with_options.extract_options!
         | 
| 218 249 | 
             
                  options[:joiner] = :right
         | 
| 219 | 
            -
                   | 
| 220 | 
            -
                  join(* | 
| 250 | 
            +
                  args_with_options << options
         | 
| 251 | 
            +
                  join(*args_with_options, &block)
         | 
| 221 252 | 
             
                end
         | 
| 222 253 |  | 
| 223 | 
            -
                 | 
| 224 | 
            -
             | 
| 254 | 
            +
                # Builds an outer join (CoGroup) pipe. Requires a list of assembly names to
         | 
| 255 | 
            +
                # join and :on to specify the group_fields.
         | 
| 256 | 
            +
                #
         | 
| 257 | 
            +
                # The named options are:
         | 
| 258 | 
            +
                # [on] The keys of the join, an array of strings if they are the same in
         | 
| 259 | 
            +
                #      all inputs, or a hash mapping assembly names to key names if they
         | 
| 260 | 
            +
                #      differ across inputs.
         | 
| 261 | 
            +
                # [declared_fields] By default, a deduplicated array of incoming field
         | 
| 262 | 
            +
                #                   names (see Cascading::dedup_fields).  Specifies the
         | 
| 263 | 
            +
                #                   names of the fields that will be available to
         | 
| 264 | 
            +
                #                   aggregations or post-join if no aggregations are
         | 
| 265 | 
            +
                #                   specified.
         | 
| 266 | 
            +
                #
         | 
| 267 | 
            +
                # Example:
         | 
| 268 | 
            +
                #     assembly 'join_left_right' do
         | 
| 269 | 
            +
                #       outer_join 'left', 'right', :on => ['key1', 'key2'] do
         | 
| 270 | 
            +
                #         sum 'val1', 'val2', :type => :long
         | 
| 271 | 
            +
                #       end
         | 
| 272 | 
            +
                #     end
         | 
| 273 | 
            +
                def outer_join(*args_with_options, &block)
         | 
| 274 | 
            +
                  options = args_with_options.extract_options!
         | 
| 225 275 | 
             
                  options[:joiner] = :outer
         | 
| 226 | 
            -
                   | 
| 227 | 
            -
                  join(* | 
| 276 | 
            +
                  args_with_options << options
         | 
| 277 | 
            +
                  join(*args_with_options, &block)
         | 
| 228 278 | 
             
                end
         | 
| 229 279 |  | 
| 230 | 
            -
                # Builds a  | 
| 280 | 
            +
                # Builds a child Assembly that branches this Assembly given a name and
         | 
| 281 | 
            +
                # block.
         | 
| 282 | 
            +
                #
         | 
| 283 | 
            +
                # An assembly's name is quite important as it will determine:
         | 
| 284 | 
            +
                # * The sources from which it will read, if any
         | 
| 285 | 
            +
                # * The name to be used in joins or unions downstream
         | 
| 286 | 
            +
                # * The name to be used to sink the output of the assembly downstream
         | 
| 287 | 
            +
                #
         | 
| 288 | 
            +
                # Many branches may be built within an assembly.  The result of a branch is
         | 
| 289 | 
            +
                # the same as the Flow#assembly constructor, an Assembly object.
         | 
| 290 | 
            +
                #
         | 
| 291 | 
            +
                # Example:
         | 
| 292 | 
            +
                #     assembly 'some_work' do
         | 
| 293 | 
            +
                #       ...
         | 
| 294 | 
            +
                #
         | 
| 295 | 
            +
                #       branch 'more_work' do
         | 
| 296 | 
            +
                #         ...
         | 
| 297 | 
            +
                #       end
         | 
| 298 | 
            +
                #
         | 
| 299 | 
            +
                #       branch 'yet_more_work' do
         | 
| 300 | 
            +
                #         ...
         | 
| 301 | 
            +
                #       end
         | 
| 302 | 
            +
                #     end
         | 
| 231 303 | 
             
                def branch(name, &block)
         | 
| 232 304 | 
             
                  raise "Could not build branch '#{name}'; block required" unless block_given?
         | 
| 233 305 | 
             
                  assembly = Assembly.new(name, self, @outgoing_scopes)
         | 
| @@ -236,11 +308,27 @@ module Cascading | |
| 236 308 | 
             
                  assembly
         | 
| 237 309 | 
             
                end
         | 
| 238 310 |  | 
| 239 | 
            -
                # Builds a new GroupBy pipe that groups on the fields given in | 
| 240 | 
            -
                #  | 
| 241 | 
            -
                 | 
| 242 | 
            -
             | 
| 243 | 
            -
             | 
| 311 | 
            +
                # Builds a new GroupBy pipe that groups on the fields given in
         | 
| 312 | 
            +
                # args_with_options. The block passed to this method will be evaluated in
         | 
| 313 | 
            +
                # the context of Aggregations, not Assembly.
         | 
| 314 | 
            +
                #
         | 
| 315 | 
            +
                # The named options are:
         | 
| 316 | 
            +
                # [sort_by] Optional keys for within-group sort.
         | 
| 317 | 
            +
                # [reverse] Boolean that can reverse the order of within-group sorting
         | 
| 318 | 
            +
                #           (only makes sense given :sort_by keys).
         | 
| 319 | 
            +
                #
         | 
| 320 | 
            +
                # Example:
         | 
| 321 | 
            +
                #     assembly 'total' do
         | 
| 322 | 
            +
                #       ...
         | 
| 323 | 
            +
                #       insert 'const' => 1
         | 
| 324 | 
            +
                #       group_by 'const' do
         | 
| 325 | 
            +
                #         count
         | 
| 326 | 
            +
                #         sum 'val1', 'val2', :type => :long
         | 
| 327 | 
            +
                #       end
         | 
| 328 | 
            +
                #       discard 'const'
         | 
| 329 | 
            +
                #     end
         | 
| 330 | 
            +
                def group_by(*args_with_options, &block)
         | 
| 331 | 
            +
                  options, group_fields = args_with_options.extract_options!, fields(args_with_options)
         | 
| 244 332 | 
             
                  sort_fields = fields(options[:sort_by])
         | 
| 245 333 | 
             
                  reverse = options[:reverse]
         | 
| 246 334 |  | 
| @@ -251,16 +339,31 @@ module Cascading | |
| 251 339 | 
             
                # Unifies multiple incoming pipes sharing the same field structure using a
         | 
| 252 340 | 
             
                # GroupBy.  Accepts :on like join and :sort_by and :reverse like group_by,
         | 
| 253 341 | 
             
                # as well as a block which may be used for a sequence of Every
         | 
| 254 | 
            -
                # aggregations.
         | 
| 342 | 
            +
                # aggregations.  The block passed to this method will be evaluated in the
         | 
| 343 | 
            +
                # context of Aggregations, not Assembly.
         | 
| 255 344 | 
             
                #
         | 
| 256 345 | 
             
                # By default, groups only on the first field (see line 189 of GroupBy.java)
         | 
| 257 | 
            -
                 | 
| 258 | 
            -
             | 
| 346 | 
            +
                #
         | 
| 347 | 
            +
                # The named options are:
         | 
| 348 | 
            +
                # [on] The keys of the union, which defaults to the first field in the
         | 
| 349 | 
            +
                #      first input assembly.
         | 
| 350 | 
            +
                # [sort_by] Optional keys for sorting.
         | 
| 351 | 
            +
                # [reverse] Boolean that can reverse the order of sorting
         | 
| 352 | 
            +
                #           (only makes sense given :sort_by keys).
         | 
| 353 | 
            +
                #
         | 
| 354 | 
            +
                # Example:
         | 
| 355 | 
            +
                #     assembly 'union_left_right' do
         | 
| 356 | 
            +
                #       union 'left', 'right' do
         | 
| 357 | 
            +
                #         sum 'val1', 'val2', :type => :long
         | 
| 358 | 
            +
                #       end
         | 
| 359 | 
            +
                #     end
         | 
| 360 | 
            +
                def union(*args_with_options, &block)
         | 
| 361 | 
            +
                  options, assembly_names = args_with_options.extract_options!, args_with_options
         | 
| 259 362 | 
             
                  group_fields = fields(options[:on])
         | 
| 260 363 | 
             
                  sort_fields = fields(options[:sort_by])
         | 
| 261 364 | 
             
                  reverse = options[:reverse]
         | 
| 262 365 |  | 
| 263 | 
            -
                  pipes, _ = populate_incoming_scopes( | 
| 366 | 
            +
                  pipes, _ = populate_incoming_scopes(assembly_names)
         | 
| 264 367 |  | 
| 265 368 | 
             
                  # Must provide group_fields to ensure field name propagation
         | 
| 266 369 | 
             
                  group_fields = fields(@incoming_scopes.first.values_fields.get(0)) unless group_fields
         | 
| @@ -273,10 +376,15 @@ module Cascading | |
| 273 376 | 
             
                end
         | 
| 274 377 | 
             
                alias :union_pipes :union
         | 
| 275 378 |  | 
| 276 | 
            -
                # Allows you to plugin c.p.SubAssemblies to  | 
| 277 | 
            -
                #  | 
| 278 | 
            -
                #  | 
| 279 | 
            -
                # | 
| 379 | 
            +
                # Allows you to plug in c.p.SubAssemblies to an Assembly under certain
         | 
| 380 | 
            +
                # assumptions.  Note the default is to extend the tail pipe of this
         | 
| 381 | 
            +
                # Assembly using a linear SubAssembly.  See SubAssembly class for details.
         | 
| 382 | 
            +
                #
         | 
| 383 | 
            +
                # Example:
         | 
| 384 | 
            +
                #     assembly 'id_rows' do
         | 
| 385 | 
            +
                #       ...
         | 
| 386 | 
            +
                #       sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id'))
         | 
| 387 | 
            +
                #     end
         | 
| 280 388 | 
             
                def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope])
         | 
| 281 389 | 
             
                  sub_assembly = SubAssembly.new(self, sub_assembly)
         | 
| 282 390 | 
             
                  sub_assembly.finalize(pipes, incoming_scopes)
         | 
| @@ -287,17 +395,24 @@ module Cascading | |
| 287 395 | 
             
                  sub_assembly
         | 
| 288 396 | 
             
                end
         | 
| 289 397 |  | 
| 290 | 
            -
                # Builds a basic  | 
| 291 | 
            -
                #  | 
| 398 | 
            +
                # Builds a basic each pipe and adds it to the current Assembly.  Default
         | 
| 399 | 
            +
                # arguments are all_fields, a default inherited from c.o.Each.  Exactly one
         | 
| 400 | 
            +
                # of :function and :filter must be specified and filters do not support an
         | 
| 401 | 
            +
                # :output selector.
         | 
| 402 | 
            +
                #
         | 
| 403 | 
            +
                # The named options are:
         | 
| 404 | 
            +
                # [filter] A Cascading Filter, mutually exclusive with :function.
         | 
| 405 | 
            +
                # [function] A Cascading Function, mutually exclusive with :filter.
         | 
| 406 | 
            +
                # [output] c.p.Each output selector, only valid with :function.
         | 
| 407 | 
            +
                #
         | 
| 292 408 | 
             
                # Example:
         | 
| 293 | 
            -
                # | 
| 294 | 
            -
                 | 
| 295 | 
            -
             | 
| 296 | 
            -
             | 
| 297 | 
            -
                   | 
| 298 | 
            -
                  out_fields = fields(options[:output])
         | 
| 299 | 
            -
             | 
| 409 | 
            +
                #    each fields(input_fields), :function => Java::CascadingOperation::Identity.new
         | 
| 410 | 
            +
                #    each 'field1', 'field2', :function => Java::CascadingOperation::Identity.new
         | 
| 411 | 
            +
                def each(*args_with_options)
         | 
| 412 | 
            +
                  options, in_fields = args_with_options.extract_options!, fields(args_with_options)
         | 
| 413 | 
            +
                  out_fields = fields(options[:output]) # Default Fields.RESULTS from c.o.Each
         | 
| 300 414 | 
             
                  operation = options[:filter] || options[:function]
         | 
| 415 | 
            +
                  raise 'each requires either :filter or :function' unless operation
         | 
| 301 416 | 
             
                  raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output]
         | 
| 302 417 |  | 
| 303 418 | 
             
                  parameters = [tail_pipe, in_fields, operation, out_fields].compact
         | 
| @@ -308,468 +423,156 @@ module Cascading | |
| 308 423 | 
             
                  each
         | 
| 309 424 | 
             
                end
         | 
| 310 425 |  | 
| 311 | 
            -
                 | 
| 312 | 
            -
                 | 
| 313 | 
            -
                 | 
| 314 | 
            -
                 | 
| 315 | 
            -
                 | 
| 316 | 
            -
                  each fields(args), :function => Java::CascadingOperation::Identity.new
         | 
| 317 | 
            -
                end
         | 
| 318 | 
            -
             | 
| 319 | 
            -
                # Removes the specified fields from the current assembly.
         | 
| 320 | 
            -
                # --
         | 
| 321 | 
            -
                # Example:
         | 
| 322 | 
            -
                #     discard "field1", "field2"
         | 
| 323 | 
            -
                def discard(*args)
         | 
| 324 | 
            -
                  discard_fields = fields(args)
         | 
| 325 | 
            -
                  keep_fields = difference_fields(scope.values_fields, discard_fields)
         | 
| 326 | 
            -
                  project(*keep_fields.to_a)
         | 
| 327 | 
            -
                end
         | 
| 328 | 
            -
             | 
| 329 | 
            -
                # Renames fields according to the mapping provided.
         | 
| 330 | 
            -
                # --
         | 
| 331 | 
            -
                # Example:
         | 
| 332 | 
            -
                #     rename "old_name" => "new_name"
         | 
| 333 | 
            -
                def rename(name_map)
         | 
| 334 | 
            -
                  old_names = scope.values_fields.to_a
         | 
| 335 | 
            -
                  new_names = old_names.map{ |name| name_map[name] || name }
         | 
| 336 | 
            -
                  invalid = name_map.keys.sort - old_names
         | 
| 337 | 
            -
                  raise "invalid names: #{invalid.inspect}" unless invalid.empty?
         | 
| 338 | 
            -
             | 
| 339 | 
            -
                  each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
         | 
| 340 | 
            -
                end
         | 
| 341 | 
            -
             | 
| 342 | 
            -
                def cast(type_map)
         | 
| 343 | 
            -
                  names = type_map.keys.sort
         | 
| 344 | 
            -
                  types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*names))
         | 
| 345 | 
            -
                  fields = fields(names)
         | 
| 346 | 
            -
                  types = types.to_java(java.lang.Class)
         | 
| 347 | 
            -
                  each fields, :function => Java::CascadingOperation::Identity.new(fields, types)
         | 
| 348 | 
            -
                end
         | 
| 349 | 
            -
             | 
| 350 | 
            -
                def copy(*args)
         | 
| 351 | 
            -
                  options = args.extract_options!
         | 
| 352 | 
            -
                  from = args[0] || all_fields
         | 
| 353 | 
            -
                  into = args[1] || options[:into] || all_fields
         | 
| 354 | 
            -
                  each fields(from), :function => Java::CascadingOperation::Identity.new(fields(into)), :output => all_fields
         | 
| 355 | 
            -
                end
         | 
| 356 | 
            -
             | 
| 357 | 
            -
                # A pipe that does nothing.
         | 
| 358 | 
            -
                def pass(*args)
         | 
| 359 | 
            -
                  each all_fields, :function => Java::CascadingOperation::Identity.new
         | 
| 360 | 
            -
                end
         | 
| 426 | 
            +
                include Operations
         | 
| 427 | 
            +
                include IdentityOperations
         | 
| 428 | 
            +
                include FilterOperations
         | 
| 429 | 
            +
                include RegexOperations
         | 
| 430 | 
            +
                include TextOperations
         | 
| 361 431 |  | 
| 362 | 
            -
                 | 
| 363 | 
            -
             | 
| 364 | 
            -
             | 
| 432 | 
            +
                # Builds an each assertion pipe given a c.o.a.Assertion and adds it to the
         | 
| 433 | 
            +
                # current Assembly.
         | 
| 434 | 
            +
                #
         | 
| 435 | 
            +
                # The named options are:
         | 
| 436 | 
            +
                # [level] The assertion level; defaults to strict.
         | 
| 437 | 
            +
                def assert(assertion, options = {})
         | 
| 365 438 | 
             
                  assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
         | 
| 366 439 |  | 
| 367 440 | 
             
                  parameters = [tail_pipe, assertion_level, assertion]
         | 
| 368 441 | 
             
                  make_pipe(Java::CascadingPipe::Each, parameters)
         | 
| 369 442 | 
             
                end
         | 
| 370 443 |  | 
| 371 | 
            -
                # Builds a  | 
| 372 | 
            -
                 | 
| 373 | 
            -
             | 
| 374 | 
            -
                # output.
         | 
| 375 | 
            -
                #
         | 
| 376 | 
            -
                # The other named options are:
         | 
| 377 | 
            -
                # * <tt>:print_fields</tt> a boolean. If is set to true, then it prints every 10 tuples.
         | 
| 378 | 
            -
                #
         | 
| 379 | 
            -
                def debug(*args)
         | 
| 380 | 
            -
                  options = args.extract_options!
         | 
| 381 | 
            -
                  print_fields = options[:print_fields] || true
         | 
| 382 | 
            -
                  parameters = [print_fields].compact
         | 
| 383 | 
            -
                  debug = Java::CascadingOperation::Debug.new(*parameters)
         | 
| 384 | 
            -
                  debug.print_tuple_every = options[:tuple_interval] || 1
         | 
| 385 | 
            -
                  debug.print_fields_every = options[:fields_interval] || 10
         | 
| 386 | 
            -
                  each(all_fields, :filter => debug)
         | 
| 387 | 
            -
                end
         | 
| 388 | 
            -
             | 
| 389 | 
            -
                # Builds a pipe that assert the size of the tuple is the size specified in parameter.
         | 
| 390 | 
            -
                #
         | 
| 391 | 
            -
                # The method accept an unique uname argument : a number indicating the size expected.
         | 
| 392 | 
            -
                def assert_size_equals(*args)
         | 
| 393 | 
            -
                  options = args.extract_options!
         | 
| 394 | 
            -
                  assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(args[0])
         | 
| 444 | 
            +
                # Builds a pipe that asserts the size of the tuple is the specified size.
         | 
| 445 | 
            +
                def assert_size_equals(size, options = {})
         | 
| 446 | 
            +
                  assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size)
         | 
| 395 447 | 
             
                  assert(assertion, options)
         | 
| 396 448 | 
             
                end
         | 
| 397 449 |  | 
| 398 | 
            -
                #  | 
| 399 | 
            -
                def assert_not_null( | 
| 400 | 
            -
                  options = args.extract_options!
         | 
| 450 | 
            +
                # Builds a pipe that asserts none of the fields in the tuple are null.
         | 
| 451 | 
            +
                def assert_not_null(options = {})
         | 
| 401 452 | 
             
                  assertion = Java::CascadingOperationAssertion::AssertNotNull.new
         | 
| 402 453 | 
             
                  assert(assertion, options)
         | 
| 403 454 | 
             
                end
         | 
| 404 455 |  | 
| 405 | 
            -
                 | 
| 406 | 
            -
                # using a specified regex pattern.
         | 
| 407 | 
            -
                #
         | 
| 408 | 
            -
                # If provided, the unamed arguments must be the fields to be parsed. If not provided, then all incoming
         | 
| 409 | 
            -
                # fields are used.
         | 
| 410 | 
            -
                #
         | 
| 411 | 
            -
                # The named options are:
         | 
| 412 | 
            -
                # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for parsing the argument fields.
         | 
| 413 | 
            -
                # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
         | 
| 414 | 
            -
                def parse(*args)
         | 
| 415 | 
            -
                    options = args.extract_options!
         | 
| 416 | 
            -
                    fields = args || all_fields
         | 
| 417 | 
            -
                    pattern = options[:pattern]
         | 
| 418 | 
            -
                    output = options[:output] || all_fields
         | 
| 419 | 
            -
                    each(fields, :function => regex_parser(pattern, options), :output => output)
         | 
| 420 | 
            -
                end
         | 
| 456 | 
            +
                private
         | 
| 421 457 |  | 
| 422 | 
            -
                 | 
| 423 | 
            -
             | 
| 424 | 
            -
             | 
| 425 | 
            -
                # The second unnamed argument is an array of strings indicating the fields receiving the result of the split.
         | 
| 426 | 
            -
                #
         | 
| 427 | 
            -
                # The named options are:
         | 
| 428 | 
            -
                # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
         | 
| 429 | 
            -
                # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
         | 
| 430 | 
            -
                def split(*args)
         | 
| 431 | 
            -
                  options = args.extract_options!
         | 
| 432 | 
            -
                  fields = options[:into] || args[1]
         | 
| 433 | 
            -
                  pattern = options[:pattern] || /[.,]*\s+/
         | 
| 434 | 
            -
                  output = options[:output] || all_fields
         | 
| 435 | 
            -
                  each(args[0], :function => regex_splitter(fields, :pattern => pattern), :output=>output)
         | 
| 436 | 
            -
                end
         | 
| 437 | 
            -
             | 
| 438 | 
            -
                # Builds a pipe that splits a field into new rows, using a specified regular expression.
         | 
| 439 | 
            -
                #
         | 
| 440 | 
            -
                # The first unnamed argument is the field to be split.
         | 
| 441 | 
            -
                # The second unnamed argument is the field receiving the result of the split.
         | 
| 442 | 
            -
                #
         | 
| 443 | 
            -
                # The named options are:
         | 
| 444 | 
            -
                # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for splitting the argument fields.
         | 
| 445 | 
            -
                # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
         | 
| 446 | 
            -
                def split_rows(*args)
         | 
| 447 | 
            -
                  options = args.extract_options!
         | 
| 448 | 
            -
                  fields = options[:into] || args[1]
         | 
| 449 | 
            -
                  pattern = options[:pattern] || /[.,]*\s+/
         | 
| 450 | 
            -
                  output = options[:output] || all_fields
         | 
| 451 | 
            -
                  each(args[0], :function => regex_split_generator(fields, :pattern => pattern), :output=>output)
         | 
| 452 | 
            -
                end
         | 
| 453 | 
            -
             | 
| 454 | 
            -
                # Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression.
         | 
| 455 | 
            -
                #
         | 
| 456 | 
            -
                # The first unnamed argument is the field to be matched against.
         | 
| 457 | 
            -
                # The second unnamed argument is the field receiving the result of the match.
         | 
| 458 | 
            -
                #
         | 
| 459 | 
            -
                # The named options are:
         | 
| 460 | 
            -
                # * <tt>:pattern</tt> a string or regex. Specifies the regular expression used for matching the argument fields.
         | 
| 461 | 
            -
                # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
         | 
| 462 | 
            -
                def match_rows(*args)
         | 
| 463 | 
            -
                  options = args.extract_options!
         | 
| 464 | 
            -
                  fields = options[:into] || args[1]
         | 
| 465 | 
            -
                  pattern = options[:pattern] || /[\w]+/
         | 
| 466 | 
            -
                  output = options[:output] || all_fields
         | 
| 467 | 
            -
                  each(args[0], :function => regex_generator(fields, :pattern => pattern), :output=>output)
         | 
| 468 | 
            -
                end
         | 
| 469 | 
            -
             | 
| 470 | 
            -
                # Builds a pipe that parses the specified field as a date using the provided format string.
         | 
| 471 | 
            -
                # The unnamed argument specifies the field to parse.
         | 
| 472 | 
            -
                #
         | 
| 473 | 
            -
                # The named options are:
         | 
| 474 | 
            -
                # * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
         | 
| 475 | 
            -
                # the input argument.
         | 
| 476 | 
            -
                # * <tt>:pattern</tt> a string. Specifies the date format.
         | 
| 477 | 
            -
                # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
         | 
| 478 | 
            -
                def parse_date(*args)
         | 
| 479 | 
            -
                  options = args.extract_options!
         | 
| 480 | 
            -
                  field = options[:into] || "#{args[0]}_parsed"
         | 
| 481 | 
            -
                  output = options[:output] || all_fields
         | 
| 482 | 
            -
                  pattern = options[:pattern] || "yyyy/MM/dd"
         | 
| 483 | 
            -
             | 
| 484 | 
            -
                  each args[0], :function => date_parser(field, pattern), :output => output
         | 
| 485 | 
            -
                end
         | 
| 458 | 
            +
                def make_pipe(type, parameters)
         | 
| 459 | 
            +
                  @tail_pipe = type.new(*parameters)
         | 
| 460 | 
            +
                  @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
         | 
| 486 461 |  | 
| 487 | 
            -
             | 
| 488 | 
            -
                #
         | 
| 489 | 
            -
                # The unnamed argument specifies the field to format.
         | 
| 490 | 
            -
                #
         | 
| 491 | 
            -
                # The named options are:
         | 
| 492 | 
            -
                # * <tt>:into</tt> a string. It specifies the receiving field. By default, it will be named after
         | 
| 493 | 
            -
                # the input argument.
         | 
| 494 | 
            -
                # * <tt>:pattern</tt> a string. Specifies the date format.
         | 
| 495 | 
            -
                # * <tt>:timezone</tt> a string.  Specifies the timezone (defaults to UTC).
         | 
| 496 | 
            -
                # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
         | 
| 497 | 
            -
                def format_date(*args)
         | 
| 498 | 
            -
                  options = args.extract_options!
         | 
| 499 | 
            -
                  field = options[:into] || "#{args[0]}_formatted"
         | 
| 500 | 
            -
                  pattern = options[:pattern] || "yyyy/MM/dd"
         | 
| 501 | 
            -
                  output = options[:output] || all_fields
         | 
| 502 | 
            -
             | 
| 503 | 
            -
                  each args[0], :function => date_formatter(field, pattern, options[:timezone]), :output => output
         | 
| 462 | 
            +
                  tail_pipe
         | 
| 504 463 | 
             
                end
         | 
| 505 464 |  | 
| 506 | 
            -
                 | 
| 507 | 
            -
             | 
| 508 | 
            -
             | 
| 509 | 
            -
             | 
| 510 | 
            -
             | 
| 511 | 
            -
             | 
| 512 | 
            -
             | 
| 513 | 
            -
                # * <tt>:replacement</tt> a string. Specifies the replacement.
         | 
| 514 | 
            -
                # * <tt>:output</tt> a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
         | 
| 515 | 
            -
                def replace(*args)
         | 
| 516 | 
            -
                  options = args.extract_options!
         | 
| 517 | 
            -
             | 
| 518 | 
            -
                  pattern = options[:pattern] || args[1]
         | 
| 519 | 
            -
                  replacement = options[:replacement] || args[2]
         | 
| 520 | 
            -
                  into = options[:into] || "#{args[0]}_replaced"
         | 
| 521 | 
            -
                  output = options[:output] || all_fields
         | 
| 522 | 
            -
             | 
| 523 | 
            -
                  each args[0], :function => regex_replace(into, pattern, replacement), :output => output
         | 
| 524 | 
            -
                end
         | 
| 465 | 
            +
                def populate_incoming_scopes(assembly_names, group_fields_args = {})
         | 
| 466 | 
            +
                  # NOTE: this overrides the existing incoming_scopes, which changes the
         | 
| 467 | 
            +
                  # way describe will function on this assembly
         | 
| 468 | 
            +
                  pipes, @incoming_scopes, group_fields = [], [], []
         | 
| 469 | 
            +
                  assembly_names.each do |assembly_name|
         | 
| 470 | 
            +
                    assembly = parent_flow.find_child(assembly_name)
         | 
| 471 | 
            +
                    raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
         | 
| 525 472 |  | 
| 526 | 
            -
             | 
| 527 | 
            -
             | 
| 528 | 
            -
             | 
| 529 | 
            -
                # and as values, the values they must contain. For example:
         | 
| 530 | 
            -
                #
         | 
| 531 | 
            -
                #       insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
         | 
| 532 | 
            -
                #
         | 
| 533 | 
            -
                # will insert two new fields: a field _who_ containing the string "Grégoire", and a field _when_ containing
         | 
| 534 | 
            -
                # the formatted current date.
         | 
| 535 | 
            -
                # The method outputs all fields.
         | 
| 536 | 
            -
                # The named options are:
         | 
| 537 | 
            -
                def insert(args)
         | 
| 538 | 
            -
                  args.keys.sort.each do |field_name|
         | 
| 539 | 
            -
                    value = args[field_name]
         | 
| 540 | 
            -
             | 
| 541 | 
            -
                    if value.kind_of?(ExprStub)
         | 
| 542 | 
            -
                      value.validate_scope(scope)
         | 
| 543 | 
            -
                      each all_fields, :function => expression_function(field_name, :expression => value.expression, :parameters => value.types), :output => all_fields
         | 
| 544 | 
            -
                    else
         | 
| 545 | 
            -
                      each all_fields, :function => insert_function([field_name], :values => [value]), :output => all_fields
         | 
| 546 | 
            -
                    end
         | 
| 473 | 
            +
                    pipes << assembly.tail_pipe
         | 
| 474 | 
            +
                    @incoming_scopes << assembly.scope
         | 
| 475 | 
            +
                    group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
         | 
| 547 476 | 
             
                  end
         | 
| 477 | 
            +
                  [pipes, group_fields]
         | 
| 548 478 | 
             
                end
         | 
| 549 479 |  | 
| 550 | 
            -
                 | 
| 551 | 
            -
             | 
| 552 | 
            -
             | 
| 553 | 
            -
                #
         | 
| 554 | 
            -
                # The named options are:
         | 
| 555 | 
            -
                # * <tt>:pattern</tt> a string. Specifies a regular expression pattern used to filter the tuples. If this
         | 
| 556 | 
            -
                # option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
         | 
| 557 | 
            -
                # * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
         | 
| 558 | 
            -
                # same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
         | 
| 559 | 
            -
                # expression-based. This is incompatible with the _pattern_ option.
         | 
| 560 | 
            -
                # * <tt>:validate</tt> a boolean.  Passed into Cascading#expr to enable or disable
         | 
| 561 | 
            -
                # expression validation.  Defaults to true.
         | 
| 562 | 
            -
                # * <tt>:validate_with</tt> a hash.  Actual arguments used by Cascading#expr for
         | 
| 563 | 
            -
                # expression validation.  Defaults to {}.
         | 
| 564 | 
            -
                def filter(*args)
         | 
| 565 | 
            -
                  options = args.extract_options!
         | 
| 566 | 
            -
                  from = options.delete(:from) || all_fields
         | 
| 567 | 
            -
                  expression = options.delete(:expression) || args.shift
         | 
| 568 | 
            -
                  regex = options.delete(:pattern)
         | 
| 569 | 
            -
                  validate = options.has_key?(:validate) ? options.delete(:validate) : true
         | 
| 570 | 
            -
                  validate_with = options.has_key?(:validate_with) ? options.delete(:validate_with) : {}
         | 
| 571 | 
            -
             | 
| 572 | 
            -
                  if expression
         | 
| 573 | 
            -
                    stub = expr(expression, { :validate => validate, :validate_with => validate_with })
         | 
| 574 | 
            -
                    types, expression = stub.types, stub.expression
         | 
| 575 | 
            -
             | 
| 576 | 
            -
                    stub.validate_scope(scope)
         | 
| 577 | 
            -
                    each from, :filter => expression_filter(
         | 
| 578 | 
            -
                      :parameters => types,
         | 
| 579 | 
            -
                      :expression => expression
         | 
| 580 | 
            -
                    )
         | 
| 581 | 
            -
                  elsif regex
         | 
| 582 | 
            -
                    each from, :filter => regex_filter(regex, options)
         | 
| 583 | 
            -
                  end
         | 
| 584 | 
            -
                end
         | 
| 480 | 
            +
                def apply_aggregations(group, incoming_scopes, &block)
         | 
| 481 | 
            +
                  aggregations = Aggregations.new(self, group, incoming_scopes)
         | 
| 482 | 
            +
                  aggregations.instance_eval(&block) if block_given?
         | 
| 585 483 |  | 
| 586 | 
            -
             | 
| 587 | 
            -
                   | 
| 588 | 
            -
             | 
| 589 | 
            -
             | 
| 590 | 
            -
             | 
| 484 | 
            +
                  # Sorting of any type means that we cannot use the AggregateBy optimization
         | 
| 485 | 
            +
                  if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
         | 
| 486 | 
            +
                    grouping_fields = group.key_selectors.values.first
         | 
| 487 | 
            +
                    group.key_selectors.values.each do |key_fields|
         | 
| 488 | 
            +
                      raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
         | 
| 489 | 
            +
                    end
         | 
| 591 490 |  | 
| 592 | 
            -
             | 
| 593 | 
            -
             | 
| 594 | 
            -
             | 
| 595 | 
            -
             | 
| 596 | 
            -
             | 
| 491 | 
            +
                    aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
         | 
| 492 | 
            +
                      name,
         | 
| 493 | 
            +
                      group.previous,
         | 
| 494 | 
            +
                      grouping_fields,
         | 
| 495 | 
            +
                      aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
         | 
| 496 | 
            +
                    ), group.previous, incoming_scopes)
         | 
| 597 497 |  | 
| 598 | 
            -
             | 
| 599 | 
            -
             | 
| 600 | 
            -
             | 
| 601 | 
            -
             | 
| 602 | 
            -
             | 
| 603 | 
            -
                # * <tt>:expression</tt> a string. Specifies a Janino expression used to filter the tuples. This option has the
         | 
| 604 | 
            -
                # same effect as providing it as the first unnamed argument. If this option is provided, then the filter is Janino
         | 
| 605 | 
            -
                # expression-based.
         | 
| 606 | 
            -
                # * <tt>:validate</tt> a boolean.  Passed into Cascading#expr to enable or disable
         | 
| 607 | 
            -
                # expression validation.  Defaults to true.
         | 
| 608 | 
            -
                # * <tt>:validate_with</tt> a hash.  Actual arguments used by Cascading#expr for
         | 
| 609 | 
            -
                # expression validation.  Defaults to {}.
         | 
| 610 | 
            -
                def reject(*args)
         | 
| 611 | 
            -
                  options = args.extract_options
         | 
| 612 | 
            -
                  raise "Regex not allowed" if options && options[:pattern]
         | 
| 613 | 
            -
             | 
| 614 | 
            -
                  filter(*args)
         | 
| 615 | 
            -
                end
         | 
| 498 | 
            +
                    aggregate_by
         | 
| 499 | 
            +
                  else
         | 
| 500 | 
            +
                    aggregations.finalize if block_given?
         | 
| 501 | 
            +
                    @tail_pipe = aggregations.tail_pipe
         | 
| 502 | 
            +
                    @outgoing_scopes[name] = aggregations.scope
         | 
| 616 503 |  | 
| 617 | 
            -
             | 
| 618 | 
            -
                #
         | 
| 619 | 
            -
                # The first unnamed argument, if provided, is a filtering expression (using the Janino syntax).
         | 
| 620 | 
            -
                #
         | 
| 621 | 
            -
                # The named options are:
         | 
| 622 | 
            -
                # * <tt>:expression</tt> a string. Specifies a Janino expression used to select the tuples. This option has the
         | 
| 623 | 
            -
                # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino
         | 
| 624 | 
            -
                # expression-based.
         | 
| 625 | 
            -
                # * <tt>:validate</tt> a boolean.  Passed into Cascading#expr to enable or disable
         | 
| 626 | 
            -
                # expression validation.  Defaults to true.
         | 
| 627 | 
            -
                # * <tt>:validate_with</tt> a hash.  Actual arguments used by Cascading#expr for
         | 
| 628 | 
            -
                # expression validation.  Defaults to {}.
         | 
| 629 | 
            -
                def where(*args)
         | 
| 630 | 
            -
                  options = args.extract_options
         | 
| 631 | 
            -
                  raise "Regex not allowed" if options && options[:pattern]
         | 
| 632 | 
            -
             | 
| 633 | 
            -
                  if options[:expression]
         | 
| 634 | 
            -
                    _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
         | 
| 635 | 
            -
                    options[:expression] = "#{imports}!(#{expr})"
         | 
| 636 | 
            -
                  elsif args[0]
         | 
| 637 | 
            -
                    _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
         | 
| 638 | 
            -
                    args[0] = "#{imports}!(#{expr})"
         | 
| 504 | 
            +
                    group
         | 
| 639 505 | 
             
                  end
         | 
| 640 | 
            -
             | 
| 641 | 
            -
                  filter(*args)
         | 
| 642 506 | 
             
                end
         | 
| 643 507 |  | 
| 644 | 
            -
                 | 
| 645 | 
            -
             | 
| 646 | 
            -
                # The named options are:
         | 
| 647 | 
            -
                # * <tt>:from</tt> a string or array of strings. Specifies the input fields.
         | 
| 648 | 
            -
                # * <tt>:expression</tt> a string. The Janino expression.
         | 
| 649 | 
            -
                # * <tt>:into</tt> a string. Specifies the name of the field to insert with the result of the evaluation.
         | 
| 650 | 
            -
                # * <tt>:parameters</tt> a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
         | 
| 651 | 
            -
                def eval_expression(*args)
         | 
| 652 | 
            -
                  options = args.extract_options!
         | 
| 653 | 
            -
             | 
| 654 | 
            -
                  into = options.delete(:into)
         | 
| 655 | 
            -
                  from = options.delete(:from) || all_fields
         | 
| 656 | 
            -
                  output = options.delete(:output) || all_fields
         | 
| 657 | 
            -
                  options[:expression] ||= args.shift
         | 
| 658 | 
            -
                  options[:parameters] ||= args.shift
         | 
| 659 | 
            -
             | 
| 660 | 
            -
                  each from, :function => expression_function(into, options), :output=>output
         | 
| 661 | 
            -
                end
         | 
| 508 | 
            +
                def prepare_join(assembly_names, options, &block)
         | 
| 509 | 
            +
                  pipes, _ = populate_incoming_scopes(assembly_names)
         | 
| 662 510 |  | 
| 663 | 
            -
             | 
| 664 | 
            -
             | 
| 665 | 
            -
                # The method accepts an optional unnamed argument specifying the fields to base the distinct on
         | 
| 666 | 
            -
                # (all fields, by default).
         | 
| 667 | 
            -
                def distinct(*args)
         | 
| 668 | 
            -
                  raise "Distinct is badly broken"
         | 
| 669 | 
            -
                  fields = args[0] || all_fields
         | 
| 670 | 
            -
                  group_by *fields
         | 
| 671 | 
            -
                  pass
         | 
| 672 | 
            -
                end
         | 
| 673 | 
            -
             | 
| 674 | 
            -
                def join_fields(*args)
         | 
| 675 | 
            -
                  options = args.extract_options!
         | 
| 676 | 
            -
                  output = options[:output] || all_fields
         | 
| 511 | 
            +
                  group_fields_args = options[:on]
         | 
| 512 | 
            +
                  raise 'join requires :on parameter' unless group_fields_args
         | 
| 677 513 |  | 
| 678 | 
            -
                   | 
| 679 | 
            -
             | 
| 514 | 
            +
                  if group_fields_args.kind_of?(String)
         | 
| 515 | 
            +
                    group_fields_args = [group_fields_args]
         | 
| 516 | 
            +
                  end
         | 
| 680 517 |  | 
| 681 | 
            -
             | 
| 682 | 
            -
             | 
| 683 | 
            -
             | 
| 684 | 
            -
             | 
| 685 | 
            -
             | 
| 686 | 
            -
             | 
| 687 | 
            -
             | 
| 688 | 
            -
             | 
| 689 | 
            -
             | 
| 690 | 
            -
             | 
| 691 | 
            -
                #   in the order provided.
         | 
| 692 | 
            -
                # * <tt>:num_values</tt> an integer specifying the number of fields to
         | 
| 693 | 
            -
                #   ungroup into each output tuple (excluding the key fields).  All input
         | 
| 694 | 
            -
                #   fields will be ungrouped.
         | 
| 695 | 
            -
                # * <tt>:input</tt> an array of field names that specifies the fields to
         | 
| 696 | 
            -
                #   input to UnGroup.  Defaults to all_fields.
         | 
| 697 | 
            -
                # * <tt>:into</tt> an array of field names.  Default set by UnGroup.
         | 
| 698 | 
            -
                # * <tt>:output</tt> an array of field names that specifies the fields to
         | 
| 699 | 
            -
                #   produce as output of UnGroup.  Defaults to all_fields.
         | 
| 700 | 
            -
                def ungroup(*args)
         | 
| 701 | 
            -
                  options = args.extract_options!
         | 
| 702 | 
            -
                  input = options[:input] || all_fields
         | 
| 703 | 
            -
                  into = fields(options[:into])
         | 
| 704 | 
            -
                  output = options[:output] || all_fields
         | 
| 705 | 
            -
                  key = fields(options[:key])
         | 
| 706 | 
            -
             | 
| 707 | 
            -
                  raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
         | 
| 708 | 
            -
                  value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
         | 
| 709 | 
            -
                  num_values = options[:num_values] if options.has_key?(:num_values)
         | 
| 710 | 
            -
             | 
| 711 | 
            -
                  parameters = [into, key, value_selectors, num_values].compact
         | 
| 712 | 
            -
                  each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
         | 
| 713 | 
            -
                end
         | 
| 518 | 
            +
                  group_fields = []
         | 
| 519 | 
            +
                  if group_fields_args.kind_of?(Array)
         | 
| 520 | 
            +
                    pipes.size.times do
         | 
| 521 | 
            +
                      group_fields << fields(group_fields_args)
         | 
| 522 | 
            +
                    end
         | 
| 523 | 
            +
                  elsif group_fields_args.kind_of?(Hash)
         | 
| 524 | 
            +
                    pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
         | 
| 525 | 
            +
                  else
         | 
| 526 | 
            +
                    raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
         | 
| 527 | 
            +
                  end
         | 
| 714 528 |  | 
| 715 | 
            -
             | 
| 716 | 
            -
             | 
| 717 | 
            -
             | 
| 718 | 
            -
             | 
| 719 | 
            -
             | 
| 720 | 
            -
             | 
| 721 | 
            -
                # * <tt>filter</tt> Cascading Filter to apply.
         | 
| 722 | 
            -
                # * <tt>keep_value</tt> Java value to produce when the filter would keep
         | 
| 723 | 
            -
                #   the given input.
         | 
| 724 | 
            -
                # * <tt>remove_value</tt> Java value to produce when the filter would
         | 
| 725 | 
            -
                #   remove the given input.
         | 
| 726 | 
            -
                #
         | 
| 727 | 
            -
                # The named options are:
         | 
| 728 | 
            -
                # * <tt>:into</tt> an output field name, defaulting to 'filter_value'.
         | 
| 729 | 
            -
                # * <tt>:output</tt> an array of field names that specifies the fields to
         | 
| 730 | 
            -
                #   retain in the output tuple.  Defaults to all_fields.
         | 
| 731 | 
            -
                def set_value(input, filter, keep_value, remove_value, params = {})
         | 
| 732 | 
            -
                  into = fields(params[:into] || 'filter_value')
         | 
| 733 | 
            -
                  output = params[:output] || all_fields
         | 
| 734 | 
            -
                  each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output
         | 
| 735 | 
            -
                end
         | 
| 529 | 
            +
                  raise 'join requires non-empty :on parameter' if group_fields_args.empty?
         | 
| 530 | 
            +
                  group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
         | 
| 531 | 
            +
                  incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
         | 
| 532 | 
            +
                  declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
         | 
| 533 | 
            +
                  joiner = options[:joiner]
         | 
| 534 | 
            +
                  is_hash_join = options[:hash] || false
         | 
| 736 535 |  | 
| 737 | 
            -
             | 
| 738 | 
            -
             | 
| 739 | 
            -
             | 
| 740 | 
            -
             | 
| 741 | 
            -
             | 
| 742 | 
            -
             | 
| 743 | 
            -
             | 
| 744 | 
            -
             | 
| 745 | 
            -
             | 
| 746 | 
            -
             | 
| 747 | 
            -
             | 
| 748 | 
            -
             | 
| 749 | 
            -
             | 
| 750 | 
            -
             | 
| 751 | 
            -
             | 
| 752 | 
            -
             | 
| 753 | 
            -
             | 
| 536 | 
            +
                  case joiner
         | 
| 537 | 
            +
                  when :inner, 'inner', nil
         | 
| 538 | 
            +
                    joiner = Java::CascadingPipeJoiner::InnerJoin.new
         | 
| 539 | 
            +
                  when :left,  'left'
         | 
| 540 | 
            +
                    joiner = Java::CascadingPipeJoiner::LeftJoin.new
         | 
| 541 | 
            +
                  when :right, 'right'
         | 
| 542 | 
            +
                    joiner = Java::CascadingPipeJoiner::RightJoin.new
         | 
| 543 | 
            +
                  when :outer, 'outer'
         | 
| 544 | 
            +
                    joiner = Java::CascadingPipeJoiner::OuterJoin.new
         | 
| 545 | 
            +
                  when Array
         | 
| 546 | 
            +
                    joiner = joiner.map do |t|
         | 
| 547 | 
            +
                      case t
         | 
| 548 | 
            +
                      when true,  1, :inner then true
         | 
| 549 | 
            +
                      when false, 0, :outer then false
         | 
| 550 | 
            +
                      else fail "invalid mixed joiner entry: #{t}"
         | 
| 551 | 
            +
                      end
         | 
| 552 | 
            +
                    end
         | 
| 553 | 
            +
                    joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
         | 
| 554 | 
            +
                  end
         | 
| 754 555 |  | 
| 755 | 
            -
             | 
| 756 | 
            -
             | 
| 757 | 
            -
             | 
| 758 | 
            -
             | 
| 759 | 
            -
             | 
| 760 | 
            -
             | 
| 761 | 
            -
             | 
| 762 | 
            -
             | 
| 763 | 
            -
             | 
| 764 | 
            -
             | 
| 765 | 
            -
             | 
| 766 | 
            -
             | 
| 767 | 
            -
             | 
| 768 | 
            -
             | 
| 769 | 
            -
             | 
| 770 | 
            -
             | 
| 771 | 
            -
             | 
| 772 | 
            -
             | 
| 556 | 
            +
                  if is_hash_join
         | 
| 557 | 
            +
                    parameters = [
         | 
| 558 | 
            +
                      pipes.to_java(Java::CascadingPipe::Pipe),
         | 
| 559 | 
            +
                      group_fields,
         | 
| 560 | 
            +
                      declared_fields,
         | 
| 561 | 
            +
                      joiner
         | 
| 562 | 
            +
                    ]
         | 
| 563 | 
            +
                    group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
         | 
| 564 | 
            +
                  else
         | 
| 565 | 
            +
                    result_group_fields = dedup_fields(*group_fields)
         | 
| 566 | 
            +
                    parameters = [
         | 
| 567 | 
            +
                      pipes.to_java(Java::CascadingPipe::Pipe),
         | 
| 568 | 
            +
                      group_fields,
         | 
| 569 | 
            +
                      declared_fields,
         | 
| 570 | 
            +
                      result_group_fields,
         | 
| 571 | 
            +
                      joiner
         | 
| 572 | 
            +
                    ]
         | 
| 573 | 
            +
                    group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
         | 
| 574 | 
            +
                  end
         | 
| 575 | 
            +
                  apply_aggregations(group_assembly, @incoming_scopes, &block)
         | 
| 773 576 | 
             
                end
         | 
| 774 577 | 
             
              end
         | 
| 775 578 | 
             
            end
         |