cascading.jruby 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/HACKING.md +15 -0
  2. data/History.txt +0 -0
  3. data/LICENSE.txt +165 -0
  4. data/README.md +7 -0
  5. data/Rakefile +45 -0
  6. data/bin/make_job +81 -0
  7. data/lib/cascading/assembly.rb +726 -0
  8. data/lib/cascading/base.rb +63 -0
  9. data/lib/cascading/cascade.rb +63 -0
  10. data/lib/cascading/cascading.rb +134 -0
  11. data/lib/cascading/cascading_exception.rb +30 -0
  12. data/lib/cascading/expr_stub.rb +33 -0
  13. data/lib/cascading/ext/array.rb +15 -0
  14. data/lib/cascading/flow.rb +168 -0
  15. data/lib/cascading/operations.rb +204 -0
  16. data/lib/cascading/scope.rb +160 -0
  17. data/lib/cascading.rb +63 -0
  18. data/samples/branch.rb +31 -0
  19. data/samples/cascading.rb +41 -0
  20. data/samples/copy.rb +18 -0
  21. data/samples/data/data2.txt +88799 -0
  22. data/samples/data/data_join1.txt +3 -0
  23. data/samples/data/data_join2.txt +3 -0
  24. data/samples/data/data_join3.txt +3 -0
  25. data/samples/join.rb +32 -0
  26. data/samples/logwordcount.rb +22 -0
  27. data/samples/project.rb +24 -0
  28. data/samples/rename.rb +21 -0
  29. data/samples/scorenames.rb +20 -0
  30. data/samples/splitter.rb +20 -0
  31. data/samples/union.rb +35 -0
  32. data/spec/cascading_spec.rb +100 -0
  33. data/spec/expr_spec.rb +10 -0
  34. data/spec/primary_key_spec.rb +119 -0
  35. data/spec/resource/join_input.txt +3 -0
  36. data/spec/resource/test_input.txt +4 -0
  37. data/spec/scope_spec.rb +174 -0
  38. data/spec/spec.opts +6 -0
  39. data/spec/spec_helper.rb +5 -0
  40. data/spec/spec_util.rb +188 -0
  41. data/src/cascading/jruby/Main.java +38 -0
  42. data/src/cascading/jruby/runner.rb +6 -0
  43. data/tags +238 -0
  44. data/tasks/ann.rake +80 -0
  45. data/tasks/ant.rake +11 -0
  46. data/tasks/bones.rake +20 -0
  47. data/tasks/gem.rake +206 -0
  48. data/tasks/git.rake +40 -0
  49. data/tasks/notes.rake +27 -0
  50. data/tasks/post_load.rake +34 -0
  51. data/tasks/rdoc.rake +50 -0
  52. data/tasks/rubyforge.rake +55 -0
  53. data/tasks/samples.rake +13 -0
  54. data/tasks/setup.rb +300 -0
  55. data/tasks/spec.rake +59 -0
  56. data/tasks/svn.rake +47 -0
  57. data/tasks/test.rake +42 -0
  58. data/test/data/data1.txt +14 -0
  59. data/test/data/data2.txt +14 -0
  60. data/test/test_assembly.rb +321 -0
  61. data/test/test_cascading.rb +49 -0
  62. data/test/test_flow.rb +15 -0
  63. metadata +137 -0
@@ -0,0 +1,63 @@
1
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
2
+ #
3
+ # This is free software. Please see the LICENSE and COPYING files for details.
4
+
5
+ module Cascading
6
# A named node of a tree. Every node records its parent and stores its
# children in a hash keyed by child name.
class Node
  attr_accessor :name, :parent, :children, :last_child

  # name   - key under which this node is stored in its parent's children
  # parent - the enclosing node (callers pass nil for a root)
  def initialize(name, parent)
    @children = {}
    @last_child = nil
    @name = name
    @parent = parent
  end

  # Stores node under node.name (replacing any child already stored under
  # that name), remembers it as the most recently added child and returns it.
  def add_child(node)
    @last_child = @children[node.name] = node
  end

  # Depth-first search by name. Children are visited in hash iteration
  # order and each child's whole subtree is searched before the next
  # sibling is looked at. Returns the first match, or nil when none exists.
  def find_child(name)
    children.each do |key, node|
      return node if key == name
      nested = node.find_child(name)
      return nested if nested
    end
    nil
  end
end
31
+
32
+ # A module to add auto-registration capability
33
# Mixin that keeps a name => instance registry in the @registered instance
# variable of whatever object it is mixed into (the classes in this
# library apply it with `extend`).
module Registerable
  # Every registered instance; an empty array when nothing was registered.
  def all
    return [] if @registered.nil?
    @registered.values
  end

  # Returns key unchanged when it already is an instance of the receiver;
  # otherwise returns the instance registered under key (nil when absent).
  def get(key)
    return key if key.is_a?(self)
    registered[key]
  end

  # Empties the registry when one exists; does nothing otherwise.
  def reset
    @registered && @registered.clear
  end

  # Registers instance under name, replacing any previous entry, and
  # returns instance.
  def add(name, instance)
    registered[name] = instance
  end

  private

  # The registry hash, created on first use.
  def registered
    @registered ||= {}
  end
end
63
+ end
@@ -0,0 +1,63 @@
1
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
2
+ #
3
+ # This is free software. Please see the LICENSE and COPYING files for details.
4
+
5
+ require 'cascading/base'
6
+ require 'yaml'
7
+
8
+ module Cascading
9
# Root node grouping a set of flows. Each new cascade registers itself by
# name on the class (see Registerable).
class Cascade < Cascading::Node
  extend Registerable

  def initialize(name)
    # nil parent: a Cascade cannot have a parent
    super(name, nil)
    self.class.add(name, self)
  end

  # Creates a Flow named name as a child of this cascade, evaluates the
  # block in the flow's context and returns the flow.
  # Raises a RuntimeError when no block is given.
  def flow(name, &block)
    raise "Could not build flow '#{name}'; block required" if block.nil?
    child = Flow.new(name, self)
    add_child(child)
    child.instance_eval(&block)
    child
  end

  # Connects every child flow and writes it to "<dir>/<flow name>.dot".
  def draw(dir, properties = nil)
    @children.each do |flow_name, child|
      connected = child.connect(properties)
      connected.writeDOT("#{dir}/#{flow_name}.dot")
    end
  end

  # Hash of flow name => that flow's sink_metadata.
  def sink_metadata
    metadata = {}
    @children.each { |flow_name, child| metadata[flow_name] = child.sink_metadata }
    metadata
  end

  # Dumps sink_metadata as YAML into file_name (opened for writing).
  def write_sink_metadata(file_name)
    File.open(file_name, 'w') { |out| YAML.dump(sink_metadata, out) }
  end

  # Connects all child flows into a Cascading cascade and runs it.
  # A NativeException is re-raised wrapped in a CascadingException.
  def complete(properties = nil)
    connector = Java::CascadingCascade::CascadeConnector.new
    connector.connect(name, make_flows(@children, properties)).complete
  rescue NativeException => e
    raise CascadingException.new(e, 'Error completing cascade')
  end

  private

  # Connects each flow, attaches the flow's listeners to the connected
  # instance, and returns the instances as a Java array of Flow.
  def make_flows(flows, properties)
    connected = flows.map do |_flow_name, child|
      instance = child.connect(properties)
      child.listeners.each { |listener| instance.addListener(listener) }
      instance
    end
    connected.to_java(Java::CascadingFlow::Flow)
  end
end
63
+ end
@@ -0,0 +1,134 @@
1
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
2
+ #
3
+ # This is free software. Please see the LICENSE and COPYING files for details.
4
+
5
+ require 'cascading/expr_stub'
6
+
7
+ module Cascading
8
# Type symbol => Java class (as returned by java_class).
JAVA_TYPE_MAP = {
  :int    => java.lang.Integer.java_class,
  :long   => java.lang.Long.java_class,
  :bool   => java.lang.Boolean.java_class,
  :double => java.lang.Double.java_class,
  :float  => java.lang.Float.java_class,
  :string => java.lang.String.java_class
}
13
+
14
# Creates a Cascade named name, evaluates the block in its context and
# returns it. Raises a RuntimeError when no block is given.
def cascade(name, &block)
  raise "Could not build cascade '#{name}'; block required" if block.nil?
  built = Cascade.new(name)
  built.instance_eval(&block)
  built
end
20
+
21
+ # For applications built of Flows with no Cascades
22
# Creates a parentless Flow named name, evaluates the block in its
# context and returns it.
# Raises a RuntimeError when no block is given; the check happens before
# the Flow is constructed so nothing is created for a call that cannot
# succeed.
def flow(name, &block)
  raise "Could not build flow '#{name}'; block required" unless block_given?
  flow = Flow.new(name, nil)
  flow.instance_eval(&block)
  flow
end
27
+
28
# Wraps s in an ExprStub; an argument that already is an ExprStub is
# returned unchanged.
def expr(s)
  s.kind_of?(ExprStub) ? s : ExprStub.new(s)
end
32
+
33
+ # Creates a cascading.tuple.Fields instance from a string or an array of strings.
34
# Accepted inputs:
#   nil                   -> nil
#   a Fields instance     -> returned unchanged
#   a one-element Array   -> treated as that single element
#   any other Array       -> must not contain nil (RuntimeError otherwise)
#   anything else         -> wrapped into a single-element Fields
# Integer entries are converted to Java ints when JRUBY_VERSION compares
# greater than '1.2.0'.
def fields(fields)
  return nil if fields.nil?
  return fields if fields.is_a?(Java::CascadingTuple::Fields)

  if fields.is_a?(::Array)
    return fields(fields[0]) if fields.size == 1
    raise "Fields cannot be nil: #{fields.inspect}" if fields.include?(nil)
  end

  # Integer rather than Fixnum: Fixnum is an Integer subclass on old Rubies
  # and no longer exists on current ones, so Integer matches on both.
  comparables = [fields].flatten.map do |f|
    f.kind_of?(Integer) && JRUBY_VERSION > '1.2.0' ? f.to_java(:int) : f
  end
  Java::CascadingTuple::Fields.new(comparables.to_java(java.lang.Comparable))
end
47
+
48
# Returns the Fields::ALL constant of cascading.tuple.Fields.
def all_fields
  Java::CascadingTuple::Fields::ALL
end
51
+
52
# Builds a Fields from the set union of the given field lists: each
# argument is converted with to_a and merged left to right, so every name
# is kept once, at the position where it first appears.
def union_fields(*fields)
  merged = []
  fields.each { |group| merged |= group.to_a }
  fields(merged)
end
55
+
56
# Builds a Fields from the first field list minus every name that occurs
# in any of the following lists (each argument is converted with to_a).
# Called without arguments it passes an empty list to fields() instead of
# failing on nil.
def difference_fields(*fields)
  head, *others = fields
  remaining = others.inject(head.to_a) { |acc, arr| acc - arr.to_a }
  fields(remaining)
end
59
+
60
# Returns the result of selecting all_fields from the given fields object.
def copy_fields(fields)
  selector = all_fields
  fields.select(selector)
end
63
+
64
# Builds a Fields whose names are the names of all given fields objects
# (each converted with to_a), made unique by dedup_field_names.
# Raises a RuntimeError unless every argument answers true to
# is_declarator?.
def dedup_fields(*fields)
  unless fields.all? { |f| f.is_declarator? }
    raise 'Can only be applied to declarators'
  end
  name_lists = fields.map { |f| f.to_a }
  fields(dedup_field_names(*name_lists))
end
68
+
69
# Flattens the given name lists into one list of unique names. A name that
# is already taken is replaced by the alternative search_field_name
# proposes.
#
# Every name is checked against all names accepted so far, including the
# ones accepted earlier from the same list, so a renamed entry can never
# collide with a later entry of its own list, e.g.
#   dedup_field_names(['a'], ['a', 'a_'])  # => ['a', 'a_', 'a__']
# Inputs that previously produced a duplicate-free result produce the same
# result as before. The argument arrays are not modified.
def dedup_field_names(*names)
  names.inject([]) do |taken, arr|
    arr.each { |e| taken << search_field_name(taken, e) }
    taken
  end
end
74
+
75
# Returns candidate when names does not include it; otherwise keeps
# appending "_" (which turns the candidate into a String) until the
# result is not in names.
def search_field_name(names, candidate)
  unique = candidate
  unique = "#{unique}_" while names.include?(unique)
  unique
end
78
+
79
# Returns the Fields::VALUES constant of cascading.tuple.Fields.
def last_grouping_fields
  Java::CascadingTuple::Fields::VALUES
end
82
+
83
# Returns the Fields::RESULTS constant of cascading.tuple.Fields.
def results_fields
  Java::CascadingTuple::Fields::RESULTS
end
86
+
87
+ # Creates a c.s.TextLine scheme instance from the specified fields.
88
# With no arguments the TextLine is built with its no-argument
# constructor; otherwise the arguments are turned into a Fields via
# fields() and passed to the constructor.
def text_line_scheme(*fields)
  return Java::CascadingScheme::TextLine.new if fields.empty?
  declared = fields(fields)
  Java::CascadingScheme::TextLine.new(declared)
end
96
+
97
+ # Creates a c.s.SequenceFile scheme instance from the specified fields.
98
# With no arguments the SequenceFile is built over all_fields; otherwise
# the arguments are turned into a Fields via fields() and passed to the
# constructor.
def sequence_file_scheme(*fields)
  return Java::CascadingScheme::SequenceFile.new(all_fields) if fields.empty?
  declared = fields(fields)
  Java::CascadingScheme::SequenceFile.new(declared)
end
106
+
107
# Wraps the given taps in a MultiTap; the taps are passed as a Java array
# of cascading.tap.Tap.
def multi_tap(*taps)
  tap_array = taps.to_java("cascading.tap.Tap")
  Java::CascadingTap::MultiTap.new(tap_array)
end
110
+
111
+ # Generic method for creating taps.
112
+ # It accepts an optional ":kind" argument (:hfs by default) selecting the type of tap to create.
113
# Arguments: an optional path followed by an optional options hash.
#   :path      - used as the path when no positional argument is given
#   :scheme    - scheme for the tap (default: text_line_scheme)
#   :sink_mode - :keep (default), :replace or :append, as symbol or string
#   :kind      - :hfs (default), :dfs or :lfs, as symbol or string
# Raises a RuntimeError for an unrecognized sink mode or kind.
def tap(*args)
  opts = args.extract_options!
  path = args.empty? ? opts[:path] : args[0]
  scheme = opts[:scheme] || text_line_scheme

  requested_mode = opts[:sink_mode] || :keep
  sink_mode =
    if [:keep, 'keep'].include?(requested_mode)
      Java::CascadingTap::SinkMode::KEEP
    elsif [:replace, 'replace'].include?(requested_mode)
      Java::CascadingTap::SinkMode::REPLACE
    elsif [:append, 'append'].include?(requested_mode)
      Java::CascadingTap::SinkMode::APPEND
    else
      raise "Unrecognized sink mode '#{requested_mode}'"
    end

  fs = opts[:kind] || :hfs
  klass =
    if [:hfs, 'hfs'].include?(fs)
      Java::CascadingTap::Hfs
    elsif [:dfs, 'dfs'].include?(fs)
      Java::CascadingTap::Dfs
    elsif [:lfs, 'lfs'].include?(fs)
      Java::CascadingTap::Lfs
    else
      raise "Unrecognized kind of tap '#{fs}'"
    end

  klass.new(scheme, path, sink_mode)
end
134
+ end
@@ -0,0 +1,30 @@
1
+ # Wrapper meant for NativeExceptions that wrap exceptions from Cascading. The
2
+ # trouble is that the combined stack traces are so long, printing them can
3
+ # actually omit locations in the cascading.jruby or application code that
4
+ # matter, leaving you with no information about the source of the error. This
5
+ # class just swallows all the nested exceptions, printing their message, while
6
+ # giving you a direct route into JRuby code to the cause of the problem.
7
class CascadingException < StandardError
  # native_exception - the wrapped exception; its chain of causes is
  #                    rendered into this exception's message
  # message          - first line of the resulting message
  def initialize(native_exception, message)
    @ne = native_exception
    super("#{message}\n#{trace_causes(@ne, 1)}")
  end

  # Returns the depth-th exception of the wrapped chain: 1 is the wrapped
  # exception itself, 2 its cause, and so on. Returns nil when the chain is
  # shorter than depth.
  #
  # depth defaults to 1 so the method can also be called without arguments,
  # the way Exception#cause is called on Rubies that define it.
  def cause(depth = 1)
    fetch_cause(@ne, depth)
  end

  private

  # Follows ne.cause depth - 1 times, stopping early with nil once the
  # chain ends or an element has no cause method.
  def fetch_cause(ne, depth)
    return ne if ne.nil? || depth <= 1
    return nil unless ne.respond_to?(:cause)
    fetch_cause(ne.cause, depth - 1)
  end

  # Renders ne and, recursively, each of its causes as a "Cause N: ..."
  # line followed by one " at ..." line per stack trace element (only for
  # elements that respond to stack_trace). Returns nil when ne is nil.
  def trace_causes(ne, depth)
    return unless ne
    trace = "Cause #{depth}: #{ne}\n"
    if ne.respond_to?(:stack_trace)
      frames = ne.stack_trace.map do |e|
        " at #{e.class_name}.#{e.method_name}(#{e.file_name}:#{e.line_number})"
      end
      trace << frames.join("\n") << "\n"
    end
    nested = ne.respond_to?(:cause) ? ne.cause : nil
    trace << trace_causes(nested, depth + 1).to_s
    trace
  end
end
@@ -0,0 +1,33 @@
1
# Holds an expression string together with the types declared inside it.
# Types are declared inline as "<identifier>:<type>", where <type> is one of
# the keys of Cascading::JAVA_TYPE_MAP; the declarations are stripped from
# the stored expression and recorded in #types.
class ExprStub
  attr_accessor :expression, :types

  # st - expression source; it is copied, the argument is not modified.
  def initialize(st)
    @expression = st.dup
    @types = {}

    # Simple regexp based parser for types.
    # The constant is referenced through its module so that the lookup does
    # not depend on Cascading being mixed into the top-level scope.
    Cascading::JAVA_TYPE_MAP.each do |sym, klass|
      # \b keeps a type name from matching the start of a longer word
      # (e.g. "a:intValue" is left alone).
      @expression.gsub!(/([A-Za-z0-9_]+):#{sym}\b/) do
        name = $1
        @types[name] = klass
        name
      end
    end
  end

  # Returns [sorted keys, values in the same order] for hash h.
  def self.split_hash(h)
    keys = h.keys.sort
    [keys, keys.map { |key| h[key] }]
  end

  # Like split_hash, but returns the names as a Java String array and the
  # types as a Java Class array.
  def self.split_names_and_types(expr_types)
    names, types = split_hash(expr_types)
    [names.to_java(java.lang.String), types.to_java(java.lang.Class)]
  end
end
@@ -0,0 +1,15 @@
1
+ # ext.rb : some extensions to basic types
2
+ #
3
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
4
+ #
5
+ # This is free software. Please see the LICENSE and COPYING files for details.
6
+
7
class Array
  # Removes and returns the last element when it is a Hash; otherwise
  # leaves the array untouched and returns a new empty Hash.
  def extract_options!
    return {} unless last.is_a?(::Hash)
    pop
  end

  # Non-destructive variant: returns the last element when it is a Hash,
  # otherwise a new empty Hash.
  def extract_options
    tail = last
    tail.is_a?(::Hash) ? tail : {}
  end
end
@@ -0,0 +1,168 @@
1
+ # Copyright 2009, Grégoire Marabout. All Rights Reserved.
2
+ #
3
+ # This is free software. Please see the LICENSE and COPYING files for details.
4
+
5
+ require 'cascading/assembly'
6
+
7
+ module Cascading
8
# A node holding assemblies together with the named sources and sinks they
# are connected to. Each new flow registers itself by name on the class
# (see Registerable).
class Flow < Cascading::Node
  extend Registerable

  attr_accessor :properties, :sources, :sinks, :outgoing_scopes, :listeners

  def initialize(name, parent)
    @properties = {}
    @sources = {}
    @sinks = {}
    @outgoing_scopes = {}
    @listeners = []
    super(name, parent)
    self.class.add(name, self)
  end

  # Creates an Assembly named name as a child of this flow (handing it this
  # flow's outgoing scopes), evaluates the block in the assembly's context
  # and returns the assembly. Raises a RuntimeError when no block is given.
  def assembly(name, &block)
    raise "Could not build assembly '#{name}'; block required" if block.nil?
    child = Assembly.new(name, self, @outgoing_scopes)
    add_child(child)
    child.instance_eval(&block)
    child
  end

  # Create a new sink for this flow.
  #   sink(name, tap) - stores tap under name
  #   sink(tap)       - stores tap under this flow's own name
  # Any other number of arguments is ignored.
  def sink(*args)
    case args.size
    when 2 then @sinks[args[0]] = args[1]
    when 1 then @sinks[@name] = args[0]
    end
  end

  # Create a new source for this flow.
  #   source(name, tap) - stores tap under name and records a tap scope
  #                       for it
  #   source(tap)       - stores tap under this flow's own name and records
  #                       an empty scope for it
  # Any other number of arguments is ignored.
  def source(*args)
    case args.size
    when 2
      source_name, source_tap = args[0], args[1]
      @sources[source_name] = source_tap
      @outgoing_scopes[source_name] = Scope.tap_scope(source_tap, source_name)
    when 1
      @sources[@name] = args[0]
      @outgoing_scopes[@name] = Scope.empty_scope(@name)
    end
  end

  # Outgoing scope recorded under name; without a name, the scope of the
  # most recently added child. Raises a RuntimeError when neither a name
  # nor a child is available.
  def scope(name = nil)
    raise 'Must specify name if no children have been defined yet' if !name && !last_child
    @outgoing_scopes[name || last_child.name]
  end

  # Prints the scope that scope(name) returns.
  def debug_scope(name = nil)
    described = scope(name)
    name ||= last_child.name
    puts "Scope for '#{name}':\n #{described}"
  end

  # Hash of sink name => { :field_names, :primary_key }, both taken from
  # the outgoing scope recorded under the sink's name. Raises a
  # RuntimeError for a sink without such a scope.
  def sink_metadata
    metadata = {}
    @sinks.keys.each do |sink_name|
      sink_scope = @outgoing_scopes[sink_name]
      raise "Cannot sink undefined assembly '#{sink_name}'" unless sink_scope
      metadata[sink_name] = {
        :field_names => sink_scope.values_fields.to_a,
        :primary_key => sink_scope.primary_key_fields.to_a
      }
    end
    metadata
  end

  # TODO: support all codecs, support list of codecs
  #
  # Sets the mapred.output.* compression properties.
  #   codec - :default or :gzip
  #   type  - :none, :record or :block
  # Raises a RuntimeError for any other value; properties assigned before
  # the failing check stay set.
  def compress_output(codec, type)
    properties['mapred.output.compress'] = 'true'

    codec_class = case codec
                  when :default then Java::OrgApacheHadoopIoCompress::DefaultCodec
                  when :gzip then Java::OrgApacheHadoopIoCompress::GzipCodec
                  else raise "Codec #{codec} not yet supported by cascading.jruby"
                  end
    properties['mapred.output.compression.codec'] = codec_class.java_class.name

    compression_type = case type
                       when :none then Java::OrgApacheHadoopIo::SequenceFile::CompressionType::NONE
                       when :record then Java::OrgApacheHadoopIo::SequenceFile::CompressionType::RECORD
                       when :block then Java::OrgApacheHadoopIo::SequenceFile::CompressionType::BLOCK
                       else raise "Compression type '#{type}' not supported"
                       end
    properties['mapred.output.compression.type'] = compression_type.to_s
  end

  # Stores threshold (as a string) in cascading.cogroup.spill.threshold.
  def set_spill_threshold(threshold)
    properties['cascading.cogroup.spill.threshold'] = threshold.to_s
  end

  # Appends file to the mapred.cache.files property.
  def add_file_to_distributed_cache(file)
    add_to_distributed_cache(file, "mapred.cache.files")
  end

  # Appends file to the mapred.cache.archives property.
  def add_archive_to_distributed_cache(file)
    add_to_distributed_cache(file, "mapred.cache.archives")
  end

  # Remembers listener; complete attaches all remembered listeners to the
  # connected flow before running it.
  def add_listener(listener)
    @listeners << listener
  end

  # NOTE this needs to be *appended* to the property mapred.local.dir
  #
  # For an s3:// or s3n:// URL returns "/taskTracker/archive/" followed by
  # the URL without its scheme prefix (EMR); any other value is returned
  # unchanged (local).
  def emr_local_path_for_distributed_cache_file(file)
    s3_prefix = /^s3n?:\/\//
    return file unless file =~ s3_prefix
    "/taskTracker/archive/#{file.gsub(s3_prefix, "")}"
  end

  # Adds file to the comma-separated list stored under property, creating
  # the entry when it does not exist yet.
  def add_to_distributed_cache(file, property)
    existing = properties[property]
    properties[property] = existing ? [existing.split(/,/), file].flatten.join(",") : file
  end

  # Connects this flow's sources, sinks and pipes through a FlowConnector
  # and returns the result. The given properties are used when present,
  # this flow's own properties otherwise; they are copied into a
  # java.util.HashMap first.
  def connect(properties = nil)
    config = java.util.HashMap.new(properties || @properties)
    connector = Java::CascadingFlow::FlowConnector.new(config)
    connector.connect(
      name,
      make_tap_parameter(@sources),
      make_tap_parameter(@sinks),
      make_pipes
    )
  end

  # Connects the flow, attaches the registered listeners and runs it.
  # A NativeException is re-raised wrapped in a CascadingException.
  def complete(properties = nil)
    connected = connect(properties)
    @listeners.each { |listener| connected.addListener(listener) }
    connected.complete
  rescue NativeException => e
    raise CascadingException.new(e, 'Error completing flow')
  end

  private

  # Re-keys taps (name => tap) by the name of the tail pipe of the assembly
  # found under each name. Raises a RuntimeError when no such assembly
  # exists.
  def make_tap_parameter(taps)
    by_pipe_name = {}
    taps.each do |tap_name, tap|
      target = find_child(tap_name)
      raise "Could not find assembly '#{tap_name}' to connect to tap: #{tap}" unless target

      by_pipe_name[target.tail_pipe.name] = tap
    end
    by_pipe_name
  end

  # Java array holding the tail pipe of the assembly behind every sink.
  # Raises a RuntimeError when a sink has no matching assembly.
  def make_pipes
    tails = @sinks.map do |sink_name, sink|
      target = find_child(sink_name)
      raise "Could not find assembly '#{sink_name}' to make pipe for sink: #{sink}" unless target
      target.tail_pipe
    end
    tails.to_java(Java::CascadingPipe::Pipe)
  end
end
168
+ end