cascading.jruby 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/HACKING.md +15 -0
  2. data/History.txt +0 -0
  3. data/LICENSE.txt +165 -0
  4. data/README.md +7 -0
  5. data/Rakefile +45 -0
  6. data/bin/make_job +81 -0
  7. data/lib/cascading/assembly.rb +726 -0
  8. data/lib/cascading/base.rb +63 -0
  9. data/lib/cascading/cascade.rb +63 -0
  10. data/lib/cascading/cascading.rb +134 -0
  11. data/lib/cascading/cascading_exception.rb +30 -0
  12. data/lib/cascading/expr_stub.rb +33 -0
  13. data/lib/cascading/ext/array.rb +15 -0
  14. data/lib/cascading/flow.rb +168 -0
  15. data/lib/cascading/operations.rb +204 -0
  16. data/lib/cascading/scope.rb +160 -0
  17. data/lib/cascading.rb +63 -0
  18. data/samples/branch.rb +31 -0
  19. data/samples/cascading.rb +41 -0
  20. data/samples/copy.rb +18 -0
  21. data/samples/data/data2.txt +88799 -0
  22. data/samples/data/data_join1.txt +3 -0
  23. data/samples/data/data_join2.txt +3 -0
  24. data/samples/data/data_join3.txt +3 -0
  25. data/samples/join.rb +32 -0
  26. data/samples/logwordcount.rb +22 -0
  27. data/samples/project.rb +24 -0
  28. data/samples/rename.rb +21 -0
  29. data/samples/scorenames.rb +20 -0
  30. data/samples/splitter.rb +20 -0
  31. data/samples/union.rb +35 -0
  32. data/spec/cascading_spec.rb +100 -0
  33. data/spec/expr_spec.rb +10 -0
  34. data/spec/primary_key_spec.rb +119 -0
  35. data/spec/resource/join_input.txt +3 -0
  36. data/spec/resource/test_input.txt +4 -0
  37. data/spec/scope_spec.rb +174 -0
  38. data/spec/spec.opts +6 -0
  39. data/spec/spec_helper.rb +5 -0
  40. data/spec/spec_util.rb +188 -0
  41. data/src/cascading/jruby/Main.java +38 -0
  42. data/src/cascading/jruby/runner.rb +6 -0
  43. data/tags +238 -0
  44. data/tasks/ann.rake +80 -0
  45. data/tasks/ant.rake +11 -0
  46. data/tasks/bones.rake +20 -0
  47. data/tasks/gem.rake +206 -0
  48. data/tasks/git.rake +40 -0
  49. data/tasks/notes.rake +27 -0
  50. data/tasks/post_load.rake +34 -0
  51. data/tasks/rdoc.rake +50 -0
  52. data/tasks/rubyforge.rake +55 -0
  53. data/tasks/samples.rake +13 -0
  54. data/tasks/setup.rb +300 -0
  55. data/tasks/spec.rake +59 -0
  56. data/tasks/svn.rake +47 -0
  57. data/tasks/test.rake +42 -0
  58. data/test/data/data1.txt +14 -0
  59. data/test/data/data2.txt +14 -0
  60. data/test/test_assembly.rb +321 -0
  61. data/test/test_cascading.rb +49 -0
  62. data/test/test_flow.rb +15 -0
  63. metadata +137 -0
data/test/test_assembly.rb ADDED
@@ -0,0 +1,321 @@
1
+ require 'test/unit'
2
+ require 'cascading'
3
+
4
# Asserts that compare_files returns nil for the pair
# test/references/<test_name>.txt and output/<test_name>/part-00000.
#
# NOTE(review): compare_files is defined outside this file; presumably a nil
# result means "no differences" -- confirm against its definition.
#
# test_name - name used to build both the reference and the output path.
def compare_with_references(test_name)
  reference_path = "test/references/#{test_name}.txt"
  output_path = "output/#{test_name}/part-00000"
  assert_nil(compare_files(reference_path, output_path))
end
8
+
9
# Convenience for basic assembly tests; not valid for applications.
#
# Creates an Assembly called +name+ (passing nil as the second constructor
# argument), evaluates +block+ with the new assembly as self, and returns the
# assembly.
#
# name  - name handed to Assembly.new.
# block - optional body evaluated via instance_eval on the new assembly.
def assembly(name, &block)
  built = Assembly.new(name, nil)
  # instance_eval raises ArgumentError when given neither a block nor a
  # string, so only evaluate when a body was actually supplied.
  built.instance_eval(&block) if block
  built
end
15
+
16
# Unit tests for the assembly DSL: each/every construction, group_by options
# and nested branches. Most tests build their assembly through mock_assembly,
# which wraps it in a flow named 'test'.
class TC_Assembly < Test::Unit::TestCase
  include Operations

  # Builds a flow named 'test' with one source 'test' reading
  # test/data/data1.txt, defines an assembly 'test' from +block+ inside that
  # flow, and returns the value of that inner `assembly` call.
  def mock_assembly(&block)
    built = nil
    flow 'test' do
      source 'test', tap('test/data/data1.txt')
      # Captured local instead of a $assembly global, so no state is shared
      # between tests.
      built = assembly 'test', &block
    end
    built
  end

  def test_create_assembly_simple
    empty = assembly "assembly1" do
      # Empty assembly
    end

    assert_not_nil empty
    assert_kind_of Java::CascadingPipe::Pipe, empty.tail_pipe
  end

  def test_each_identity
    built = mock_assembly do
      each 'offset', :filter => identity
    end

    registered = Flow.get('test')
    assert_not_nil registered

    assert_not_nil registered.find_child('test')
    assert_equal built, registered.find_child('test')
    assert_not_nil Flow.get('test').find_child('test')
  end

  def test_create_each
    # You can't apply an Each to 0 fields
    assert_raise CascadingException do
      rejected = mock_assembly do
        each(:filter => identity)
      end
      # Only reached when nothing was raised, in which case assert_raise
      # fails anyway.
      assert_kind_of Java::CascadingPipe::Each, rejected.tail_pipe
    end

    built = mock_assembly do
      each('offset', :output => 'offset_copy',
           :filter => Java::CascadingOperation::Identity.new(fields('offset_copy')))
    end
    pipe = built.tail_pipe

    assert_kind_of Java::CascadingPipe::Each, pipe

    assert_equal 'offset', pipe.getArgumentSelector().get(0)
    assert_equal 'offset_copy', pipe.getOutputSelector().get(0)
  end

  # For now, replaced these tests with the trivial observation that you can't
  # follow a Tap with an Every.  Eventually, should support testing within a
  # group_by block.
  #
  # The assertions that follow each mock_assembly call are only reached when
  # no exception is raised; they are kept as the intended checks for later.
  def test_create_every
    assert_raise CascadingException do
      rejected = mock_assembly do
        every(:aggregator => count_function)
      end
      assert_kind_of Java::CascadingPipe::Every, rejected.tail_pipe
    end

    assert_raise CascadingException do
      rejected = mock_assembly do
        every(:aggregator => count_function("field1", "field2"))
      end
      assert_kind_of Java::CascadingPipe::Every, rejected.tail_pipe
    end

    assert_raise CascadingException do
      rejected = mock_assembly do
        every("Field1", :aggregator => count_function)
      end
      assert_kind_of Java::CascadingPipe::Every, rejected.tail_pipe
      assert_equal "Field1", rejected.tail_pipe.getArgumentSelector().get(0)
    end

    assert_raise CascadingException do
      rejected = mock_assembly do
        every('line', :aggregator => count_function, :output=>'line_count')
      end
      assert_kind_of Java::CascadingPipe::Every, rejected.tail_pipe
      assert_equal 'line', rejected.tail_pipe.getArgumentSelector().get(0)
      assert_equal 'line_count', rejected.tail_pipe.getOutputSelector().get(0)
    end
  end

  def test_create_group_by
    # The same assembly is built and checked twice, as the original test did.
    # NOTE(review): presumably this guards against redefining the 'test' flow
    # misbehaving -- confirm the intent.
    2.times do
      built = mock_assembly do
        group_by('line')
      end

      assert_kind_of Java::CascadingPipe::GroupBy, built.tail_pipe
      grouping_fields = built.tail_pipe.getGroupingSelectors()['test']
      assert_equal 'line', grouping_fields.get(0)
    end
  end

  def test_create_group_by_many_fields
    built = mock_assembly do
      group_by(['offset', 'line'])
    end

    assert_kind_of Java::CascadingPipe::GroupBy, built.tail_pipe
    grouping_fields = built.tail_pipe.getGroupingSelectors()['test']
    assert_equal 'offset', grouping_fields.get(0)
    assert_equal 'line', grouping_fields.get(1)
  end

  def test_create_group_by_with_sort
    built = mock_assembly do
      group_by('offset', 'line', :sort_by => ['line'])
    end

    assert_kind_of Java::CascadingPipe::GroupBy, built.tail_pipe
    grouping_fields = built.tail_pipe.getGroupingSelectors()['test']
    sorting_fields = built.tail_pipe.getSortingSelectors()['test']

    assert_equal 2, grouping_fields.size
    assert_equal 1, sorting_fields.size

    assert_equal 'offset', grouping_fields.get(0)
    assert_equal 'line', grouping_fields.get(1)
    assert built.tail_pipe.isSorted()
    assert !built.tail_pipe.isSortReversed()
    assert_equal 'line', sorting_fields.get(0)
  end

  def test_create_group_by_with_sort_reverse
    built = mock_assembly do
      group_by('offset', 'line', :sort_by => ['line'], :reverse => true)
    end

    assert_kind_of Java::CascadingPipe::GroupBy, built.tail_pipe
    grouping_fields = built.tail_pipe.getGroupingSelectors()['test']
    sorting_fields = built.tail_pipe.getSortingSelectors()['test']

    assert_equal 2, grouping_fields.size
    assert_equal 1, sorting_fields.size

    assert_equal 'offset', grouping_fields.get(0)
    assert_equal 'line', grouping_fields.get(1)
    assert built.tail_pipe.isSorted()
    assert built.tail_pipe.isSortReversed()
    assert_equal 'line', sorting_fields.get(0)
  end

  def test_create_group_by_reverse
    built = mock_assembly do
      group_by('offset', 'line', :reverse => true)
    end

    assert_kind_of Java::CascadingPipe::GroupBy, built.tail_pipe
    grouping_fields = built.tail_pipe.getGroupingSelectors()['test']
    sorting_fields = built.tail_pipe.getSortingSelectors()['test']

    assert_equal 2, grouping_fields.size
    assert_equal 2, sorting_fields.size

    assert_equal 'offset', grouping_fields.get(0)
    assert_equal 'line', grouping_fields.get(1)
    assert built.tail_pipe.isSorted()
    assert built.tail_pipe.isSortReversed()
    assert_equal 'offset', sorting_fields.get(0)
    assert_equal 'line', sorting_fields.get(1)
  end

  def test_branch_unique
    built = mock_assembly do
      branch 'branch1' do
      end
    end

    assert_equal 1, built.children.size
  end

  def test_branch_empty
    built = mock_assembly do
      branch 'branch1' do
      end

      branch 'branch2' do
        branch 'branch3' do
        end
      end
    end

    assert_equal 2, built.children.size
    assert_equal 0, built.children['branch1'].children.size
    assert_equal 1, built.children['branch2'].children.size
  end

  def test_branch_single
    built = mock_assembly do
      branch 'branch1' do
        branch 'branch2' do
          each 'line', :function => identity
        end
      end
    end

    assert_equal 1, built.children.size
    assert_equal 1, built.children['branch1'].children.size
    assert_equal 0, built.children['branch1'].children['branch2'].children.size
  end

  # Fixed this test, but it isn't even valid.  You shouldn't be able to follow
  # an Each with an Every.
  def test_full_assembly
    assert_raise CascadingException do
      rejected = mock_assembly do
        each('offset', :output => 'offset_copy',
             :filter => Java::CascadingOperation::Identity.new(fields('offset_copy')))
        every(:aggregator => count_function)
      end

      # Only reached when nothing was raised.
      assert_kind_of Java::CascadingPipe::Every, rejected.tail_pipe
    end
  end

end
252
+
253
+
254
# Scenario tests that define complete flows (sources, sinks and assemblies)
# over the files in test/data and then call +complete+ on the result.
# NOTE(review): +complete+ presumably runs the job and writes under output/ --
# confirm against the Flow/Cascade implementation.
class TC_AssemblyScenarii < Test::Unit::TestCase

  # Splits each line of data1.txt into four fields and checks tuple size.
  def test_splitter
    flow "splitter" do
      source "copy", tap("test/data/data1.txt")
      sink "copy", tap('output/splitter', :sink_mode => :replace)

      assembly "copy" do
        split "line", :pattern => /[.,]*\s+/,
              :into => ["name", "score1", "score2", "id"],
              :output => ["name", "score1", "score2", "id"]
        assert_size_equals 4
        assert_not_null
        debug :print_fields => true
      end
    end.complete
  end

  # Joins two split inputs by assembly name on the shared "name"/"id" fields,
  # with the flow wrapped in a cascade.
  def test_join1
    cascade 'splitter' do
      flow 'splitter' do
        source "data1", tap("test/data/data1.txt")
        source "data2", tap("test/data/data2.txt")
        sink "joined", tap('output/joined', :sink_mode => :replace)

        assembly1 = assembly "data1" do
          split "line", :pattern => /[.,]*\s+/,
                :into => ["name", "score1", "score2", "id"],
                :output => ["name", "score1", "score2", "id"]
          assert_size_equals 4
          assert_not_null
          debug :print_fields => true
        end

        assembly2 = assembly "data2" do
          split "line", :pattern => /[.,]*\s+/,
                :into => ["name", "id", "town"],
                :output => ["name", "id", "town"]
          assert_size_equals 3
          assert_not_null
          debug :print_fields => true
        end

        assembly "joined" do
          join assembly1.name, assembly2.name,
               :on => ["name", "id"],
               :declared_fields => ["name", "score1", "score2", "id", "name2", "id2", "town"]
          assert_size_equals 7
          assert_not_null
        end
      end
    end.complete
  end

  # Same join as above, but with per-input key lists given as a hash to :on
  # instead of naming the joined assemblies positionally.
  def test_join2
    flow "splitter" do
      source "data1", tap("test/data/data1.txt")
      source "data2", tap("test/data/data2.txt")
      sink "joined", tap('output/joined', :sink_mode => :replace)

      assembly "data1" do
        split "line", :pattern => /[.,]*\s+/,
              :into => ["name", "score1", "score2", "id"],
              :output => ["name", "score1", "score2", "id"]
        debug :print_fields => true
      end

      assembly "data2" do
        split "line", :pattern => /[.,]*\s+/,
              :into => ["name", "code", "town"],
              :output => ["name", "code", "town"]
        debug :print_fields => true
      end

      assembly "joined" do
        join :on => {"data1"=>["name", "id"], "data2"=>["name", "code"]},
             :declared_fields => ["name", "score1", "score2", "id", "name2", "code", "town"]
      end
    end.complete
  end
end
data/test/test_cascading.rb ADDED
@@ -0,0 +1,49 @@
1
+ require 'test/unit'
2
+ require 'cascading'
3
+
4
# Tests for the top-level helpers +fields+ and +tap+.
class TC_Cascading < Test::Unit::TestCase
  # fields() given all_fields yields something equal to all_fields.
  def test_fields_field
    result = fields(all_fields)
    assert_equal all_fields, result
  end

  def test_fields_single
    declared = "Field1"

    result = fields(declared)

    assert_equal 1, result.size
    assert_equal declared, result.get(0)
  end

  def test_fields_multiple
    declared = ["Field1", "Field2", "Field3"]

    result = fields(declared)

    assert_equal 3, result.size
    declared.each_with_index do |name, index|
      assert_equal name, result.get(index)
    end
  end

  # Without :kind the tap is an Hfs; each explicit :kind maps to its own
  # tap class. Locals are deliberately not called `tap`, which would shadow
  # the helper under test.
  def test_tap
    default_tap = tap('/temp')
    assert_equal '/temp', default_tap.getPath().toString()
    assert_kind_of Java::CascadingTap::Hfs, default_tap

    [
      [:dfs, Java::CascadingTap::Dfs],
      [:lfs, Java::CascadingTap::Lfs],
      [:hfs, Java::CascadingTap::Hfs]
    ].each do |kind, tap_class|
      kind_tap = tap('/temp', :kind => kind)
      assert_equal '/temp', kind_tap.getPath().toString()
      assert_kind_of tap_class, kind_tap
    end
  end
end
data/test/test_flow.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'test/unit'
2
+ require 'cascading'
3
+
4
# Tests that an assembly declared inside a flow is registered as its child.
class TC_Flow < Test::Unit::TestCase
  def test_assembly
    # Not named `flow`, so the local does not shadow the `flow` DSL method.
    my_flow = flow 'My Flow1' do
      assembly "Test1" do
      end
    end

    assert_equal 1, my_flow.children.size
    assert_equal my_flow.children["Test1"], my_flow.find_child("Test1")
    assert_equal my_flow.last_child, my_flow.find_child("Test1")
  end
end
metadata ADDED
@@ -0,0 +1,137 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cascading.jruby
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 4
9
+ version: 0.0.4
10
+ platform: ruby
11
+ authors:
12
+ - Matt Walker
13
+ - "Gr\xC3\xA9goire Marabout"
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-04-05 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: cascading.jruby is a small DSL above Cascading, written in JRuby
23
+ email: mwalker@etsy.com
24
+ executables:
25
+ - make_job
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - History.txt
30
+ - LICENSE.txt
31
+ - bin/make_job
32
+ - samples/data/data2.txt
33
+ - samples/data/data_join1.txt
34
+ - samples/data/data_join2.txt
35
+ - samples/data/data_join3.txt
36
+ - spec/resource/join_input.txt
37
+ - spec/resource/test_input.txt
38
+ - test/data/data1.txt
39
+ - test/data/data2.txt
40
+ files:
41
+ - HACKING.md
42
+ - History.txt
43
+ - LICENSE.txt
44
+ - README.md
45
+ - Rakefile
46
+ - bin/make_job
47
+ - lib/cascading.rb
48
+ - lib/cascading/assembly.rb
49
+ - lib/cascading/base.rb
50
+ - lib/cascading/cascade.rb
51
+ - lib/cascading/cascading.rb
52
+ - lib/cascading/cascading_exception.rb
53
+ - lib/cascading/expr_stub.rb
54
+ - lib/cascading/ext/array.rb
55
+ - lib/cascading/flow.rb
56
+ - lib/cascading/operations.rb
57
+ - lib/cascading/scope.rb
58
+ - samples/branch.rb
59
+ - samples/cascading.rb
60
+ - samples/copy.rb
61
+ - samples/data/data2.txt
62
+ - samples/data/data_join1.txt
63
+ - samples/data/data_join2.txt
64
+ - samples/data/data_join3.txt
65
+ - samples/join.rb
66
+ - samples/logwordcount.rb
67
+ - samples/project.rb
68
+ - samples/rename.rb
69
+ - samples/scorenames.rb
70
+ - samples/splitter.rb
71
+ - samples/union.rb
72
+ - spec/cascading_spec.rb
73
+ - spec/expr_spec.rb
74
+ - spec/primary_key_spec.rb
75
+ - spec/resource/join_input.txt
76
+ - spec/resource/test_input.txt
77
+ - spec/scope_spec.rb
78
+ - spec/spec.opts
79
+ - spec/spec_helper.rb
80
+ - spec/spec_util.rb
81
+ - src/cascading/jruby/Main.java
82
+ - src/cascading/jruby/runner.rb
83
+ - tags
84
+ - tasks/ann.rake
85
+ - tasks/ant.rake
86
+ - tasks/bones.rake
87
+ - tasks/gem.rake
88
+ - tasks/git.rake
89
+ - tasks/notes.rake
90
+ - tasks/post_load.rake
91
+ - tasks/rdoc.rake
92
+ - tasks/rubyforge.rake
93
+ - tasks/samples.rake
94
+ - tasks/setup.rb
95
+ - tasks/spec.rake
96
+ - tasks/svn.rake
97
+ - tasks/test.rake
98
+ - test/data/data1.txt
99
+ - test/data/data2.txt
100
+ - test/test_assembly.rb
101
+ - test/test_cascading.rb
102
+ - test/test_flow.rb
103
+ has_rdoc: true
104
+ homepage: http://github.com/etsy/cascading.jruby
105
+ licenses: []
106
+
107
+ post_install_message:
108
+ rdoc_options:
109
+ - --main
110
+ - README.md
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ segments:
118
+ - 0
119
+ version: "0"
120
+ required_rubygems_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ segments:
125
+ - 0
126
+ version: "0"
127
+ requirements: []
128
+
129
+ rubyforge_project: cascading.jruby
130
+ rubygems_version: 1.3.6
131
+ signing_key:
132
+ specification_version: 3
133
+ summary: A JRuby DSL for Cascading
134
+ test_files:
135
+ - test/test_assembly.rb
136
+ - test/test_cascading.rb
137
+ - test/test_flow.rb