mrflip-wukong 0.1.0

Files changed (137)
  1. data/LICENSE.txt +202 -0
  2. data/README-tutorial.textile +163 -0
  3. data/README.textile +165 -0
  4. data/bin/cutc +30 -0
  5. data/bin/cuttab +5 -0
  6. data/bin/greptrue +8 -0
  7. data/bin/hdp-cat +3 -0
  8. data/bin/hdp-catd +3 -0
  9. data/bin/hdp-du +81 -0
  10. data/bin/hdp-get +3 -0
  11. data/bin/hdp-kill +3 -0
  12. data/bin/hdp-ls +10 -0
  13. data/bin/hdp-mkdir +3 -0
  14. data/bin/hdp-mv +3 -0
  15. data/bin/hdp-parts_to_keys.rb +77 -0
  16. data/bin/hdp-ps +3 -0
  17. data/bin/hdp-put +3 -0
  18. data/bin/hdp-rm +11 -0
  19. data/bin/hdp-sort +29 -0
  20. data/bin/hdp-stream +29 -0
  21. data/bin/hdp-stream-flat +18 -0
  22. data/bin/hdp-sync +17 -0
  23. data/bin/hdp-wc +67 -0
  24. data/bin/md5sort +20 -0
  25. data/bin/tabchar +5 -0
  26. data/bin/uniqc +3 -0
  27. data/bin/wu-hist +3 -0
  28. data/bin/wu-lign +177 -0
  29. data/bin/wu-sum +30 -0
  30. data/doc/README-wulign.textile +59 -0
  31. data/doc/README-wutils.textile +128 -0
  32. data/doc/UsingWukong-part1.textile +2 -0
  33. data/doc/UsingWukong-part2.textile +2 -0
  34. data/doc/UsingWukong-part3-parsing.textile +132 -0
  35. data/doc/code/api_response_example.txt +20 -0
  36. data/doc/code/parser_skeleton.rb +38 -0
  37. data/doc/hadoop-setup.textile +21 -0
  38. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  39. data/doc/links.textile +42 -0
  40. data/doc/overview.textile +91 -0
  41. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  42. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  43. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  44. data/doc/tips.textile +65 -0
  45. data/doc/utils.textile +48 -0
  46. data/examples/README.txt +17 -0
  47. data/examples/and_pig/sample_queries.rb +128 -0
  48. data/examples/apache_log_parser.rb +53 -0
  49. data/examples/count_keys.rb +56 -0
  50. data/examples/count_keys_at_mapper.rb +57 -0
  51. data/examples/graph/adjacency_list.rb +74 -0
  52. data/examples/graph/breadth_first_search.rb +79 -0
  53. data/examples/graph/gen_2paths.rb +68 -0
  54. data/examples/graph/gen_multi_edge.rb +103 -0
  55. data/examples/graph/gen_symmetric_links.rb +53 -0
  56. data/examples/package-local.rb +100 -0
  57. data/examples/package.rb +96 -0
  58. data/examples/pagerank/README.textile +6 -0
  59. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  60. data/examples/pagerank/pagerank.rb +88 -0
  61. data/examples/pagerank/pagerank_initialize.rb +46 -0
  62. data/examples/pagerank/run_pagerank.sh +19 -0
  63. data/examples/rank_and_bin.rb +173 -0
  64. data/examples/run_all.sh +47 -0
  65. data/examples/sample_records.rb +44 -0
  66. data/examples/size.rb +60 -0
  67. data/examples/word_count.rb +95 -0
  68. data/lib/wukong.rb +11 -0
  69. data/lib/wukong/and_pig.rb +62 -0
  70. data/lib/wukong/and_pig/README.textile +12 -0
  71. data/lib/wukong/and_pig/as.rb +37 -0
  72. data/lib/wukong/and_pig/data_types.rb +30 -0
  73. data/lib/wukong/and_pig/functions.rb +50 -0
  74. data/lib/wukong/and_pig/generate.rb +85 -0
  75. data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
  76. data/lib/wukong/and_pig/junk.rb +51 -0
  77. data/lib/wukong/and_pig/operators.rb +8 -0
  78. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  79. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  80. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  81. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  82. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  83. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  84. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  85. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  86. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  87. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  88. data/lib/wukong/and_pig/pig_var.rb +95 -0
  89. data/lib/wukong/and_pig/symbol.rb +29 -0
  90. data/lib/wukong/and_pig/utils.rb +0 -0
  91. data/lib/wukong/bad_record.rb +18 -0
  92. data/lib/wukong/boot.rb +47 -0
  93. data/lib/wukong/datatypes.rb +24 -0
  94. data/lib/wukong/datatypes/enum.rb +123 -0
  95. data/lib/wukong/dfs.rb +80 -0
  96. data/lib/wukong/encoding.rb +111 -0
  97. data/lib/wukong/extensions.rb +15 -0
  98. data/lib/wukong/extensions/array.rb +18 -0
  99. data/lib/wukong/extensions/blank.rb +93 -0
  100. data/lib/wukong/extensions/class.rb +189 -0
  101. data/lib/wukong/extensions/date_time.rb +24 -0
  102. data/lib/wukong/extensions/emittable.rb +82 -0
  103. data/lib/wukong/extensions/hash.rb +120 -0
  104. data/lib/wukong/extensions/hash_like.rb +112 -0
  105. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  106. data/lib/wukong/extensions/module.rb +2 -0
  107. data/lib/wukong/extensions/pathname.rb +27 -0
  108. data/lib/wukong/extensions/string.rb +65 -0
  109. data/lib/wukong/extensions/struct.rb +17 -0
  110. data/lib/wukong/extensions/symbol.rb +11 -0
  111. data/lib/wukong/logger.rb +40 -0
  112. data/lib/wukong/models/graph.rb +27 -0
  113. data/lib/wukong/rdf.rb +104 -0
  114. data/lib/wukong/schema.rb +39 -0
  115. data/lib/wukong/script.rb +265 -0
  116. data/lib/wukong/script/hadoop_command.rb +111 -0
  117. data/lib/wukong/script/local_command.rb +14 -0
  118. data/lib/wukong/streamer.rb +13 -0
  119. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  120. data/lib/wukong/streamer/base.rb +76 -0
  121. data/lib/wukong/streamer/count_keys.rb +30 -0
  122. data/lib/wukong/streamer/count_lines.rb +26 -0
  123. data/lib/wukong/streamer/filter.rb +20 -0
  124. data/lib/wukong/streamer/line_streamer.rb +12 -0
  125. data/lib/wukong/streamer/list_reducer.rb +20 -0
  126. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  127. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  128. data/lib/wukong/streamer/set_reducer.rb +14 -0
  129. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  130. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  131. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  132. data/lib/wukong/typed_struct.rb +12 -0
  133. data/lib/wukong/wukong_class.rb +20 -0
  134. data/spec/bin/hdp-wc_spec.rb +4 -0
  135. data/spec/spec_helper.rb +0 -0
  136. data/wukong.gemspec +173 -0
  137. metadata +208 -0
@@ -0,0 +1,1640 @@
1
+ # ---------------------------------------------------------------------------
2
+ #
3
+ # = CROSS
4
+ #
5
+ # Computes the cross product of two or more relations.
6
+ #
7
+ # == Syntax
8
+ #
9
+ # alias = CROSS alias, alias [, alias …] [PARALLEL n];
10
+ #
11
+ # == Terms
12
+ #
13
+ # alias::
14
+ # The name of a relation.
15
+ #
16
+ # PARALLEL n::
17
+ # Increase the parallelism of a job by specifying the number of reduce tasks,
18
+ # n. The optimal number of parallel tasks depends on the amount of memory on
19
+ # each node and the memory required by each of the tasks. To determine n, use
20
+ # the following as a general guideline:
21
+ # n = (nr_nodes - 1) * 0.45 * nr_GB
22
+ # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
23
+ # memory on each node.
24
+ #
25
+ # Note the following:
26
+ # * Parallel only affects the number of reduce tasks. Map parallelism is
27
+ # determined by the input file, one map for each HDFS block.
28
+ # * If you don’t specify parallel, you still get the same map parallelism but
29
+ # only one reduce task.
30
+ #
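+ # As a rough worked example of this guideline (the node count and memory
+ # figures here are made up): on a 10-node cluster with 8 GB of physical
+ # memory per node, n = (10 - 1) * 0.45 * 8 = 32.4, so PARALLEL 32 is a
+ # reasonable starting point.
+ #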
31
+ # == Usage
32
+ #
33
+ # Use the CROSS operator to compute the cross product (Cartesian product) of two
34
+ # or more relations.
35
+ #
36
+ # CROSS is an expensive operation and should be used sparingly.
37
+ #
38
+ # == Example
39
+ #
40
+ # Suppose we have relations A and B.
41
+ #
42
+ # (A) (B)
43
+ # ----------- --------
44
+ # (1, 2, 3) (2, 4)
45
+ # (4, 2, 1) (8, 9)
46
+ # (1, 3)
47
+ #
48
+ # In this example the cross product of relation A and B is computed.
49
+ #
50
+ # X = CROSS A, B;
51
+ #
52
+ # Relation X looks like this.
53
+ #
54
+ # (1, 2, 3, 2, 4)
55
+ # (1, 2, 3, 8, 9)
56
+ # (1, 2, 3, 1, 3)
57
+ # (4, 2, 1, 2, 4)
58
+ # (4, 2, 1, 8, 9)
59
+ # (4, 2, 1, 1, 3)
60
+ #
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ #
65
+ # DISTINCT
66
+ #
67
+ # Removes duplicate tuples in a relation.
68
+ #
69
+ # == Syntax
70
+ #
71
+ # alias = DISTINCT alias [PARALLEL n];
72
+ #
73
+ # == Terms
74
+ #
75
+ # alias::
76
+ # The name of a relation.
77
+ #
78
+ # PARALLEL n::
79
+ # Increase the parallelism of a job by specifying the number of reduce tasks,
80
+ # n. The optimal number of parallel tasks depends on the amount of memory on
81
+ # each node and the memory required by each of the tasks. To determine n, use
82
+ # the following as a general guideline:
83
+ # n = (nr_nodes - 1) * 0.45 * nr_GB
84
+ # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
85
+ # memory on each node.
86
+ #
87
+ # Note the following:
88
+ # * Parallel only affects the number of reduce tasks. Map parallelism is
89
+ # determined by the input file, one map for each HDFS block.
90
+ # * If you don’t specify parallel, you still get the same map parallelism but
91
+ # only one reduce task.
92
+ #
93
+ # == Usage
94
+ #
95
+ # Use the DISTINCT operator to remove duplicate tuples in a relation. DISTINCT
96
+ # does not preserve the original order of the contents (to eliminate duplicates,
97
+ # Pig must first sort the data). You cannot use DISTINCT on a subset of fields. To
98
+ # do this, use FOREACH … GENERATE to select the fields, and then use DISTINCT.
99
+ #
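+ # A minimal sketch of that pattern (the relation and field names are
+ # illustrative only):
+ #
+ # B = FOREACH A GENERATE f1, f2;
+ # X = DISTINCT B;
+ #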
100
+ # == Example
101
+ #
102
+ # Suppose we have relation A.
103
+ #
104
+ # (A)
105
+ # ---------
106
+ # (8, 3, 4)
107
+ # (1, 2, 3)
108
+ # (4, 3, 3)
109
+ # (4, 3, 3)
110
+ # (1, 2, 3)
111
+ #
112
+ # In this example all duplicate tuples are removed.
113
+ #
114
+ # X = DISTINCT A;
115
+ #
116
+ # Relation X looks like this.
117
+ #
118
+ # (1, 2, 3)
119
+ # (4, 3, 3)
120
+ # (8, 3, 4)
121
+ #
122
+
123
+ # ---------------------------------------------------------------------------
124
+ #
125
+ # FILTER
126
+ #
127
+ # Selects tuples (rows) from a relation based on some condition.
128
+ #
129
+ # == Syntax
130
+ #
131
+ # alias = FILTER alias BY expression;
132
+ #
133
+ # == Terms
134
+ #
135
+ # alias::
136
+ # The name of a relation.
137
+ #
138
+ # BY::
139
+ # Required keyword.
140
+ #
141
+ # expression::
142
+ # An expression.
143
+ #
144
+ # == Usage
145
+ #
146
+ # Use the FILTER operator to work with tuples (rows) of data. FILTER is commonly
147
+ # used to select the data that you want; or, conversely, to filter out (remove)
148
+ # the data you don’t want.
149
+ #
150
+ # Note: If you want to work with specific fields (columns) of data, use the
151
+ # FOREACH …GENERATE operation.
152
+ #
153
+ # == Examples
154
+ #
155
+ # Suppose we have relation A.
156
+ #
157
+ # (A: f1:int, f2:int, f3:int)
158
+ # ----------------
159
+ # (1, 2, 3)
160
+ # (4, 2, 1)
161
+ # (8, 3, 4)
162
+ # (4, 3, 3)
163
+ # (7, 2, 5)
164
+ # (8, 4, 3)
165
+ #
166
+ # In this example the condition states that if the third field equals 3, then add the tuple to relation X.
167
+ #
168
+ # X = FILTER A BY f3 == 3;
169
+ #
170
+ # Relation X looks like this.
171
+ #
172
+ # (1, 2, 3)
173
+ # (4, 3, 3)
174
+ # (8, 4, 3)
175
+ #
176
+ # In this example the condition states that if the first field equals 8 or if the sum of fields f2 and f3 is not greater than the first field, then add the tuple to relation X.
177
+ #
178
+ # X = FILTER A BY (f1 == 8) OR (NOT (f2+f3 > f1));
179
+ #
180
+ # Relation X looks like this.
181
+ #
182
+ # (4, 2, 1)
183
+ # (8, 3, 4)
184
+ # (7, 2, 5)
185
+ # (8, 4, 3)
186
+ #
187
+
188
+ # ---------------------------------------------------------------------------
189
+ #
190
+ # FOREACH … GENERATE
191
+ #
192
+ # Generates data transformations based on fields (columns) of data.
193
+ #
194
+ # == Syntax
195
+ #
196
+ # alias = FOREACH { gen_blk | nested_gen_blk } [AS schema];
197
+ #
198
+ # == Terms
199
+ #
200
+ # alias::
201
+ # The name of a relation.
202
+ #
203
+ # gen_blk::
204
+ # FOREACH … GENERATE used with a non-nested relation. Use this syntax:
205
+ #
206
+ # alias = FOREACH alias GENERATE expression [expression ….]
207
+ #
208
+ # nested_gen_blk::
209
+ # FOREACH … GENERATE used with a nested relation. Use this syntax:
210
+ #
211
+ # alias = FOREACH nested_alias {
212
+ # alias = nested_op; [alias = nested_op; …]
213
+ # GENERATE expression [expression ….]
214
+ # };
215
+ #
216
+ # where:
217
+ # * The nested block is enclosed in opening and closing brackets { … }.
218
+ # * The GENERATE keyword must be the last statement within the nested block.
219
+ #
220
+ # expression::
221
+ # An expression.
222
+ #
223
+ # nested_alias::
224
+ # If one of the fields (columns) in a relation is a bag, the bag can be treated
225
+ # as an inner or a nested relation.
226
+ #
227
+ # nested_op::
228
+ # Allowable operations include FILTER, ORDER, and DISTINCT.
229
+ #
230
+ # The FOREACH … GENERATE operation itself is not allowed since this could lead
231
+ # to an arbitrary number of nesting levels.
232
+ #
233
+ # AS::
234
+ # Keyword.
235
+ #
236
+ # schema::
237
+ # A schema using the AS keyword (see Schemas).
238
+ #
239
+ # * If the FLATTEN keyword is used, enclose the schema in parentheses.
240
+ #
241
+ # * If the FLATTEN keyword is not used, don't enclose the schema in parentheses.
242
+ #
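+ # For instance (a sketch using relations A and C from the examples below):
+ #
+ # X = FOREACH C GENERATE FLATTEN(A) AS (a1:int, a2:int, a3:int);
+ # Y = FOREACH A GENERATE a1 + a2 AS sum_a:int;
+ #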
243
+ # == Usage
244
+ #
245
+ # Use the FOREACH …GENERATE operation to work with individual fields (columns) of data. The FOREACH …GENERATE operation works with non-nested and nested relations.
246
+ #
247
+ # A statement with a non-nested relation A could look like this.
248
+ #
249
+ # X = FOREACH A GENERATE f1;
250
+ #
251
+ # A statement with a nested relation A could look like this.
252
+ #
253
+ # X = FOREACH B {
254
+ #
255
+ # S = FILTER A by 'xyz';
256
+ #
257
+ # GENERATE COUNT (S.$0);
258
+ #
259
+ # }
260
+ #
261
+ # Note: FOREACH … GENERATE works with fields (columns) of data. If you want to work with entire tuples (rows) of data, use the FILTER operation.
262
+ #
263
+ # == Examples
264
+ #
265
+ # Suppose we have relations A and B, and derived relation C (where C = COGROUP A BY a1 INNER, B BY b1 INNER;).
266
+ #
267
+ # (A: a1:int, a2:int, a3:int)
268
+ # -----------------
269
+ # (1, 2, 3)
270
+ # (4, 2, 1)
271
+ # (8, 3, 4)
272
+ # (4, 3, 3)
273
+ # (7, 2, 5)
274
+ # (8, 4, 3)
275
+ #
276
+ #
277
+ # (B: b1:int, b2:int)
278
+ # ---------------
279
+ # (2, 4)
280
+ # (8, 9)
281
+ # (1, 3)
282
+ # (2, 7)
283
+ # (2, 9)
284
+ # (4, 6)
285
+ # (4, 9)
286
+ #
287
+ # (C: c1, c2, c3)
288
+ # ---------------------
289
+ # (1, {(1, 2, 3)}, {(1, 3)})
290
+ # (4, {(4, 2, 1), (4, 3, 3)}, {(4, 6), (4, 9)})
291
+ # (8, {(8, 3, 4), (8, 4, 3)}, {(8, 9)})
292
+ #
293
+ #
294
+ # == Example: Projection
295
+ #
296
+ # In this example the asterisk (*) is used to project all fields from relation A to relation X (this is similar to SQL Select *). Relations A and X are identical.
297
+ #
298
+ # X = FOREACH A GENERATE *;
299
+ #
300
+ # In this example two fields from relation A are projected to form relation X.
301
+ #
302
+ # X = FOREACH A GENERATE a1, a2;
303
+ #
304
+ # Relation X looks like this.
305
+ #
306
+ # (1, 2)
307
+ # (4, 2)
308
+ # (8, 3)
309
+ # (4, 3)
310
+ # (7, 2)
311
+ # (8, 4)
312
+ #
+ # == Example: Nested Projection
313
+ #
314
+ # Note: See GROUP for information about the "group" field in relation C.
315
+ #
316
+ # In this example if one of the fields in the input relation is a tuple, bag or map, we can perform projection on that field.
317
+ #
318
+ # X = FOREACH C GENERATE group, B.b2;
319
+ #
320
+ # Relation X looks like this.
321
+ #
322
+ # (1, {(3)})
323
+ # (4, {(6), (9)})
324
+ # (8, {(9)})
325
+ #
326
+ # In this example multiple nested columns are retained.
327
+ #
328
+ # X = FOREACH C GENERATE group, A.(a1, a2);
329
+ #
330
+ # Relation X looks like this.
331
+ #
332
+ # (1, {(1, 2)})
333
+ # (4, {(4, 2), (4, 3)})
334
+ # (8, {(8, 3), (8, 4)})
335
+ #
+ # == Example: Schema
336
+ #
337
+ # In this example two fields in relation A are summed to form relation X. A schema is defined for the projected field.
338
+ #
339
+ # X = FOREACH A GENERATE a1+a2 AS f1:int;
340
+ #
341
+ # Y = FILTER X by f1 > 10;
342
+ #
343
+ # Relations X and Y look like this.
344
+ #
345
+ # (X) (Y)
346
+ # ----- ------
347
+ # (3) (11)
348
+ # (6) (12)
349
+ # (11)
350
+ # (7)
351
+ # (9)
352
+ # (12)
353
+ #
354
+ # == Example: Applying Functions
355
+ #
356
+ # Note: See GROUP for information about the "group" field in relation C.
357
+ #
358
+ # In this example the built-in function SUM() is used to sum a set of numbers in a bag.
359
+ #
360
+ # X = FOREACH C GENERATE group, SUM (A.a1);
361
+ #
362
+ # Relation X looks like this.
363
+ #
364
+ # (1, 1)
365
+ # (4, 8)
366
+ # (8, 16)
367
+ #
+ # == Example: Flattening
368
+ #
369
+ # Note: See GROUP for information about the "group" field in relation C.
370
+ #
371
+ # In this example the FLATTEN keyword is used to eliminate nesting.
372
+ #
373
+ # X = FOREACH C GENERATE group, FLATTEN(A);
374
+ #
375
+ # Relation X looks like this.
376
+ #
377
+ # (1, 1, 2, 3)
378
+ # (4, 4, 2, 1)
379
+ # (4, 4, 3, 3)
380
+ # (8, 8, 3, 4)
381
+ # (8, 8, 4, 3)
382
+ #
383
+ # Another FLATTEN example.
384
+ #
385
+ # X = FOREACH C GENERATE group, FLATTEN(A.a3);
386
+ #
387
+ # Relation X looks like this.
388
+ #
389
+ # (1, 3)
390
+ # (4, 1)
391
+ # (4, 3)
392
+ # (8, 4)
393
+ # (8, 3)
394
+ #
395
+ # Another FLATTEN example.
396
+ #
397
+ # X = FOREACH C GENERATE FLATTEN(A.(a1, a2)), FLATTEN(B.$1);
398
+ #
399
+ # Relation X looks like this. Note that for the group '4' in C, there are two tuples in each bag. Thus, when both bags are flattened, the cross product of these tuples is returned; that is, tuples (4, 2, 6), (4, 3, 6), (4, 2, 9), and (4, 3, 9).
400
+ #
401
+ # (1, 2, 3)
402
+ # (4, 2, 6)
403
+ # (4, 3, 6)
404
+ # (4, 2, 9)
405
+ # (4, 3, 9)
406
+ # (8, 3, 9)
407
+ # (8, 4, 9)
408
+ #
409
+ # == Example: Nested Block
410
+ #
411
+ # Suppose we have relation A and derived relation B (where B = GROUP A BY url;). Since relation B contains tuples with bags it can be treated as a nested relation.
412
+ #
413
+ # A (url:chararray, outlink:chararray)
414
+ # ---------------------------------------------
415
+ # (www.ccc.com,www.hjk.com)
416
+ # (www.ddd.com,www.xyz.org)
417
+ # (www.aaa.com,www.cvn.org)
418
+ # (www.www.com,www.kpt.net)
419
+ # (www.www.com,www.xyz.org)
420
+ # (www.ddd.com,www.xyz.org)
421
+ #
422
+ #
423
+ # B
424
+ # ---------------------------------------------
425
+ # (www.aaa.com,{(www.aaa.com,www.cvn.org)})
426
+ # (www.ccc.com,{(www.ccc.com,www.hjk.com)})
427
+ # (www.ddd.com,{(www.ddd.com,www.xyz.org),(www.ddd.com,www.xyz.org)})
428
+ # (www.www.com,{(www.www.com,www.kpt.net),(www.www.com,www.xyz.org)})
429
+ #
430
+ # In this example we perform two of the allowed Pig operations, FILTER (FA) and DISTINCT (DA), as well as projection (PA). Note that the last statement in the nested block must be GENERATE.
431
+ #
432
+ # X = foreach B {
433
+ # FA = FILTER A BY outlink == 'www.xyz.org';
434
+ # PA = FA.outlink;
435
+ # DA = DISTINCT PA;
436
+ # GENERATE group, COUNT(DA);
437
+ # }
438
+ #
439
+ # Relation X looks like this.
440
+ #
441
+ # (www.ddd.com,1L)
442
+ # (www.www.com,1L)
443
+
444
+
445
+ # ---------------------------------------------------------------------------
446
+ #
447
+ # GROUP
448
+ #
449
+ # Groups the data in a single relation.
450
+ #
451
+ # == Syntax
452
+ #
453
+ # alias = GROUP alias
454
+ # [BY {[field_alias [, field_alias]] | * | [expression] } ]
455
+ # [ALL] [PARALLEL n];
456
+ #
457
+ # == Terms
458
+ #
459
+ # alias::
460
+ # The name of a relation.
461
+ #
462
+ # BY::
463
+ # Keyword. Use this clause to group the relation by fields or by expression.
464
+ #
465
+ # field_alias::
466
+ # The name of a field in a relation. This is the group key or key field.
467
+ #
468
+ # A relation can be grouped by a single field (f1) or by the composite value of
469
+ # multiple fields (f1,f2); see the composite-key sketch after the examples below.
470
+ #
471
+ # *::
472
+ # The asterisk. A designator for all fields in the relation.
473
+ #
474
+ # expression::
475
+ # An expression.
476
+ #
477
+ # ALL::
478
+ # Keyword. Use ALL if you want all tuples to go to a single group; for example, when doing aggregates across entire relations.
479
+ #
480
+ # PARALLEL n::
481
+ # Increase the parallelism of a job by specifying the number of reduce tasks,
482
+ # n. The optimal number of parallel tasks depends on the amount of memory on
483
+ # each node and the memory required by each of the tasks. To determine n, use
484
+ # the following as a general guideline:
485
+ # n = (nr_nodes - 1) * 0.45 * nr_GB
486
+ # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
487
+ # memory on each node.
488
+ #
489
+ # Note the following:
490
+ # * Parallel only affects the number of reduce tasks. Map parallelism is
491
+ # determined by the input file, one map for each HDFS block.
492
+ # * If you don’t specify parallel, you still get the same map parallelism but
493
+ # only one reduce task.
494
+ #
495
+ # == Usage
496
+ #
497
+ # The GROUP operator groups together tuples that have the same group key (key
498
+ # field). The result of a GROUP operation is a relation that includes one tuple
499
+ # per group. This tuple contains two fields:
500
+ #
501
+ # * The first field is named "group" (do not confuse this with the GROUP operator)
502
+ # and is of the same type as the group key.
503
+ #
504
+ # * The second field takes the name of the original relation and is type bag.
505
+ #
506
+ # Suppose we have the following data:
507
+ #
508
+ # john 25 3.6
509
+ # george 25 2.9
510
+ # anne 27 3.9
511
+ # julia 28 3.6
512
+ #
513
+ # And, suppose we perform the LOAD and GROUP statements shown below. We can use
514
+ # the DESCRIBE operator to view the schemas for relation Y. We can use DUMP to
515
+ # view the contents of Y.
516
+ #
517
+ # Note that relation Y has two fields. The first field is named "group" and is
518
+ # type int (the same as age). The second field takes the name of the original
519
+ # relation "X" and is type bag (that can contain tuples with three elements of
520
+ # type chararray, int, and float).
521
+ #
522
+ # Statements
523
+ #
524
+ # X = LOAD 'data' AS (name:chararray, age:int, gpa:float);
525
+ # Y = GROUP X BY age;
526
+ # DESCRIBE Y;
527
+ # Y: {group: int,X: {name: chararray,age: int,gpa: float}}
528
+ # DUMP Y;
529
+ #
530
+ # (25,{(john,25,3.6F),(george,25,2.9F)})
531
+ # (27,{(anne,27,3.9F)})
532
+ # (28,{(julia,28,3.6F)})
533
+ #
534
+ # As shown in this FOREACH statement, we can refer to the fields in relation Y by their names "group" and "X".
535
+ #
536
+ # Z = FOREACH Y GENERATE group, COUNT(X);
537
+ #
538
+ # Relation Z looks like this.
539
+ #
540
+ # (25,2L)
541
+ # (27,1L)
542
+ # (28,1L)
543
+ #
544
+ # == Examples
545
+ #
546
+ # Suppose we have relation A.
547
+ #
548
+ # A: (owner:chararray, pet:chararray)
549
+ # -----------------
550
+ # (Alice, turtle)
551
+ # (Alice, goldfish)
552
+ # (Alice, cat)
553
+ # (Bob, dog)
554
+ # (Bob, cat)
555
+ #
556
+ # In this example tuples are grouped using the field "owner."
557
+ #
558
+ # X = GROUP A BY owner;
559
+ #
560
+ # Relation X looks like this. "group" is the name of the first field. "A" is the
561
+ # name of the second field.
562
+ #
563
+ # (Alice, {(Alice, turtle), (Alice, goldfish), (Alice, cat)})
564
+ # (Bob, {(Bob, dog), (Bob, cat)})
565
+ #
566
+ # In this example tuples are grouped using the ALL keyword. Field "A" is then
567
+ # counted and projected to form relation Y.
568
+ #
569
+ # X = GROUP A ALL;
570
+ # Y = FOREACH X GENERATE COUNT(A);
571
+ #
572
+ # Relation X looks like this. "group" is the name of the first field. "A" is the
573
+ # name of the second field.
574
+ #
575
+ # (all,{(Alice,turtle),(Alice,goldfish),(Alice,cat),(Bob,dog),(Bob,cat)})
576
+ #
577
+ # Relation Y looks like this.
578
+ #
579
+ # (5L)
580
+ #
581
+ # Suppose we have relation S.
582
+ #
583
+ # S: (f1:chararray, f2:int, f3:int)
584
+ # -----------------
585
+ # (r1, 1, 2)
586
+ # (r2, 2, 1)
587
+ # (r3, 2, 8)
588
+ # (r4, 4, 4)
589
+ #
590
+ # In this example tuples are grouped using an expression, f2*f3.
591
+ #
592
+ # X = GROUP S BY f2*f3;
593
+ #
594
+ # Relation X looks like this. The first field is named "group". The second field is named "S".
595
+ #
596
+ # (2, {(r1, 1, 2), (r2, 2, 1)})
597
+ # (16, {(r3, 2, 8), (r4, 4, 4)})
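+ #
+ # A relation can also be grouped by a composite key of several fields. A
+ # minimal sketch, reusing relation S from above:
+ #
+ # X = GROUP S BY (f2, f3);
+ #
+ # Here each distinct (f2, f3) pair becomes one group key.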
598
+
599
+
600
+ # ---------------------------------------------------------------------------
601
+ #
602
+ # JOIN
603
+ #
604
+ # Joins two or more relations based on common field values.
605
+ #
606
+ # == Syntax
607
+ #
608
+ # alias = JOIN alias BY field_alias,
609
+ # alias BY field_alias [, alias BY field_alias …]
610
+ # [PARALLEL n];
611
+ #
612
+ # == Terms
613
+ #
614
+ # alias::
615
+ # The name of a relation.
616
+ #
617
+ # BY::
618
+ # Keyword.
619
+ #
620
+ # field_alias::
621
+ # The name of a field in a relation. The alias and field_alias specified in the
622
+ # BY clause must correspond.
623
+ #
624
+ # == Example:
625
+ # X = JOIN relationA BY fieldA, relationB by fieldB, relationC by fieldC;
626
+ #
627
+ # PARALLEL n::
628
+ # Increase the parallelism of a job by specifying the number of reduce tasks,
629
+ # n. The optimal number of parallel tasks depends on the amount of memory on
630
+ # each node and the memory required by each of the tasks. To determine n, use
631
+ # the following as a general guideline:
632
+ # n = (nr_nodes - 1) * 0.45 * nr_GB
633
+ # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
634
+ # memory on each node.
635
+ #
636
+ # Note the following:
637
+ # * Parallel only affects the number of reduce tasks. Map parallelism is
638
+ # determined by the input file, one map for each HDFS block.
639
+ # * If you don’t specify parallel, you still get the same map parallelism but
640
+ # only one reduce task.
641
+ #
642
+ # == Usage
643
+ #
644
+ # Use the JOIN operator to join two or more relations based on common field
645
+ # values. The JOIN operator always performs an inner join.
646
+ #
647
+ # Note: The JOIN and COGROUP operators perform similar functions. JOIN creates a
648
+ # flat set of output records while COGROUP creates a nested set of output records.
649
+ #
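+ # A sketch of the contrast, using relations A and B from the example below:
+ #
+ # J = JOIN A BY a1, B BY b1;      -- flat tuples such as (1, 2, 3, 1, 3)
+ # C = COGROUP A BY a1, B BY b1;   -- nested tuples such as (1, {(1, 2, 3)}, {(1, 3)})
+ #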
650
+ # == Example
651
+ #
652
+ # Suppose we have relations A and B.
653
+ #
654
+ # (A: a1, a2, a3) (B: b1, b2)
655
+ # ----------------- ---------------
656
+ # (1, 2, 3) (2, 4)
657
+ # (4, 2, 1) (8, 9)
658
+ # (8, 3, 4) (1, 3)
659
+ # (4, 3, 3) (2, 7)
660
+ # (7, 2, 5) (2, 9)
661
+ # (8, 4, 3) (4, 6)
662
+ # (4, 9)
663
+ #
664
+ # In this example relations A and B are joined on their first fields.
665
+ #
666
+ # X = JOIN A BY a1, B BY b1;
667
+ #
668
+ # Relation X looks like this.
669
+ #
670
+ # (1, 2, 3, 1, 3)
671
+ # (4, 2, 1, 4, 6)
672
+ # (4, 3, 3, 4, 6)
673
+ # (4, 2, 1, 4, 9)
674
+ # (4, 3, 3, 4, 9)
675
+ # (8, 3, 4, 8, 9)
676
+ # (8, 4, 3, 8, 9)
677
+ #
678
+
679
+
680
+ # ---------------------------------------------------------------------------
681
+ #
682
+ # LIMIT
683
+ #
684
+ # Limits the number of output tuples.
685
+ #
686
+ # == Syntax
687
+ #
688
+ # alias = LIMIT alias n;
689
+ #
690
+ # == Terms
691
+ #
692
+ # alias::
693
+ # The name of a relation.
694
+ #
695
+ # n::
696
+ # The number of tuples.
697
+ #
698
+ # == Usage
699
+ #
700
+ # Use the LIMIT operator to limit the number of output tuples (rows). If the
701
+ # specified number of output tuples is equal to or exceeds the number of tuples in
702
+ # the relation, the output will include all tuples in the relation.
703
+ #
704
+ # There is no guarantee which tuples will be returned, and the tuples that are
705
+ # returned can change from one run to the next. A particular set of tuples can be
706
+ # requested using the ORDER operator followed by LIMIT.
707
+ #
708
+ # Note: The LIMIT operator allows Pig to avoid processing all tuples in a
709
+ # relation. In most cases a query that uses LIMIT will run more efficiently than
710
+ # an identical query that does not use LIMIT. It is always a good idea to use
711
+ # limit if you can.
712
+ #
713
+ # == Examples
714
+ #
715
+ # Suppose we have relation A.
716
+ #
717
+ # (A: f1:int, f2:int, f3:int)
718
+ # -----------------
719
+ # (1, 2, 3)
720
+ # (4, 2, 1)
721
+ # (8, 3, 4)
722
+ # (4, 3, 3)
723
+ # (7, 2, 5)
724
+ # (8, 4, 3)
725
+ #
726
+ # In this example output is limited to 3 tuples.
727
+ #
728
+ # X = LIMIT A 3;
729
+ #
730
+ # Relation X could look like this (there is no guarantee which three tuples will be output).
731
+ #
732
+ # (1, 2, 3)
733
+ # (4, 3, 3)
734
+ # (7, 2, 5)
735
+ #
736
+ # In this example the ORDER operator is used to order the tuples and the LIMIT operator is used to output the first three tuples.
737
+ #
738
+ # B = ORDER A BY f1 DESC, f2 ASC;
739
+ # X = LIMIT B 3;
740
+ #
741
+ # Relation B and relation X look like this.
742
+ #
743
+ # (B) (X)
744
+ # ----------- -----------
745
+ # (8, 3, 4) (8, 3, 4)
746
+ # (8, 4, 3) (8, 4, 3)
747
+ # (7, 2, 5) (7, 2, 5)
748
+ # (4, 2, 1)
749
+ # (4, 3, 3)
750
+ # (1, 2, 3)
751
+
752
+
753
+ # ---------------------------------------------------------------------------
754
+ #
755
+ # LOAD
756
+ #
757
+ # Loads data from the file system.
758
+ #
759
+ # == Syntax
760
+ #
761
+ # LOAD 'data' [USING function] [AS schema];
762
+ #
763
+ # == Terms
764
+ #
765
+ # 'data'::
766
+ # The name of the file or directory, in single quotes.
767
+ #
768
+ # If you specify a directory name, all the files in the directory are loaded
+ # (see the directory and glob sketch after the examples below).
769
+ #
770
+ # You can use hadoop-supported globbing to specify files at the file system or
771
+ # directory levels (see the hadoop glob documentation for details on globbing
772
+ # syntax).
773
+ #
774
+ # USING::
775
+ # Keyword.
776
+ #
777
+ # function::
778
+ # The load function.
779
+ #
780
+ # PigStorage is the default load/store function and does not need to be
781
+ # specified. This function reads/writes simple newline-separated records with
782
+ # delimiter-separated fields. The function has one parameter, the field
783
+ # delimiter (tab (‘\t’) if the default delimiter).
784
+ #
785
+ # If the data is stored in a special format that the Pig load functions cannot
786
+ # parse, you can write your own load function.
787
+ #
788
+ # AS::
789
+ # Keyword.
790
+ #
791
+ # schema::
792
+ # A schema using the AS keyword, enclosed in parentheses (see Schemas).
793
+ #
794
+ # == Usage
795
+ #
796
+ # Use the LOAD operator to load data from the file system.
797
+ #
798
+ # == Examples
799
+ #
800
+ # Suppose we have a data file called myfile.txt. The fields are tab-delimited. The
801
+ # records are newline-separated.
802
+ #
803
+ # 1 2 3
804
+ # 4 2 1
805
+ # 8 3 4
806
+ #
807
+ # In this example the default load function, PigStorage, loads data from
808
+ # myfile.txt into relation A. Note that, because no schema is specified, the
809
+ # fields are not named and all fields default to type bytearray. The two
810
+ # statements are equivalent.
811
+ #
812
+ # A = LOAD 'myfile.txt';
813
+ # A = LOAD 'myfile.txt' USING PigStorage('\t');
814
+ #
815
+ # Relation A looks like this.
816
+ #
817
+ # (1, 2, 3)
818
+ # (4, 2, 1)
819
+ # (8, 3, 4)
820
+ #
821
+ # In this example a schema is specified using the AS keyword. The two statements
822
+ # are equivalent.
823
+ #
824
+ # A = LOAD 'myfile.txt' AS (f1:int, f2:int, f3:int);
825
+ # A = LOAD 'myfile.txt' USING PigStorage('\t') AS (f1:int, f2:int, f3:int);
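+ #
+ # As noted under the 'data' term above, a directory or a hadoop glob can be
+ # loaded directly (a sketch; the paths are made up):
+ #
+ # A = LOAD '/logs/2008';           -- loads every file under /logs/2008
+ # B = LOAD '/logs/2008/part-*';    -- hadoop glob syntax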
826
+
827
+
828
+ # ---------------------------------------------------------------------------
829
+ #
830
+ # ORDER
831
+ #
832
+ # Sorts a relation based on one or more fields.
833
+ #
834
+ # == Syntax
835
+ #
836
+ # alias = ORDER alias BY { * [ASC|DESC] | field_alias [ASC|DESC]
837
+ # [, field_alias [ASC|DESC] …] } [PARALLEL n];
838
+ #
839
+ # == Terms
840
+ #
841
+ # alias::
842
+ # The name of a relation.
843
+ #
844
+ # BY::
845
+ # Required keyword.
846
+ #
847
+ # *::
848
+ # Represents all fields in the relation.
849
+ #
850
+ # ASC::
851
+ # Sort in ascending order.
852
+ #
853
+ # DESC::
854
+ # Sort in descending order.
855
+ #
856
+ # field_alias::
857
+ # A field in the relation.
858
+ #
859
+ # PARALLEL n::
860
+ # Increase the parallelism of a job by specifying the number of reduce tasks,
861
+ # n. The optimal number of parallel tasks depends on the amount of memory on
862
+ # each node and the memory required by each of the tasks. To determine n, use
863
+ # the following as a general guideline:
864
+ # n = (nr_nodes - 1) * 0.45 * nr_GB
865
+ # where nr_nodes is the number of nodes used and nr_GB is the amount of physical
866
+ # memory on each node.
867
+ #
868
+ # Note the following:
869
+ # * Parallel only affects the number of reduce tasks. Map parallelism is
870
+ # determined by the input file, one map for each HDFS block.
871
+ # * If you don’t specify parallel, you still get the same map parallelism but
872
+ # only one reduce task.
873
+ #
874
+ # == Usage
875
+ #
876
+ # In Pig, relations are logically unordered.
877
+ #
878
+ # * If you order relation A to produce relation X (X = ORDER A BY * DESC;),
879
+ # relations A and X still contain the same thing.
880
+ #
881
+ # * If you retrieve the contents of relation X, they are guaranteed to be in the
882
+ # order you specified (descending).
883
+ #
884
+ # * However, if you further process relation X, there is no guarantee that the
885
+ # contents will be processed in the order you specified.
886
+ #
887
+ # == Examples
888
+ #
889
+ # Suppose we have relation A.
890
+ #
891
+ # (A: f1, f2, f3)
892
+ # -----------------
893
+ # (1, 2, 3)
894
+ # (4, 2, 1)
895
+ # (8, 3, 4)
896
+ # (4, 3, 3)
897
+ # (7, 2, 5)
898
+ # (8, 4, 3)
899
+ #
900
+ # In this example relation A is sorted by the third field, f3 in descending order.
901
+ #
902
+ # X = ORDER A BY f3 DESC;
903
+ #
904
+ # Relation X could look like this (note that the order of the three tuples ending
905
+ # in 3 can vary).
906
+ #
907
+ # (7, 2, 5)
908
+ # (8, 3, 4)
909
+ # (1, 2, 3)
910
+ # (4, 3, 3)
911
+ # (8, 4, 3)
912
+ # (4, 2, 1)
913
+
914
+
915
+ # ---------------------------------------------------------------------------
916
+ #
917
+ # SPLIT
918
+ #
919
+ # Partitions a relation into two or more relations.
920
+ #
921
+ # == Syntax
922
+ #
923
+ # SPLIT alias INTO alias IF expression, alias IF expression [, alias IF expression …];
924
+ #
925
+ # == Terms
926
+ #
927
+ # alias::
928
+ # The name of a relation.
929
+ #
930
+ # INTO::
931
+ # Required keyword.
932
+ #
933
+ # IF::
934
+ # Required keyword.
935
+ #
936
+ # expression::
937
+ # An expression.
938
+ #
939
+ # == Usage
940
+ #
941
+ # Use the SPLIT operator to partition a relation into two or more relations based
942
+ # on some expression. Depending on the expression:
943
+ #
944
+ # * A tuple may be assigned to more than one relation.
945
+ #
946
+ # * A tuple may not be assigned to any relation.
947
+ #
948
+ # == Example
949
+ #
950
+ # Suppose we have relation A.
951
+ #
952
+ # (A: f1, f2, f3)
953
+ # -----------------
954
+ # (1, 2, 3)
955
+ # (4, 5, 6)
956
+ # (7, 8, 9)
957
+ #
958
+ # In this example relation A is split into three relations, X, Y, and Z.
959
+ #
960
+ # SPLIT A INTO X IF f1< 7, Y IF f2==5, Z IF (f3<6 OR f3>6);
961
+ #
962
+ # Relations X, Y, and Z look like this.
963
+ #
964
+ # (X) (Y) (Z)
965
+ # ---------- ----------- -----------
966
+ # (1, 2, 3) (4, 5, 6) (1, 2, 3)
967
+ # (4, 5, 6) (7, 8, 9)
968
+
969
+
970
+ # ---------------------------------------------------------------------------
971
+ #
972
+ # STORE
973
+ #
974
+ # Stores data to the file system.
975
+ #
976
+ # == Syntax
977
+ #
978
+ # STORE alias INTO 'directory' [USING function];
979
+ #
980
+ # == Terms
981
+ #
982
+ # alias::
983
+ # The name of a relation.
984
+ #
985
+ # INTO::
986
+ # Required keyword.
987
+ #
988
+ # 'directory'::
989
+ # The name of the storage directory, in quotes. If the directory already exists, the STORE operation will fail.
990
+ #
991
+ #
992
+ #
993
+ # The output data files, named part-nnnnn, are written to this directory.
994
+ #
995
+ # USING::
996
+ # Keyword. Use this clause to name the store function.
997
+ #
998
+ # function::
999
+ # The store function.
1000
+ #
1001
+ # PigStorage is the default load/store function and does not need to be specified. This function reads/writes simple newline-separated records with delimiter-separated fields. The function has one parameter, the field delimiter; tab ('\t') is the default delimiter.
1002
+ #
1003
+ # If you want to store the data in a special format that the Pig Load/Store functions cannot handle, you can write your own store function.
1004
+ #
1005
+ # == Usage
1006
+ #
1007
+ # Use the STORE operator to store data on the file system.
1008
+ #
1009
+ # == Example
1010
+ #
1011
+ # Suppose we have relation A.
1012
+ #
1013
+ # (A)
1014
+ #
1015
+ # ----------------
1016
+ # (1, 2, 3)
1017
+ # (4, 2, 1)
1018
+ # (8, 3, 4)
1019
+ # (4, 3, 3)
1020
+ # (7, 2, 5)
1021
+ # (8, 4, 3)
1022
+ #
1023
+ # In this example the contents of relation A are written to file part-00000 located in directory myoutput.
1024
+ #
1025
+ # STORE A INTO 'myoutput' USING PigStorage('*');
1026
+ #
1027
+ # The part-00000 file looks like this. Fields are delimited with the asterisk * characters and records are separated by newlines.
1028
+ #
1029
+ # 1*2*3
1030
+ # 4*2*1
1031
+ # 8*3*4
1032
+ # 4*3*3
1033
+ # 7*2*5
1034
+ # 8*4*3
1035
+ #
1036
+
1037
+
1038
+ # ---------------------------------------------------------------------------
1039
+ #
1040
+ # STREAM
1041
+ #
1042
+ # Sends data to an external script or program.
1043
+ #
1044
+ # == Syntax
1045
+ #
1046
+ # alias = STREAM alias [, alias …] THROUGH {`command` | cmd_alias } [AS schema] ;
1047
+ #
1048
+ # == Terms
1049
+ #
1050
+ # alias::
1051
+ # The name of a relation.
1052
+ #
1053
+ # THROUGH::
1054
+ # Keyword.
1055
+ #
1056
+ # `command`::
1057
+ # A command, including the arguments, enclosed in backticks (where a command is anything that can be executed).
1058
+ #
1059
+ # cmd_alias::
1060
+ # The name of a command created using the DEFINE operator.
1061
+ #
1062
+ # AS::
1063
+ # Keyword.
1064
+ #
1065
+ # schema::
1066
+ # A schema using the AS keyword, enclosed in parentheses (see Schemas).
1067
+ #
1068
+ # == Usage
1069
+ #
1070
+ # Use the STREAM operator to send data through an external script or program. Multiple stream operators can appear in the same Pig script. The stream operators can be adjacent to each other or have other operations in between.
1071
+ #
1072
+ # When used with a command, a stream statement could look like this:
1073
+ #
1074
+ # A = LOAD 'data';
1075
+ #
1076
+ # B = STREAM A THROUGH `stream.pl -n 5`;
1077
+ #
1078
+ # When used with a cmd_alias, a stream statement could look like this, where cmd is the defined alias.
1079
+ #
1080
+ # A = LOAD 'data';
1081
+ #
1082
+ # DEFINE cmd `stream.pl -n 5`;
1083
+ #
1084
+ # B = STREAM A THROUGH cmd;
1085
+ #
+ # == About Data Guarantees
1086
+ #
1087
+ # Data guarantees are determined based on the position of the streaming operator in the Pig script.
1088
+ #
1089
+ # * Unordered data – No guarantee for the order in which the data is delivered to
1090
+ # the streaming application.
1091
+ #
1092
+ # * Grouped data – The data for the same grouped key is guaranteed to be provided
1093
+ # to the streaming application contiguously
1094
+ #
1095
+ # * Grouped and ordered data – The data for the same grouped key is guaranteed to
1096
+ # be provided to the streaming application contiguously. Additionally, the data
1097
+ # within the group is guaranteed to be sorted by the provided secondary key.
1098
+ #
1099
+ # In addition to position, data grouping and ordering can be determined by the
1100
+ # data itself. However, you need to know the property of the data to be able to
1101
+ # take advantage of its structure.
1102
+ #
1103
+ # == Example: Data Guarantees
1104
+ #
1105
+ # In this example the data is unordered.
1106
+ #
1107
+ # A = LOAD 'data';
1108
+ # B = STREAM A THROUGH `stream.pl`;
1109
+ #
1110
+ # In this example the data is grouped.
1111
+ #
1112
+ # A = LOAD 'data';
1113
+ # B = GROUP A BY $1;
1114
+ # C = FOREACH B GENERATE FLATTEN(A);
1115
+ # D = STREAM C THROUGH `stream.pl`;
1116
+ #
1117
+ # In this example the data is grouped and ordered.
1118
+ #
1119
+ # A = LOAD 'data';
1120
+ # B = GROUP A BY $1;
1121
+ # C = FOREACH B {
1122
+ # D = ORDER A BY ($3, $4);
1123
+ # GENERATE D;
1124
+ # }
1125
+ # E = STREAM C THROUGH `stream.pl`;
1126
+ #
1127
+ # == Example: Schemas
1128
+ #
1129
+ # In this example a schema is specified as part of the STREAM statement.
1130
+ #
1131
+ # X = STREAM A THROUGH `stream.pl` AS (f1:int, f2:int, f3:int);
1132
+ #
1133
+ # == Additional Examples
1134
+ #
1135
+ # See DEFINE for additional examples.
1136
+
1137
+
1138
+ # ---------------------------------------------------------------------------
1139
+ #
1140
+ # UNION
1141
+ #
1142
+ # Computes the union of two or more relations.
1143
+ #
1144
+ # == Syntax
1145
+ #
1146
+ # alias = UNION alias, alias [, alias …];
1147
+ #
1148
+ # == Terms
1149
+ #
1150
+ # alias::
1151
+ # The name of a relation.
1152
+ #
1153
+ # == Usage
1154
+ #
1155
+ # Use the UNION operator to compute the union of two or more relations. The UNION operator:
1156
+ #
1157
+ # * Does not preserve the order of tuples. Both the input and output relations are
1158
+ # interpreted as unordered bags of tuples.
1159
+ #
1160
+ # * Does not ensure (as databases do) that all tuples adhere to the same schema or
1161
+ # that they have the same number of fields. In a typical scenario, however, this
1162
+ # should be the case; therefore, it is the user's responsibility to either (1)
1163
+ # ensure that the tuples in the input relations have the same schema or (2) be
1164
+ # able to process varying tuples in the output relation.
1165
+ #
1166
+ # * Does not eliminate duplicate tuples.
1167
+ #
1168
+ # == Example
1169
+ #
1170
+ # Suppose we have relations A and B.
1171
+ #
1172
+ # (A) (B)
1173
+ # ----------- --------
1174
+ # (1, 2, 3) (2, 4)
1175
+ # (4, 2, 1) (8, 9)
1176
+ # (1, 3)
1177
+ #
1178
+ # In this example the union of relation A and B is computed.
1179
+ #
1180
+ # X = UNION A, B;
1181
+ #
1182
+ # Relation X looks like this.
1183
+ #
1184
+ # (1, 2, 3)
1185
+ # (4, 2, 1)
1186
+ # (2, 4)
1187
+ # (8, 9)
1188
+ # (1, 3)
1189
+ #
+
+
+ # ---------------------------------------------------------------------------
+ #
+ # = Diagnostic Operators
+ #
1190
+ # DESCRIBE
1191
+ #
1192
+ # Returns the schema of an alias.
1193
+ #
1194
+ # == Syntax
1195
+ #
1196
+ # DESCRIBE alias;
1197
+ #
1198
+ # == Terms
1199
+ #
1200
+ # alias::
1201
+ # The name of a relation.
1202
+ #
1203
+ # == Usage
1204
+ #
1205
+ # Use the DESCRIBE operator to review the schema of a particular alias.
1206
+ #
1207
+ # == Example
1208
+ #
1209
+ # In this example a schema is specified using the AS clause.
1210
+ #
1211
+ # A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
1212
+ # B = FILTER A BY name matches 'John%';
1213
+ # C = GROUP B BY name;
1214
+ # D = FOREACH B GENERATE COUNT(B.age);
1215
+ # DESCRIBE A;
1216
+ # A: {name: chararray,age: int,gpa: float}
1217
+ # DESCRIBE B;
1218
+ # B: {name: chararray,age: int,gpa: float}
1219
+ # DESCRIBE C;
1220
+ # C: {group: chararray,B: {name: chararray,age: int,gpa: float}}
1221
+ # DESCRIBE D;
1222
+ # D: {long}
1223
+ #
1224
+ # In this example no schema is specified. All data items default to type bytearray.
1225
+ #
1226
+ # grunt> a = LOAD '/data/students';
1227
+ # grunt> b = FILTER a BY $0 matches 'John%';
1228
+ # grunt> c = GROUP b BY $0;
1229
+ # grunt> d = FOREACH c GENERATE COUNT(b.$1);
1230
+ # grunt> DESCRIBE a;
1231
+ #
1232
+ # Schema for a unknown.
1233
+ #
1234
+ # grunt> DESCRIBE b;
1235
+ # 2008-12-05 01:17:15,316 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
1236
+ #
1237
+ # Schema for b unknown.
1238
+ #
1239
+ # grunt> DESCRIBE c;
1240
+ # 2008-12-05 01:17:23,343 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
1241
+ #
1242
+ # c: {group: bytearray,b: {null}}
1243
+ #
1244
+ # grunt> DESCRIBE d;
1245
+ # 2008-12-05 03:04:30,076 [main] WARN org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
1246
+ #
1247
+ # d: {long}
1248
+ #
1249
+ # DUMP
1250
+ #
1251
+ # Displays the contents of an alias.
1252
+ #
1253
+ # == Syntax
1254
+ #
1255
+ # DUMP alias;
1256
+ #
1257
+ # == Terms
1258
+ #
1259
+ # alias::
1260
+ # The name of a relation.
1261
+ #
1262
+ # == Usage
1263
+ #
1264
+ # Use the DUMP operator to display the contents of an alias. You can use DUMP as a
1265
+ # debugging device to make sure the correct results are being generated.
1266
+ #
1267
+ # == Example
1268
+ #
1269
+ # In this example a dump is performed after each statement.
1270
+ #
1271
+ # A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
1272
+ # DUMP A;
1273
+ # B = FILTER A BY name matches 'John%';
1274
+ # DUMP B;
1275
+ # C = GROUP B BY name;
1276
+ # DUMP C;
1277
+ # D = FOREACH C GENERATE COUNT(B.age);
1278
+ # DUMP D;
1279
+ #
1280
+ # EXPLAIN
1281
+ #
1282
+ # Displays execution plans.
1283
+ #
1284
+ # == Syntax
1285
+ #
1286
+ # EXPLAIN alias;
1287
+ #
1288
+ # == Terms
1289
+ #
1290
+ # alias::
1291
+ # The name of a relation.
1292
+ #
1293
+ # == Usage
1294
+ #
1295
+ # Use the EXPLAIN operator to review the logical, physical, and map reduce
1296
+ # execution plans that are used to compute the specified relationship.
1297
+ #
1298
+ # * The logical plan shows a pipeline of operators to be executed to build the
1299
+ # relation. Type checking and backend-independent optimizations (such as
1300
+ # applying filters early on) also apply.
1301
+ #
1302
+ # * The physical plan shows how the logical operators are translated to
1303
+ # backend-specific physical operators. Some backend optimizations also apply.
1304
+ #
1305
+ # * The map reduce plan shows how the physical operators are grouped into map
1306
+ # reduce jobs.
1307
+ #
1308
+ # == Example
1309
+ #
1310
+ # In this example the EXPLAIN operator produces all three plans. (Note that only a
1311
+ # portion of the output is shown in this example.)
1312
+ #
1313
+ # A = LOAD 'students' AS (name:chararray, age:int, gpa:float);
1314
+ # B = GROUP A BY name;
1315
+ # C = FOREACH B GENERATE COUNT(A.age);
1316
+ # EXPLAIN C;
1317
+ #
1318
+ #
1319
+ # Logical Plan:
1320
+ #
1321
+ # Store xxx-Fri Dec 05 19:42:29 UTC 2008-23 Schema: {long} Type: Unknown
1322
+ # |
1323
+ # |---ForEach xxx-Fri Dec 05 19:42:29 UTC 2008-15 Schema: {long} Type: bag
1324
+ # etc …
1325
+ #
1326
+ # -----------------------------------------------
1327
+ # Physical Plan:
1328
+ # -----------------------------------------------
1329
+ # Store(fakefile:org.apache.pig.builtin.PigStorage) - xxx-Fri Dec 05 19:42:29 UTC 2008-40
1330
+ # |
1331
+ # |---New For Each(false)[bag] - xxx-Fri Dec 05 19:42:29 UTC 2008-39
1332
+ # | |
1333
+ # | POUserFunc(org.apache.pig.builtin.COUNT)[long] - xxx-Fri Dec 05
1334
+ # etc …
1335
+ #
1336
+ # --------------------------------------------------
1337
+ # | Map Reduce Plan |
1338
+ # --------------------------------------------------
1339
+ # MapReduce node xxx-Fri Dec 05 19:42:29 UTC 2008-41
1340
+ # Map Plan
1341
+ # Local Rearrange[tuple]{chararray}(false) - xxx-Fri Dec 05 19:42:29 UTC 2008-34
1342
+ # | |
1343
+ # | Project[chararray][0] - xxx-Fri Dec 05 19:42:29 UTC 2008-35
1344
+ # etc …
1345
+ # ILLUSTRATE
1346
+ #
1347
+ # Displays a step-by-step execution of a sequence of statements.
1348
+ #
1349
+ # == Syntax
1350
+ #
1351
+ # ILLUSTRATE alias;
1352
+ #
1353
+ # == Terms
1354
+ #
1355
+ # alias::
1356
+ # The name of a relation.
1357
+ #
1358
+ # == Usage
1359
+ #
1360
+ # Use the ILLUSTRATE operator to review how data items are transformed through a
1361
+ # sequence of Pig Latin statements.
1362
+ #
1363
+ # ILLUSTRATE accesses the ExampleGenerator algorithm which can select an
1364
+ # appropriate and concise set of example data items automatically. It does a
1365
+ # better job than random sampling would do; for example, random sampling suffers
1366
+ # from the drawback that selective operations such as filters or joins can
1367
+ # eliminate all the sampled data items, giving you empty results which is of no
1368
+ # help with debugging.
1369
+ #
1370
+ # With the ILLUSTRATE operator you can test your programs on small datasets and
1371
+ # get faster turnaround times. The ExampleGenerator algorithm uses Pig's Local
1372
+ # mode (rather than Hadoop mode) which means that illustrative example data is
1373
+ # generated in near real-time.
1374
+ #
1375
+ # == Example
1376
+ #
1377
+ # Suppose we have a data file called 'visits.txt'.
1378
+ # Amy cnn.com 20080218
1379
+ # Fred harvard.edu 20081204
1380
+ # Amy bbc.com 20081205
1381
+ # Fred stanford.edu 20081206
1382
+ #
1383
+ # In this example we count the number of sites a user has visited since
1384
+ # 12/1/08. The ILLUSTRATE statement will show how the results for num_user_visits
1385
+ # are derived.
1386
+ #
1387
+ # visits = LOAD 'visits.txt' AS (user:chararray, url:chararray, timestamp:chararray);
1388
+ #
1389
+ # recent_visits = FILTER visits BY timestamp >= '20081201';
1390
+ #
1391
+ # user_visits = GROUP recent_visits BY user;
1392
+ #
1393
+ # num_user_visits = FOREACH user_visits GENERATE COUNT(recent_visits);
1394
+ #
1395
+ # ILLUSTRATE num_user_visits
1396
+ #
1397
+ # The output from the ILLUSTRATE statement looks like this.
1398
+ #
1399
+ # ------------------------------------------------------------------------
1400
+ #
1401
+ # | visits | user: bytearray | url: bytearray | timestamp: bytearray |
1402
+ # ------------------------------------------------------------------------
1403
+ # | | Amy | cnn.com | 20080218 |
1404
+ # | | Fred | harvard.edu | 20081204 |
1405
+ # | | Amy | bbc.com | 20081205 |
1406
+ # | | Fred | stanford.edu | 20081206 |
1407
+ # ------------------------------------------------------------------------
1408
+ #
1409
+ # -------------------------------------------------------------------------------
1410
+ # | recent_visits | user: chararray | url: chararray | timestamp: chararray |
1411
+ # -------------------------------------------------------------------------------
1412
+ # | | Fred | harvard.edu | 20081204 |
1413
+ # | | Amy | bbc.com | 20081205 |
1414
+ # | | Fred | stanford.edu | 20081206 |
1415
+ # -------------------------------------------------------------------------------
1416
+ #
1417
+ # ------------------------------------------------------------------------------------------------------------------
1418
+ # | user_visits | group: chararray | recent_visits: bag({user: chararray,url: chararray,timestamp: chararray}) |
1419
+ # ------------------------------------------------------------------------------------------------------------------
1420
+ # | | Amy | {(Amy, bbc.com, 20081205)} |
1421
+ # | | Fred | {(Fred, harvard.edu, 20081204), (Fred, stanford.edu, 20081206)} |
1422
+ # ------------------------------------------------------------------------------------------------------------------
1423
+ #
1424
+ # -------------------------------
1425
+ # | num_user_visits | long |
1426
+ # -------------------------------
1427
+ # | | 1 |
1428
+ # | | 2 |
1429
+ # -------------------------------
1430
+ #
1431
+
1432
+ # ---------------------------------------------------------------------------
1433
+ #
1434
+ # DEFINE
1435
+ #
1436
+ # Assigns an alias to a function or command.
1437
+ #
1438
+ # == Syntax
1439
+ #
1440
+ # DEFINE alias {function | [`command` [input] [output] [ship] [cache]] };
1441
+ #
1442
+ # == Terms
1443
+ #
1444
+ # alias::
1445
+ # The name for the function or command.
1446
+ #
1447
+ # function::
1448
+ # The name of a function.
1449
+ #
1450
+ # Use this option to define functions for use with the FOREACH and FILTER operators.
1451
+ #
1452
+ # `command`::
1453
+ # A command, including the arguments, enclosed in backticks (where a command is anything that can be executed).
1454
+ #
1455
+ # Use this option to define commands for use with the STREAM operator.
1456
+ #
1457
+ # input::
1458
+ # INPUT ( {stdin | 'path'} [USING serializer] [, {stdin | 'path'} [USING serializer] …] )
1459
+ #
1460
+ # Where:
1461
+ # * INPUT – Keyword.
1462
+ # * 'path' – A file path, enclosed in single quotes.
1463
+ # * USING – Keyword.
1464
+ # * serializer – A function that converts data from tuples to stream format. PigStorage is the default serializer. You can also write your own UDF.
1465
+ #
1466
+ # output::
1467
+ # OUTPUT ( {stdout | stderr | 'path'} [USING deserializer] [, {stdout | stderr | 'path'} [USING deserializer] …] )
1468
+ #
1469
+ # Where:
1470
+ #
1471
+ # * OUTPUT – Keyword.
1472
+ # * 'path' – A file path, enclosed in single quotes.
1473
+ # * USING – Keyword.
1474
+ # * deserializer – A function that converts data from stream format to tuples. PigStorage is the default deserializer. You can also write your own UDF.
1475
+ #
1476
+ # ship::
1477
+ # SHIP('path' [, 'path' …])
1478
+ #
1479
+ # Where:
1480
+ #
1481
+ # * SHIP – Keyword.
1482
+ # * 'path' – A file path, enclosed in single quotes.
1483
+ #
1484
+ # cache::
1485
+ # CACHE('dfs_path#dfs_file' [, 'dfs_path#dfs_file' …])
1486
+ #
1487
+ # Where:
1488
+ #
1489
+ # * CACHE – Keyword.
1490
+ # * 'dfs_path#dfs_file' – A file path/file name on the distributed file system,
1491
+ # enclosed in single quotes. Example: '/mydir/mydata.txt#mydata.txt'
1492
+ #
1493
+ #
1494
+ # == Usage
1495
+ #
1496
+ # Use the DEFINE statement to assign a name (alias) to a function or to a command.
1497
+ #
1498
+ # Use DEFINE to specify a function when:
1499
+ #
1500
+ # * The function has a long package name that you don't want to include in a
1501
+ # script, especially if you call the function several times in that script.
1502
+ #
1503
+ # * The constructor for the function takes parameters (see the first example
1504
+ # below). If you need to use different constructor parameters for different
1505
+ # calls to the function you will need to create multiple defines – one for each
1506
+ # parameter set.
1507
+ #
1508
+ # Use DEFINE to specify a command when the streaming command specification is
1509
+ # complex or requires additional parameters (input, output, and so on).
1510
+ #
1511
+ # === About Input and Output
1512
+ #
1513
+ # Serialization is needed to convert data from tuples to a format that can be
1514
+ # processed by the streaming application. Deserialization is needed to convert the
1515
+ # output from the streaming application back into tuples.
1516
+ #
1517
+ # PigStorage, the default serialization/deserialization function, converts tuples
1518
+ # to tab-delimited lines. Pig's BinarySerializer and BinaryDeserializer functions
1519
+ # treat the entire file as a byte stream (no formatting or interpretation takes
1520
+ # place). You can also write your own serialization/deserialization functions.
1521
+ #
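+ # For instance, to pass the raw byte stream to the command untouched (a
+ # sketch, assuming the script expects unformatted bytes):
+ #
+ # DEFINE Y `stream.pl` INPUT(stdin USING BinarySerializer) OUTPUT(stdout USING BinaryDeserializer);
+ # X = STREAM A THROUGH Y;
+ #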
1522
+ # === About Ship
1523
+ #
1524
+ # Use the ship option to send streaming binary and supporting files, if any, from
1525
+ # the client node to the compute nodes. Pig does not automatically ship
1526
+ # dependencies; it is your responsibility to explicitly specify all the
1527
+ # dependencies and to make sure that the software the processing relies on (for
1528
+ # instance, perl or python) is installed on the cluster. Supporting files are
1529
+ # shipped to the task's current working directory and only relative paths should
1530
+ # be specified. Any pre-installed binaries should be specified in the path.
1531
+ #
1532
+ # Only files, not directories, can be specified with the ship option. One way to
1533
+ # work around this limitation is to tar all the dependencies into a tar file that
1534
+ # accurately reflects the structure needed on the compute nodes, then have a
1535
+ # wrapper for your script that un-tars the dependencies prior to execution.
1536
+ #
1537
+ # Note that the ship option has two components: the source specification, provided
1538
+ # in the ship clause, is the view of your machine; the command specification is
1539
+ # the view of the cluster. The only guarantee is that the shipped files are
1540
+ # available in the current working directory of the launched job and that your
1541
+ # current working directory is also on the PATH environment variable.
1542
+ #
1543
+ # Shipping files to relative paths or absolute paths is not supported since you
1544
+ # might not have permission to read/write/execute from arbitrary paths on the
1545
+ # clusters.
1546
+ #
1547
+ # === About Cache
1548
+ #
1549
+ # The ship option works with binaries, jars, and small datasets. However, loading
1550
+ # larger datasets at run time for every execution can severely impact
1551
+ # performance. Instead, use the cache option to access large files already moved
1552
+ # to and available on the compute nodes. Only files, not directories, can be
1553
+ # specified with the cache option.
1554
+ #
1555
+ # == Example: Input/Output
1556
+ #
1557
+ # In this example PigStorage is the default serialization/deserialization
1558
+ # function. The tuples from relation A are converted to tab-delimited lines that
1559
+ # are passed to the script.
1560
+ #
1561
+ # X = STREAM A THROUGH `stream.pl`;
1562
+ #
1563
+ # In this example PigStorage is used as the serialization/deserialization
1564
+ # function, but a comma is used as the delimiter.
1565
+ #
1566
+ # DEFINE Y `stream.pl` INPUT(stdin USING PigStorage(',')) OUTPUT (stdout USING PigStorage(','));
1567
+ # X = STREAM A THROUGH Y;
1568
+ #
1569
+ # In this example user-defined serialization/deserialization functions are used
1570
+ # with the script.
1571
+ #
1572
+ # DEFINE Y `stream.pl` INPUT(stdin USING MySerializer) OUTPUT (stdout USING MyDeserializer);
1573
+ # X = STREAM A THROUGH Y;
1574
+ #
1575
+ # == Example: Ship/Cache
1576
+ #
1577
+ # In this example ship is used to send the script to the cluster compute nodes.
1578
+ #
1579
+ # DEFINE Y `stream.pl` SHIP('/work/stream.pl');
1580
+ # X = STREAM A THROUGH Y;
1581
+ #
1582
+ # In this example cache is used to specify a file located on the cluster compute
1583
+ # nodes.
1584
+ #
1585
+ # DEFINE Y `stream.pl data.gz` SHIP('/work/stream.pl') CACHE('/input/data.gz#data.gz');
1586
+ # X = STREAM A THROUGH Y;
1587
+ #
1588
+ # == Example: Logging
1589
+ #
1590
+ # In this example the streaming stderr is stored in the _logs/<dir> directory of
1591
+ # the job's output directory. Because the job can have multiple streaming
1592
+ # applications associated with it, you need to ensure that different directory
1593
+ # names are used to avoid conflicts. Pig stores up to 100 tasks per streaming job.
1594
+ #
1595
+ # DEFINE Y `stream.pl` stderr('<dir>' limit 100);
1596
+ # X = STREAM A THROUGH Y;
1597
+ #
1598
+ # In this example a function is defined for use with the FOREACH …GENERATE operator.
1599
+ # grunt> REGISTER /src/myfunc.jar
1600
+ # grunt> define myFunc myfunc.MyEvalfunc('foo');
1601
+ # grunt> A = LOAD 'students';
1602
+ # grunt> B = FOREACH A GENERATE myFunc($0);
1603
+ #
1604
+ # In this example a command is defined for use with the STREAM operator.
1605
+ # grunt> A = LOAD 'data';
1606
+ # grunt> DEFINE cmd `stream_cmd -input file.dat`;
1607
+ # grunt> B = STREAM A THROUGH cmd;
1608
+ #
1609
+
1610
+
1611
+ # ---------------------------------------------------------------------------
1612
+ #
1613
+ # = REGISTER
1614
+ #
1615
+ # Registers a JAR file so that the UDFs in the file can be used.
1616
+ #
1617
+ # == Syntax
1618
+ #
1619
+ # REGISTER alias;
1620
+ #
1621
+ # == Terms
1622
+ #
1623
+ # alias::
+ # The path of a Java JAR file. Do not place the name in quotes.
1624
+ #
1625
+ # == Usage
1626
+ #
1627
+ # Use the REGISTER statement to specify the path of a Java JAR file containing UDFs.
1628
+ #
1629
+ # For more information about UDFs, see the User Defined Function Guide. Note that
1630
+ # Pig currently only supports functions written in Java.
1631
+ #
1632
+ # == Example
1633
+ #
1634
+ # In this example REGISTER states that myfunc.jar is located in the /src
1635
+ # directory.
1636
+ #
1637
+ # grunt> REGISTER /src/myfunc.jar;
1638
+ # grunt> A = LOAD 'students';
1639
+ # grunt> B = FOREACH A GENERATE myfunc.MyEvalFunc($0);
1640
+ #