wukong 1.4.7 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/CHANGELOG.textile +9 -0
  2. data/README.textile +1 -1
  3. data/bin/hdp-bzip +28 -0
  4. data/bin/hdp-mkdir +1 -1
  5. data/bin/hdp-stream-flat +3 -2
  6. data/bin/wu-lign +32 -18
  7. data/docpages/pig/cookbook.html +481 -0
  8. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  9. data/docpages/pig/images/instruction_arrow.png +0 -0
  10. data/docpages/pig/images/pig-logo.gif +0 -0
  11. data/docpages/pig/piglatin_ref1.html +1103 -0
  12. data/docpages/pig/piglatin_ref2.html +14340 -0
  13. data/docpages/pig/setup.html +505 -0
  14. data/docpages/pig/skin/basic.css +166 -0
  15. data/docpages/pig/skin/breadcrumbs.js +237 -0
  16. data/docpages/pig/skin/fontsize.js +166 -0
  17. data/docpages/pig/skin/getBlank.js +40 -0
  18. data/docpages/pig/skin/getMenu.js +45 -0
  19. data/docpages/pig/skin/images/chapter.gif +0 -0
  20. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  21. data/docpages/pig/skin/images/current.gif +0 -0
  22. data/docpages/pig/skin/images/external-link.gif +0 -0
  23. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  24. data/docpages/pig/skin/images/page.gif +0 -0
  25. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  26. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  27. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  28. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  29. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  30. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  31. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  32. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  33. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  34. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  35. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  36. data/docpages/pig/skin/print.css +54 -0
  37. data/docpages/pig/skin/profile.css +181 -0
  38. data/docpages/pig/skin/screen.css +587 -0
  39. data/docpages/pig/tutorial.html +1059 -0
  40. data/docpages/pig/udf.html +1509 -0
  41. data/examples/keystore/conditional_outputter_example.rb +70 -0
  42. data/examples/{graph → network_graph}/adjacency_list.rb +0 -0
  43. data/examples/{graph → network_graph}/breadth_first_search.rb +0 -0
  44. data/examples/{graph → network_graph}/gen_2paths.rb +0 -0
  45. data/examples/{graph → network_graph}/gen_multi_edge.rb +0 -0
  46. data/examples/{graph → network_graph}/gen_symmetric_links.rb +0 -0
  47. data/examples/pagerank/run_pagerank.sh +10 -8
  48. data/examples/{apache_log_parser.rb → server_logs/apache_log_parser.rb} +0 -0
  49. data/examples/stupidly_simple_filter.rb +43 -0
  50. data/lib/wukong/extensions/hash.rb +13 -0
  51. data/lib/wukong/extensions/hash_like.rb +7 -0
  52. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +122 -0
  53. data/lib/wukong/script.rb +27 -22
  54. data/lib/wukong/script/hadoop_command.rb +5 -3
  55. data/lib/wukong/streamer/accumulating_reducer.rb +2 -1
  56. data/wukong.gemspec +64 -26
  57. metadata +89 -31
  58. data/docpages/pig/PigLatinReferenceManual.html +0 -19134
  59. data/examples/foo.rb +0 -9
  60. data/examples/package-local.rb +0 -100
  61. data/examples/package.rb +0 -96
  62. data/examples/run_all.sh +0 -47
@@ -0,0 +1,1059 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
5
+ <meta content="Apache Forrest" name="Generator">
6
+ <meta name="Forrest-version" content="0.8">
7
+ <meta name="Forrest-skin-name" content="pelt">
8
+ <title>Pig Tutorial</title>
9
+ <link type="text/css" href="skin/basic.css" rel="stylesheet">
10
+ <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
11
+ <link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
12
+ <link type="text/css" href="skin/profile.css" rel="stylesheet">
13
+ <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
14
+ <link rel="shortcut icon" href="">
15
+ </head>
16
+ <body onload="init()">
17
+ <script type="text/javascript">ndeSetTextSize();</script>
18
+ <div id="top">
19
+ <!--+
20
+ |breadtrail
21
+ +-->
22
+ <div class="breadtrail">
23
+ <a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/pig/">Pig</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
24
+ </div>
25
+ <!--+
26
+ |header
27
+ +-->
28
+ <div class="header">
29
+ <!--+
30
+ |start group logo
31
+ +-->
32
+ <div class="grouplogo">
33
+ <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
34
+ </div>
35
+ <!--+
36
+ |end group logo
37
+ +-->
38
+ <!--+
39
+ |start Project Logo
40
+ +-->
41
+ <div class="projectlogo">
42
+ <a href="http://hadoop.apache.org/pig/"><img class="logoImage" alt="Pig" src="images/pig-logo.gif" title="A platform for analyzing large datasets."></a>
43
+ </div>
44
+ <!--+
45
+ |end Project Logo
46
+ +-->
47
+ <!--+
48
+ |start Search
49
+ +-->
50
+ <div class="searchbox">
51
+ <form action="http://www.google.com/search" method="get" class="roundtopsmall">
52
+ <input value="" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
53
+ <input name="Search" value="Search" type="submit">
54
+ </form>
55
+ </div>
56
+ <!--+
57
+ |end search
58
+ +-->
59
+ <!--+
60
+ |start Tabs
61
+ +-->
62
+ <ul id="tabs">
63
+ <li>
64
+ <a class="unselected" href="http://hadoop.apache.org/pig/">Project</a>
65
+ </li>
66
+ <li>
67
+ <a class="unselected" href="http://wiki.apache.org/pig/">Wiki</a>
68
+ </li>
69
+ <li class="current">
70
+ <a class="selected" href="index.html">Pig 0.7.0 Documentation</a>
71
+ </li>
72
+ </ul>
73
+ <!--+
74
+ |end Tabs
75
+ +-->
76
+ </div>
77
+ </div>
78
+ <div id="main">
79
+ <div id="publishedStrip">
80
+ <!--+
81
+ |start Subtabs
82
+ +-->
83
+ <div id="level2tabs"></div>
84
+ <!--+
85
+ |end Endtabs
86
+ +-->
87
+ <script type="text/javascript"><!--
88
+ document.write("Last Published: " + document.lastModified);
89
+ // --></script>
90
+ </div>
91
+ <!--+
92
+ |breadtrail
93
+ +-->
94
+ <div class="breadtrail">
95
+
96
+ &nbsp;
97
+ </div>
98
+ <!--+
99
+ |start Menu, mainarea
100
+ +-->
101
+ <!--+
102
+ |start Menu
103
+ +-->
104
+ <div id="menu">
105
+ <div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Pig</div>
106
+ <div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
107
+ <div class="menuitem">
108
+ <a href="index.html">Overview</a>
109
+ </div>
110
+ <div class="menuitem">
111
+ <a href="setup.html">Setup</a>
112
+ </div>
113
+ <div class="menupage">
114
+ <div class="menupagetitle">Tutorial</div>
115
+ </div>
116
+ <div class="menuitem">
117
+ <a href="piglatin_ref1.html">Pig Latin 1</a>
118
+ </div>
119
+ <div class="menuitem">
120
+ <a href="piglatin_ref2.html">Pig Latin 2</a>
121
+ </div>
122
+ <div class="menuitem">
123
+ <a href="cookbook.html">Cookbook</a>
124
+ </div>
125
+ <div class="menuitem">
126
+ <a href="udf.html">UDFs</a>
127
+ </div>
128
+ </div>
129
+ <div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Zebra</div>
130
+ <div id="menu_1.2" class="menuitemgroup">
131
+ <div class="menuitem">
132
+ <a href="zebra_overview.html">Zebra Overview </a>
133
+ </div>
134
+ <div class="menuitem">
135
+ <a href="zebra_users.html">Zebra Users </a>
136
+ </div>
137
+ <div class="menuitem">
138
+ <a href="zebra_reference.html">Zebra Reference </a>
139
+ </div>
140
+ <div class="menuitem">
141
+ <a href="zebra_mapreduce.html">Zebra MapReduce </a>
142
+ </div>
143
+ <div class="menuitem">
144
+ <a href="zebra_pig.html">Zebra Pig </a>
145
+ </div>
146
+ <div class="menuitem">
147
+ <a href="zebra_stream.html">Zebra Streaming </a>
148
+ </div>
149
+ </div>
150
+ <div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Miscellaneous</div>
151
+ <div id="menu_1.3" class="menuitemgroup">
152
+ <div class="menuitem">
153
+ <a href="api/">API Docs</a>
154
+ </div>
155
+ <div class="menuitem">
156
+ <a href="http://wiki.apache.org/pig/">Wiki</a>
157
+ </div>
158
+ <div class="menuitem">
159
+ <a href="http://wiki.apache.org/pig/FAQ">FAQ</a>
160
+ </div>
161
+ <div class="menuitem">
162
+ <a href="http://hadoop.apache.org/pig/releases.html">Release Notes</a>
163
+ </div>
164
+ </div>
165
+ <div id="credit"></div>
166
+ <div id="roundbottom">
167
+ <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
168
+ <!--+
169
+ |alternative credits
170
+ +-->
171
+ <div id="credit2"></div>
172
+ </div>
173
+ <!--+
174
+ |end Menu
175
+ +-->
176
+ <!--+
177
+ |start content
178
+ +-->
179
+ <div id="content">
180
+ <div title="Portable Document Format" class="pdflink">
181
+ <a class="dida" href="tutorial.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
182
+ PDF</a>
183
+ </div>
184
+ <h1>Pig Tutorial</h1>
185
+ <div id="minitoc-area">
186
+ <ul class="minitoc">
187
+ <li>
188
+ <a href="#Overview">Overview</a>
189
+ </li>
190
+ <li>
191
+ <a href="#Java+Installation"> Java Installation</a>
192
+ </li>
193
+ <li>
194
+ <a href="#Pig+Installation"> Pig Installation</a>
195
+ </li>
196
+ <li>
197
+ <a href="#Running+the+Pig+Scripts+in+Local+Mode"> Running the Pig Scripts in Local Mode</a>
198
+ </li>
199
+ <li>
200
+ <a href="#Running+the+Pig+Scripts+in+Mapreduce+Mode"> Running the Pig Scripts in Mapreduce Mode</a>
201
+ </li>
202
+ <li>
203
+ <a href="#Pig+Tutorial+File"> Pig Tutorial File</a>
204
+ </li>
205
+ <li>
206
+ <a href="#Pig+Script+1%3A+Query+Phrase+Popularity"> Pig Script 1: Query Phrase Popularity</a>
207
+ </li>
208
+ <li>
209
+ <a href="#Pig+Script+2%3A+Temporal+Query+Phrase+Popularity">Pig Script 2: Temporal Query Phrase Popularity</a>
210
+ </li>
211
+ </ul>
212
+ </div>
213
+
214
+
215
+ <a name="N1000D"></a><a name="Overview"></a>
216
+ <h2 class="h3">Overview</h2>
217
+ <div class="section">
218
+ <p>The Pig tutorial shows you how to run two Pig scripts in local mode and mapreduce mode. </p>
219
+ <ul>
220
+
221
+ <li>
222
+ <p>
223
+ <strong>Local Mode</strong>: To run the scripts in local mode, no Hadoop or HDFS installation is required. All files are installed and run from your local host and file system. </p>
224
+
225
+ </li>
226
+
227
+ <li>
228
+ <p>
229
+ <strong>Mapreduce Mode</strong>: To run the scripts in mapreduce mode, you need access to a Hadoop cluster and HDFS installation. </p>
230
+
231
+ </li>
232
+
233
+ </ul>
234
+ <p>The Pig tutorial file (tutorial/pigtutorial.tar.gz file in the pig distribution) includes the Pig JAR file (pig.jar) and the tutorial files (tutorial.jar, Pig scripts, log files).
235
+ These files work with Hadoop 0.20.2 and include everything you need to run the Pig scripts.</p>
236
+ <p>To get started, follow these basic steps: </p>
237
+ <ol>
238
+
239
+ <li>
240
+ <p>Install Java </p>
241
+
242
+ </li>
243
+
244
+ <li>
245
+ <p>Install Pig </p>
246
+
247
+ </li>
248
+
249
+ <li>
250
+ <p>Run the Pig scripts - in Local or Hadoop mode </p>
251
+
252
+ </li>
253
+
254
+ </ol>
255
+ </div>
256
+
257
+
258
+ <a name="N10042"></a><a name="Java+Installation"></a>
259
+ <h2 class="h3"> Java Installation</h2>
260
+ <div class="section">
261
+ <p>Make sure your run-time environment includes the following: </p>
262
+ <ul>
263
+
264
+ <li>
265
+ <p>Java 1.6 or higher (preferably from Sun) </p>
266
+
267
+ </li>
268
+
269
+ <li>
270
+ <p>The JAVA_HOME environment variable is set to the root of your Java installation. </p>
271
+
272
+ </li>
273
+
274
+ </ul>
275
+ </div>
276
+
277
+
278
+ <a name="N10059"></a><a name="Pig+Installation"></a>
279
+ <h2 class="h3"> Pig Installation</h2>
280
+ <div class="section">
281
+ <p>To install Pig, do the following: </p>
282
+ <ol>
283
+
284
+ <li>
285
+ <p>Download the Pig tutorial file to your local directory. </p>
286
+
287
+ </li>
288
+
289
+ <li>
290
+ <p>Unzip the Pig tutorial file (the files are stored in a newly created directory, pigtmp). </p>
291
+
292
+ <pre class="code">
293
+ $ tar -xzf pigtutorial.tar.gz
294
+ </pre>
295
+
296
+ </li>
297
+
298
+ <li>
299
+ <p>Move to the pigtmp directory. </p>
300
+
301
+ </li>
302
+
303
+ <li>
304
+ <p>Review the contents of the Pig tutorial file. </p>
305
+
306
+ </li>
307
+
308
+ <li>
309
+ <p>Copy the <strong>pig.jar</strong> file to the appropriate directory on your system. For example: /home/me/pig. </p>
310
+
311
+ </li>
312
+
313
+ <li>
314
+ <p>Create an environment variable, <strong>PIGDIR</strong>, and point it to your directory; for example, export PIGDIR=/home/me/pig (bash, sh) or setenv PIGDIR /home/me/pig (tcsh, csh). </p>
315
+
316
+ </li>
317
+
318
+ </ol>
319
+ </div>
320
+
321
+
322
+ <a name="N1008E"></a><a name="Running+the+Pig+Scripts+in+Local+Mode"></a>
323
+ <h2 class="h3"> Running the Pig Scripts in Local Mode</h2>
324
+ <div class="section">
325
+ <p>To run the Pig scripts in local mode, do the following: </p>
326
+ <ol>
327
+
328
+ <li>
329
+
330
+ <p>Set the maximum memory for Java.</p>
331
+
332
+ <pre class="code">
333
+ java -Xmx256m -cp pig.jar org.apache.pig.Main -x local script1-local.pig
334
+ java -Xmx256m -cp pig.jar org.apache.pig.Main -x local script2-local.pig
335
+ </pre>
336
+
337
+ </li>
338
+
339
+ <li>
340
+ <p>Move to the pigtmp directory. </p>
341
+ </li>
342
+
343
+ <li>
344
+ <p>Review Pig Script 1 and Pig Script 2. </p>
345
+ </li>
346
+
347
+ <li>
348
+
349
+ <p>Execute the following command (using either script1-local.pig or script2-local.pig). </p>
350
+
351
+ <pre class="code">
352
+ $ java -cp $PIGDIR/pig.jar org.apache.pig.Main -x local script1-local.pig
353
+ </pre>
354
+
355
+ </li>
356
+
357
+ <li>
358
+ <p>Review the result files, located in the part-r-00000 directory.</p>
359
+
360
+ <p>The output may contain a few Hadoop warnings which can be ignored:</p>
361
+
362
+ <pre class="code">
363
+ 2010-04-08 12:55:33,642 [main] INFO org.apache.hadoop.metrics.jvm.JvmMetrics
364
+ - Cannot initialize JVM Metrics with processName=JobTracker, sessionId= - already initialized
365
+ </pre>
366
+
367
+ </li>
368
+
369
+ </ol>
370
+ </div>
371
+
372
+
373
+ <a name="N100C3"></a><a name="Running+the+Pig+Scripts+in+Mapreduce+Mode"></a>
374
+ <h2 class="h3"> Running the Pig Scripts in Mapreduce Mode</h2>
375
+ <div class="section">
376
+ <p>To run the Pig scripts in mapreduce mode, do the following: </p>
377
+ <ol>
378
+
379
+ <li>
380
+ <p>Move to the pigtmp directory. </p>
381
+
382
+ </li>
383
+
384
+ <li>
385
+ <p>Review Pig Script 1 and Pig Script 2. </p>
386
+
387
+ </li>
388
+
389
+ <li>
390
+ <p>Copy the excite.log.bz2 file from the pigtmp directory to the HDFS directory. </p>
391
+
392
+ <pre class="code">
393
+ $ hadoop fs -copyFromLocal excite.log.bz2 .
394
+ </pre>
395
+
396
+ </li>
397
+
398
+
399
+ <li>
400
+ <p>Set the HADOOP_CONF_DIR environment variable to the location of your core-site.xml, hdfs-site.xml and mapred-site.xml files. </p>
401
+
402
+ </li>
403
+
404
+ <li>
405
+ <p>Execute the following command (using either script1-hadoop.pig or script2-hadoop.pig): </p>
406
+
407
+ <pre class="code">
408
+ $ java -cp $PIGDIR/pig.jar:$HADOOP_CONF_DIR org.apache.pig.Main script1-hadoop.pig
409
+ </pre>
410
+
411
+ </li>
412
+
413
+
414
+ <li>
415
+ <p>Review the result files, located in the script1-hadoop-results or script2-hadoop-results HDFS directory: </p>
416
+
417
+ <pre class="code">
418
+ $ hadoop fs -ls script1-hadoop-results
419
+ $ hadoop fs -cat 'script1-hadoop-results/*' | less
420
+ </pre>
421
+
422
+ </li>
423
+
424
+ </ol>
425
+ </div>
426
+
427
+
428
+ <a name="N100FA"></a><a name="Pig+Tutorial+File"></a>
429
+ <h2 class="h3"> Pig Tutorial File</h2>
430
+ <div class="section">
431
+ <p>The contents of the Pig tutorial file (pigtutorial.tar.gz) are described here. </p>
432
+ <table class="ForrestTable" cellspacing="1" cellpadding="4">
433
+
434
+ <tr>
435
+
436
+ <td colspan="1" rowspan="1">
437
+
438
+ <p>
439
+ <strong>File</strong>
440
+ </p>
441
+
442
+ </td>
443
+ <td colspan="1" rowspan="1">
444
+
445
+ <p>
446
+ <strong>Description</strong>
447
+ </p>
448
+
449
+ </td>
450
+
451
+ </tr>
452
+
453
+ <tr>
454
+
455
+ <td colspan="1" rowspan="1">
456
+
457
+ <p> pig.jar </p>
458
+
459
+ </td>
460
+ <td colspan="1" rowspan="1">
461
+
462
+ <p> Pig JAR file </p>
463
+
464
+ </td>
465
+
466
+ </tr>
467
+
468
+ <tr>
469
+
470
+ <td colspan="1" rowspan="1">
471
+
472
+ <p> tutorial.jar </p>
473
+
474
+ </td>
475
+ <td colspan="1" rowspan="1">
476
+
477
+ <p> User-defined functions (UDFs) and Java classes </p>
478
+
479
+ </td>
480
+
481
+ </tr>
482
+
483
+ <tr>
484
+
485
+ <td colspan="1" rowspan="1">
486
+
487
+ <p> script1-local.pig </p>
488
+
489
+ </td>
490
+ <td colspan="1" rowspan="1">
491
+
492
+ <p> Pig Script 1, Query Phrase Popularity (local mode) </p>
493
+
494
+ </td>
495
+
496
+ </tr>
497
+
498
+ <tr>
499
+
500
+ <td colspan="1" rowspan="1">
501
+
502
+ <p> script1-hadoop.pig </p>
503
+
504
+ </td>
505
+ <td colspan="1" rowspan="1">
506
+
507
+ <p> Pig Script 1, Query Phrase Popularity (Hadoop cluster) </p>
508
+
509
+ </td>
510
+
511
+ </tr>
512
+
513
+ <tr>
514
+
515
+ <td colspan="1" rowspan="1">
516
+
517
+ <p> script2-local.pig </p>
518
+
519
+ </td>
520
+ <td colspan="1" rowspan="1">
521
+
522
+ <p> Pig Script 2, Temporal Query Phrase Popularity (local mode)</p>
523
+
524
+ </td>
525
+
526
+ </tr>
527
+
528
+ <tr>
529
+
530
+ <td colspan="1" rowspan="1">
531
+
532
+ <p> script2-hadoop.pig </p>
533
+
534
+ </td>
535
+ <td colspan="1" rowspan="1">
536
+
537
+ <p> Pig Script 2, Temporal Query Phrase Popularity (Hadoop cluster) </p>
538
+
539
+ </td>
540
+
541
+ </tr>
542
+
543
+ <tr>
544
+
545
+ <td colspan="1" rowspan="1">
546
+
547
+ <p> excite-small.log </p>
548
+
549
+ </td>
550
+ <td colspan="1" rowspan="1">
551
+
552
+ <p> Log file, Excite search engine (local mode) </p>
553
+
554
+ </td>
555
+
556
+ </tr>
557
+
558
+ <tr>
559
+
560
+ <td colspan="1" rowspan="1">
561
+
562
+ <p> excite.log.bz2 </p>
563
+
564
+ </td>
565
+ <td colspan="1" rowspan="1">
566
+
567
+ <p> Log file, Excite search engine (Hadoop cluster) </p>
568
+
569
+ </td>
570
+
571
+ </tr>
572
+
573
+ </table>
574
+ <p>The user-defined functions (UDFs) are described here. </p>
575
+ <table class="ForrestTable" cellspacing="1" cellpadding="4">
576
+
577
+ <tr>
578
+
579
+ <td colspan="1" rowspan="1">
580
+
581
+ <p>
582
+ <strong>UDF</strong>
583
+ </p>
584
+
585
+ </td>
586
+ <td colspan="1" rowspan="1">
587
+
588
+ <p>
589
+ <strong>Description</strong>
590
+ </p>
591
+
592
+ </td>
593
+
594
+ </tr>
595
+
596
+ <tr>
597
+
598
+ <td colspan="1" rowspan="1">
599
+
600
+ <p> ExtractHour </p>
601
+
602
+ </td>
603
+ <td colspan="1" rowspan="1">
604
+
605
+ <p> Extracts the hour from the record.</p>
606
+
607
+ </td>
608
+
609
+ </tr>
610
+
611
+ <tr>
612
+
613
+ <td colspan="1" rowspan="1">
614
+
615
+ <p> NGramGenerator </p>
616
+
617
+ </td>
618
+ <td colspan="1" rowspan="1">
619
+
620
+ <p> Composes n-grams from the set of words. </p>
621
+
622
+ </td>
623
+
624
+ </tr>
625
+
626
+ <tr>
627
+
628
+ <td colspan="1" rowspan="1">
629
+
630
+ <p> NonURLDetector </p>
631
+
632
+ </td>
633
+ <td colspan="1" rowspan="1">
634
+
635
+ <p> Removes the record if the query field is empty or a URL. </p>
636
+
637
+ </td>
638
+
639
+ </tr>
640
+
641
+ <tr>
642
+
643
+ <td colspan="1" rowspan="1">
644
+
645
+ <p> ScoreGenerator </p>
646
+
647
+ </td>
648
+ <td colspan="1" rowspan="1">
649
+
650
+ <p> Calculates a "popularity" score for the n-gram.</p>
651
+
652
+ </td>
653
+
654
+ </tr>
655
+
656
+ <tr>
657
+
658
+ <td colspan="1" rowspan="1">
659
+
660
+ <p> ToLower </p>
661
+
662
+ </td>
663
+ <td colspan="1" rowspan="1">
664
+
665
+ <p> Changes the query field to lowercase. </p>
666
+
667
+ </td>
668
+
669
+ </tr>
670
+
671
+ <tr>
672
+
673
+ <td colspan="1" rowspan="1">
674
+
675
+ <p> TutorialUtil </p>
676
+
677
+ </td>
678
+ <td colspan="1" rowspan="1">
679
+
680
+ <p> Divides the query string into a set of words.</p>
681
+
682
+ </td>
683
+
684
+ </tr>
685
+
686
+ </table>
687
+ </div>
688
+
689
+
690
+ <a name="N10247"></a><a name="Pig+Script+1%3A+Query+Phrase+Popularity"></a>
691
+ <h2 class="h3"> Pig Script 1: Query Phrase Popularity</h2>
692
+ <div class="section">
693
+ <p>The Query Phrase Popularity script (script1-local.pig or script1-hadoop.pig) processes a search query log file from the Excite search engine and finds search phrases that occur with particularly high frequency during certain times of the day. </p>
694
+ <p>The script is shown here: </p>
695
+ <ul>
696
+
697
+ <li>
698
+ <p> Register the tutorial JAR file so that the included UDFs can be called in the script. </p>
699
+
700
+ </li>
701
+
702
+ </ul>
703
+ <pre class="code">
704
+ REGISTER ./tutorial.jar;
705
+ </pre>
706
+ <ul>
707
+
708
+ <li>
709
+ <p> Use the PigStorage function to load the excite log file (excite.log or excite-small.log) into the &ldquo;raw&rdquo; bag as an array of records with the fields <strong>user</strong>, <strong>time</strong>, and <strong>query</strong>. </p>
710
+
711
+ </li>
712
+
713
+ </ul>
714
+ <pre class="code">
715
+ raw = LOAD 'excite.log' USING PigStorage('\t') AS (user, time, query);
716
+ </pre>
717
+ <ul>
718
+
719
+ <li>
720
+ <p> Call the NonURLDetector UDF to remove records if the query field is empty or a URL. </p>
721
+
722
+ </li>
723
+
724
+ </ul>
725
+ <pre class="code">
726
+ clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
727
+ </pre>
728
+ <ul>
729
+
730
+ <li>
731
+ <p> Call the ToLower UDF to change the query field to lowercase. </p>
732
+
733
+ </li>
734
+
735
+ </ul>
736
+ <pre class="code">
737
+ clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
738
+ </pre>
739
+ <ul>
740
+
741
+ <li>
742
+ <p> Because the log file only contains queries for a single day, we are only interested in the hour. The excite query log timestamp format is YYMMDDHHMMSS. Call the ExtractHour UDF to extract the hour (HH) from the time field. </p>
743
+
744
+ </li>
745
+
746
+ </ul>
747
+ <pre class="code">
748
+ houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
749
+ </pre>
750
+ <ul>
751
+
752
+ <li>
753
+ <p> Call the NGramGenerator UDF to compose the n-grams of the query. </p>
754
+
755
+ </li>
756
+
757
+ </ul>
758
+ <pre class="code">
759
+ ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
760
+ </pre>
761
+ <ul>
762
+
763
+ <li>
764
+ <p> Use the DISTINCT operator to get the unique n-grams for all records. </p>
765
+
766
+ </li>
767
+
768
+ </ul>
769
+ <pre class="code">
770
+ ngramed2 = DISTINCT ngramed1;
771
+ </pre>
772
+ <ul>
773
+
774
+ <li>
775
+ <p> Use the GROUP operator to group records by n-gram and hour. </p>
776
+
777
+ </li>
778
+
779
+ </ul>
780
+ <pre class="code">
781
+ hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
782
+ </pre>
783
+ <ul>
784
+
785
+ <li>
786
+ <p> Use the COUNT function to get the count (occurrences) of each n-gram. </p>
787
+
788
+ </li>
789
+
790
+ </ul>
791
+ <pre class="code">
792
+ hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
793
+ </pre>
794
+ <ul>
795
+
796
+ <li>
797
+ <p> Use the GROUP operator to group records by n-gram only. Each group now corresponds to a distinct n-gram and has the count for each hour. </p>
798
+
799
+ </li>
800
+
801
+ </ul>
802
+ <pre class="code">
803
+ uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
804
+ </pre>
805
+ <ul>
806
+
807
+ <li>
808
+ <p> For each group, identify the hour in which this n-gram is used with a particularly high frequency. Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram. </p>
809
+
810
+ </li>
811
+
812
+ </ul>
813
+ <pre class="code">
814
+ uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1));
815
+ </pre>
816
+ <ul>
817
+
818
+ <li>
819
+ <p> Use the FOREACH-GENERATE operator to assign names to the fields. </p>
820
+
821
+ </li>
822
+
823
+ </ul>
824
+ <pre class="code">
825
+ uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean;
826
+ </pre>
827
+ <ul>
828
+
829
+ <li>
830
+ <p> Use the FILTER operator to remove all records with a score less than or equal to 2.0. </p>
831
+
832
+ </li>
833
+
834
+ </ul>
835
+ <pre class="code">
836
+ filtered_uniq_frequency = FILTER uniq_frequency3 BY score &gt; 2.0;
837
+ </pre>
838
+ <ul>
839
+
840
+ <li>
841
+ <p> Use the ORDER operator to sort the remaining records by hour and score. </p>
842
+
843
+ </li>
844
+
845
+ </ul>
846
+ <pre class="code">
847
+ ordered_uniq_frequency = ORDER filtered_uniq_frequency BY (hour, score);
848
+ </pre>
849
+ <ul>
850
+
851
+ <li>
852
+ <p> Use the PigStorage function to store the results. The output file contains a list of n-grams with the following fields: <strong>hour</strong>, <strong>ngram</strong>, <strong>score</strong>, <strong>count</strong>, <strong>mean</strong>. </p>
853
+
854
+ </li>
855
+
856
+ </ul>
857
+ <pre class="code">
858
+ STORE ordered_uniq_frequency INTO '/tmp/tutorial-results' USING PigStorage();
859
+ </pre>
860
+ </div>
861
+
862
+
863
+ <a name="N10320"></a><a name="Pig+Script+2%3A+Temporal+Query+Phrase+Popularity"></a>
864
+ <h2 class="h3">Pig Script 2: Temporal Query Phrase Popularity</h2>
865
+ <div class="section">
866
+ <p>The Temporal Query Phrase Popularity script (script2-local.pig or script2-hadoop.pig) processes a search query log file from the Excite search engine and compares the frequency of occurrence of search phrases across two time periods separated by twelve hours. </p>
867
+ <p>The script is shown here: </p>
868
+ <ul>
869
+
870
+ <li>
871
+ <p> Register the tutorial JAR file so that the user-defined functions (UDFs) can be called in the script. </p>
872
+
873
+ </li>
874
+
875
+ </ul>
876
+ <pre class="code">
877
+ REGISTER ./tutorial.jar;
878
+ </pre>
879
+ <ul>
880
+
881
+ <li>
882
+ <p> Use the PigStorage function to load the excite log file (excite.log or excite-small.log) into the &ldquo;raw&rdquo; bag as an array of records with the fields <strong>user</strong>, <strong>time</strong>, and <strong>query</strong>. </p>
883
+
884
+ </li>
885
+
886
+ </ul>
887
+ <pre class="code">
888
+ raw = LOAD 'excite.log' USING PigStorage('\t') AS (user, time, query);
889
+ </pre>
890
+ <ul>
891
+
892
+ <li>
893
+ <p> Call the NonURLDetector UDF to remove records if the query field is empty or a URL. </p>
894
+
895
+ </li>
896
+
897
+ </ul>
898
+ <pre class="code">
899
+ clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
900
+ </pre>
901
+ <ul>
902
+
903
+ <li>
904
+ <p> Call the ToLower UDF to change the query field to lowercase. </p>
905
+
906
+ </li>
907
+
908
+ </ul>
909
+ <pre class="code">
910
+ clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
911
+ </pre>
912
+ <ul>
913
+
914
+ <li>
915
+ <p> Because the log file only contains queries for a single day, we are only interested in the hour. The excite query log timestamp format is YYMMDDHHMMSS. Call the ExtractHour UDF to extract the hour from the time field. </p>
916
+
917
+ </li>
918
+
919
+ </ul>
920
+ <pre class="code">
921
+ houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
922
+ </pre>
923
+ <ul>
924
+
925
+ <li>
926
+ <p> Call the NGramGenerator UDF to compose the n-grams of the query. </p>
927
+
928
+ </li>
929
+
930
+ </ul>
931
+ <pre class="code">
932
+ ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
933
+ </pre>
934
+ <ul>
935
+
936
+ <li>
937
+ <p> Use the DISTINCT operator to get the unique n-grams for all records. </p>
938
+
939
+ </li>
940
+
941
+ </ul>
942
+ <pre class="code">
943
+ ngramed2 = DISTINCT ngramed1;
944
+ </pre>
945
+ <ul>
946
+
947
+ <li>
948
+ <p> Use the GROUP operator to group the records by n-gram and hour. </p>
949
+
950
+ </li>
951
+
952
+ </ul>
953
+ <pre class="code">
954
+ hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
955
+ </pre>
956
+ <ul>
957
+
958
+ <li>
959
+ <p> Use the COUNT function to get the count (occurrences) of each n-gram. </p>
960
+
961
+ </li>
962
+
963
+ </ul>
964
+ <pre class="code">
965
+ hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
966
+ </pre>
967
+ <ul>
968
+
969
+ <li>
970
+ <p> Use the FOREACH-GENERATE operator to assign names to the fields. </p>
971
+
972
+ </li>
973
+
974
+ </ul>
975
+ <pre class="code">
976
+ hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 as count;
977
+ </pre>
978
+ <ul>
979
+
980
+ <li>
981
+ <p> Use the FILTER operator to get the n-grams for hour &lsquo;00&rsquo; </p>
982
+
983
+ </li>
984
+
985
+ </ul>
986
+ <pre class="code">
987
+ hour00 = FILTER hour_frequency2 BY hour eq '00';
988
+ </pre>
989
+ <ul>
990
+
991
+ <li>
992
+ <p> Use the FILTER operator to get the n-grams for hour &lsquo;12&rsquo; </p>
993
+
994
+ </li>
995
+
996
+ </ul>
997
+ <pre class="code">
998
+ hour12 = FILTER hour_frequency3 BY hour eq '12';
999
+ </pre>
1000
+ <ul>
1001
+
1002
+ <li>
1003
+ <p> Use the JOIN operator to get the n-grams that appear in both hours. </p>
1004
+
1005
+ </li>
1006
+
1007
+ </ul>
1008
+ <pre class="code">
1009
+ same = JOIN hour00 BY $0, hour12 BY $0;
1010
+ </pre>
1011
+ <ul>
1012
+
1013
+ <li>
1014
+ <p> Use the FOREACH-GENERATE operator to record their frequency. </p>
1015
+
1016
+ </li>
1017
+
1018
+ </ul>
1019
+ <pre class="code">
1020
+ same1 = FOREACH same GENERATE hour_frequency2::hour00::group::ngram as ngram, $2 as count00, $5 as count12;
1021
+ </pre>
1022
+ <ul>
1023
+
1024
+ <li>
1025
+ <p> Use the PigStorage function to store the results. The output file contains a list of n-grams with the following fields: <strong>ngram</strong>, <strong>count00</strong>, <strong>count12</strong>. </p>
1026
+
1027
+ </li>
1028
+
1029
+ </ul>
1030
+ <pre class="code">
1031
+ STORE same1 INTO '/tmp/tutorial-join-results' USING PigStorage();
1032
+ </pre>
1033
+ </div>
1034
+
1035
+ </div>
1036
+ <!--+
1037
+ |end content
1038
+ +-->
1039
+ <div class="clearboth">&nbsp;</div>
1040
+ </div>
1041
+ <div id="footer">
1042
+ <!--+
1043
+ |start bottomstrip
1044
+ +-->
1045
+ <div class="lastmodified">
1046
+ <script type="text/javascript"><!--
1047
+ document.write("Last Published: " + document.lastModified);
1048
+ // --></script>
1049
+ </div>
1050
+ <div class="copyright">
1051
+ Copyright &copy;
1052
+ 2007-2010 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
1053
+ </div>
1054
+ <!--+
1055
+ |end bottomstrip
1056
+ +-->
1057
+ </div>
1058
+ </body>
1059
+ </html>