wukong 0.1.4 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL.textile +89 -0
- data/README.textile +41 -74
- data/docpages/INSTALL.textile +94 -0
- data/{doc → docpages}/LICENSE.textile +0 -0
- data/{doc → docpages}/README-wulign.textile +6 -0
- data/docpages/UsingWukong-part1-get_ready.textile +17 -0
- data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
- data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
- data/docpages/_config.yml +39 -0
- data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
- data/{doc → docpages}/code/api_response_example.txt +0 -0
- data/{doc → docpages}/code/parser_skeleton.rb +0 -0
- data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +16 -0
- data/docpages/hadoop-tips.textile +83 -0
- data/docpages/index.textile +90 -0
- data/docpages/intro.textile +8 -0
- data/docpages/moreinfo.textile +174 -0
- data/docpages/news.html +24 -0
- data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
- data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
- data/docpages/tutorial.textile +283 -0
- data/docpages/usage.textile +195 -0
- data/docpages/wutils.textile +263 -0
- data/wukong.gemspec +80 -50
- metadata +87 -54
- data/doc/INSTALL.textile +0 -41
- data/doc/README-tutorial.textile +0 -163
- data/doc/README-wutils.textile +0 -128
- data/doc/TODO.textile +0 -61
- data/doc/UsingWukong-part1-setup.textile +0 -2
- data/doc/UsingWukong-part2-scraping.textile +0 -2
- data/doc/hadoop-nfs.textile +0 -51
- data/doc/hadoop-setup.textile +0 -29
- data/doc/index.textile +0 -124
- data/doc/links.textile +0 -42
- data/doc/usage.textile +0 -102
- data/doc/utils.textile +0 -48
- data/examples/and_pig/sample_queries.rb +0 -128
- data/lib/wukong/and_pig.rb +0 -62
- data/lib/wukong/and_pig/README.textile +0 -12
- data/lib/wukong/and_pig/as.rb +0 -37
- data/lib/wukong/and_pig/data_types.rb +0 -30
- data/lib/wukong/and_pig/functions.rb +0 -50
- data/lib/wukong/and_pig/generate.rb +0 -85
- data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
- data/lib/wukong/and_pig/junk.rb +0 -51
- data/lib/wukong/and_pig/operators.rb +0 -8
- data/lib/wukong/and_pig/operators/compound.rb +0 -29
- data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
- data/lib/wukong/and_pig/operators/execution.rb +0 -15
- data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
- data/lib/wukong/and_pig/operators/foreach.rb +0 -98
- data/lib/wukong/and_pig/operators/groupies.rb +0 -212
- data/lib/wukong/and_pig/operators/load_store.rb +0 -65
- data/lib/wukong/and_pig/operators/meta.rb +0 -42
- data/lib/wukong/and_pig/operators/relational.rb +0 -129
- data/lib/wukong/and_pig/pig_struct.rb +0 -48
- data/lib/wukong/and_pig/pig_var.rb +0 -95
- data/lib/wukong/and_pig/symbol.rb +0 -29
- data/lib/wukong/and_pig/utils.rb +0 -0
@@ -0,0 +1,263 @@
|
|
1
|
+
---
|
2
|
+
layout: default
|
3
|
+
title: mrflip.github.com/wukong - wu-utils utilities
|
4
|
+
collapse: false
|
5
|
+
---
|
6
|
+
|
7
|
+
h1(gemheader). Wukong Utility Scripts
|
8
|
+
|
9
|
+
** "Overview of wutils":#wutils -- command listing
|
10
|
+
** "Stupid command-line tricks":#cmdlinetricks using the wutils
|
11
|
+
** "wu-lign":#wulign -- present a tab-separated file as aligned columns
|
12
|
+
** Dear Lazyweb, please build this for us: "tab-oriented version of the Textutils library":#wutilsinc
|
13
|
+
|
14
|
+
<notextile><div class="toggle"></notextile>
|
15
|
+
|
16
|
+
h2(#cmdlinetricks). Stupid command-line tricks
|
17
|
+
|
18
|
+
Here are a few useful little snippets you can run from the command line:
|
19
|
+
|
20
|
+
h3. Histogram
|
21
|
+
|
22
|
+
Given data with a date column:
|
23
|
+
|
24
|
+
<pre>
|
25
|
+
message 235623 20090423012345 Now is the winter of our discontent Made glorious summer by this son of York
|
26
|
+
message 235623 20080101230900 These pretzels are making me THIRSTY!
|
27
|
+
...
|
28
|
+
</pre>
|
29
|
+
|
30
|
+
You can calculate the number of messages sent per day with
|
31
|
+
|
32
|
+
<pre>
|
33
|
+
cat messages | cuttab 3 | cutc 8 | sort | uniq -c
|
34
|
+
</pre>
|
35
|
+
|
36
|
+
(see the wuhist command, below.)
|
37
|
+
|
38
|
+
h3. Simple intersection, union, etc
|
39
|
+
|
40
|
+
For two datasets (batch_1 and batch_2) with unique entries (no repeated lines),
|
41
|
+
|
42
|
+
* Their union is simple:
|
43
|
+
|
44
|
+
<pre>
|
45
|
+
cat batch_1 batch_2 | sort -u
|
46
|
+
</pre>
|
47
|
+
|
48
|
+
* To find their intersection, concatenate the two sets and filter out everything that occurred only once.
|
49
|
+
|
50
|
+
<pre>
|
51
|
+
cat batch_1 batch_2 | sort | uniq -c | egrep -v '^ *1 '
|
52
|
+
</pre>
|
53
|
+
|
54
|
+
* For the complement of the intersection, use @... | egrep '^ *1 '@
|
55
|
+
|
56
|
+
* In both cases, if the files are each internally sorted, the commandline sort takes a --merge flag:
|
57
|
+
|
58
|
+
<pre>
|
59
|
+
sort --merge -u batch_1 batch_2
|
60
|
+
</pre>
|
61
|
+
|
62
|
+
<notextile></div><div class="toggle"></notextile>
|
63
|
+
|
64
|
+
h2(#wutils). Wutils Command Listing
|
65
|
+
|
66
|
+
h3. cutc
|
67
|
+
|
68
|
+
@cutc [colnum]@
|
69
|
+
|
70
|
+
Ex.
|
71
|
+
|
72
|
+
@echo -e 'foo\tbar\tbaz' | cutc 6@
|
73
|
+
@foo ba@
|
74
|
+
|
75
|
+
Cuts from beginning of line to given column (default 200). A tab is one character, so right margin can still be ragged.
|
76
|
+
|
77
|
+
h3. cuttab
|
78
|
+
|
79
|
+
@cuttab [colspec]@
|
80
|
+
|
81
|
+
Cuts given tab-separated columns. You can give a comma separated list of numbers
|
82
|
+
or ranges (e.g. 1-4). Columns are numbered from 1.
|
83
|
+
|
84
|
+
Ex.
|
85
|
+
|
86
|
+
<pre>
|
87
|
+
echo -e 'foo\tbar\tbaz' | cuttab 1,3
|
88
|
+
foo baz
|
89
|
+
</pre>
|
90
|
+
|
91
|
+
h3. hdp-*
|
92
|
+
|
93
|
+
These perform the corresponding commands on the HDFS filesystem. In general,
|
94
|
+
where they accept command-line flags, they go with the GNU-style ones, not the
|
95
|
+
hadoop-style: so, @hdp-du -s dir@ or @hdp-rm -r foo/@
|
96
|
+
|
97
|
+
* @hdp-cat@
|
98
|
+
* @hdp-catd@ -- cats the files that don't start with '_' in a directory. Use this for a pile of @.../part-00000@ files
|
99
|
+
* @hdp-du@
|
100
|
+
* @hdp-get@
|
101
|
+
* @hdp-kill@
|
102
|
+
* @hdp-ls@
|
103
|
+
* @hdp-mkdir@
|
104
|
+
* @hdp-mv@
|
105
|
+
* @hdp-ps@
|
106
|
+
* @hdp-put@
|
107
|
+
* @hdp-rm@
|
108
|
+
* @hdp-sync@
|
109
|
+
|
110
|
+
h3. hdp-sort, hdp-stream, hdp-stream-flat
|
111
|
+
|
112
|
+
* @hdp-sort@
|
113
|
+
* @hdp-stream@
|
114
|
+
* @hdp-stream-flat@
|
115
|
+
|
116
|
+
<code><pre>
|
117
|
+
hdp-stream input_filespec output_file map_cmd reduce_cmd num_key_fields
|
118
|
+
</pre></code>
|
119
|
+
|
120
|
+
h3. tabchar
|
121
|
+
|
122
|
+
Outputs a single tab character.
|
123
|
+
|
124
|
+
h3. wuhist
|
125
|
+
|
126
|
+
Occasionally useful to gather a lexical histogram of a single column:
|
127
|
+
|
128
|
+
Ex.
|
129
|
+
|
130
|
+
<code><pre>
|
131
|
+
$ echo -e 'foo\nbar\nbar\nfoo\nfoo\nfoo\n7' | ./wuhist
|
132
|
+
4 foo
|
133
|
+
2 bar
|
134
|
+
1 7
|
135
|
+
</pre></code>
|
136
|
+
|
137
|
+
(the output will have a tab between the first and second column, for further processing.)
|
138
|
+
|
139
|
+
h3. wulign
|
140
|
+
|
141
|
+
Intelligently format a tab-separated file into aligned columns (while remaining tab-separated for further processing). See "below":#wulign.
|
142
|
+
|
143
|
+
h3. hdp-parts_to_keys.rb
|
144
|
+
|
145
|
+
A *very* clumsy script to rename reduced hadoop output files by their initial key.
|
146
|
+
|
147
|
+
If your output file has an initial key in the first column and you pass it through hdp-sort, its records will be distributed across reducers and thus across output files. (Because of the way hadoop hashes the keys, there's no guarantee that each file will get a distinct key. You could have 2 keys with a million entries and they could land sequentially on the same reducer, always fun.)
|
148
|
+
|
149
|
+
If you're willing to roll the dice, this script will rename files according to the first key in the first line.
|
150
|
+
|
151
|
+
**Do you have or know of a native hadoop utility to do this?** If so, please get in touch!
|
152
|
+
|
153
|
+
<notextile></div><div class="toggle"></notextile>
|
154
|
+
|
155
|
+
h2(#wulign). wu-lign -- format a tab-separated file as aligned columns
|
156
|
+
|
157
|
+
wu-lign will intelligently reformat a tab-separated file into a tab-separated, space aligned file that is still suitable for further processing. For example, given the log-file input
|
158
|
+
|
159
|
+
<pre><code>
|
160
|
+
2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
|
161
|
+
2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
|
162
|
+
2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
|
163
|
+
2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
|
164
|
+
2009-07-21T21:44:29 world 65536 1.09110 32850 200916
|
165
|
+
2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
|
166
|
+
2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
|
167
|
+
</code></pre>
|
168
|
+
|
169
|
+
wu-lign will reformat it to read
|
170
|
+
|
171
|
+
<pre><code>
|
172
|
+
2009-07-21T21:39:40 day 65536 3.154790000 68750 1171316
|
173
|
+
2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
|
174
|
+
2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
|
175
|
+
2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
|
176
|
+
2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
|
177
|
+
2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
|
178
|
+
2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
|
179
|
+
</code></pre>
|
180
|
+
|
181
|
+
The fields are still tab-delimited by exactly one tab -- only spaces are used to pad out fields. You can still use cuttab and friends to manipulate columns.
|
182
|
+
|
183
|
+
wu-lign isn't intended to be smart, or correct, or reliable -- only to be useful for previewing and organizing tab-formatted files. In general @wu-lign(foo).split("\t").map(&:strip)@ *should* give output semantically equivalent to its input. (That is, the only changes should be insertion of spaces and re-formatting of numerics.) But still -- reserve its use for human inspection only.
|
184
|
+
|
185
|
+
(Note: tab characters in this source code file have been converted to spaces; replace whitespace with tab in the first example if you'd like to play along at home.)
|
186
|
+
|
187
|
+
h3. How it works
|
188
|
+
|
189
|
+
Wu-Lign takes the first 1000 lines, splits by TAB characters into fields, and tries to guess the format -- int, float, or string -- for each. It builds a consensus of the width and type for corresponding columns in the chunk. If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
|
190
|
+
|
191
|
+
h3. Command-line arguments
|
192
|
+
|
193
|
+
You can give sprintf-style positional arguments on the command line that will be applied to the corresponding columns. (Blank args are used for placeholding and auto-formatting is still applied). So with the example above,
|
194
|
+
|
195
|
+
@cat foo | wu-lign '' '' '' '%8.4e'@
|
196
|
+
|
197
|
+
will format the fourth column with "%8.4e", while the first three columns and fifth-and-higher columns are formatted as usual.
|
198
|
+
|
199
|
+
<pre><code>
|
200
|
+
...
|
201
|
+
2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
|
202
|
+
2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
|
203
|
+
2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
|
204
|
+
....
|
205
|
+
</code></pre>
|
206
|
+
|
207
|
+
h3. Notes
|
208
|
+
|
209
|
+
* It has no knowledge of header rows. An all-text first line will screw everything up.
|
210
|
+
* It also requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
|
211
|
+
* It won't set columns wider than 70 chars -- this allows for the occasional super-wide column without completely breaking your screen.
|
212
|
+
* For :float values, wu-lign tries to guess at the right number of significant digits to the left and right of the decimal point.
|
213
|
+
* wu-lign does not parse 'TSV files' in their strict sense -- there is no quoting or escaping; every tab delimits a field, every newline a record.
|
214
|
+
|
215
|
+
h2(#wutilsinc). Dear Lazyweb, please build this
|
216
|
+
|
217
|
+
* uniq - report or filter out repeated lines in a file
|
218
|
+
** -c produces line<tab>count
|
219
|
+
** --ignore f1,f2,... discards given fields from consideration. field syntax same as for cut, etc.
|
220
|
+
|
221
|
+
* sort - sort lines of text files
|
222
|
+
** columns indexed as tab-separated
|
223
|
+
** can specify any column order, uses same field spec as cut
|
224
|
+
* tsort - topological sort of a directed graph
|
225
|
+
|
226
|
+
* cut - select portions of each line of a file
|
227
|
+
** can reorder columns
|
228
|
+
* nl - line numbering filter
|
229
|
+
** takes prefix, suffix
|
230
|
+
** count \t line -OR- line \t count
|
231
|
+
|
232
|
+
* wc - word, line, character, and byte count
|
233
|
+
** field count (tab-separated fields)
|
234
|
+
* paste - merge corresponding or subsequent lines of files
|
235
|
+
* expand, unexpand - expand tabs to spaces, and vice versa
|
236
|
+
* seq
|
237
|
+
* simple row, column sums
|
238
|
+
* join - relational database operator
|
239
|
+
* tac
|
240
|
+
|
241
|
+
* cat - concatenate and print files
|
242
|
+
* head - display first lines of a file
|
243
|
+
* tail - display the last part of a file
|
244
|
+
* shuf
|
245
|
+
* split - split a file into pieces
|
246
|
+
* csplit - split files based on context
|
247
|
+
* tee - pipe fitting
|
248
|
+
|
249
|
+
* ls - list directory contents.
|
250
|
+
* df - display free disk space
|
251
|
+
* du - display disk usage statistics
|
252
|
+
** tab-delimited, space aligned
|
253
|
+
|
254
|
+
* od - octal, decimal, hex, ASCII dump
|
255
|
+
* printf - formatted output
|
256
|
+
* cksum, sum - display file checksums and block counts
|
257
|
+
* md5sum
|
258
|
+
|
259
|
+
* diff
|
260
|
+
* comm
|
261
|
+
|
262
|
+
|
263
|
+
<notextile></div></notextile>
|
data/wukong.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{wukong}
|
8
|
-
s.version = "
|
8
|
+
s.version = "1.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2009-10-
|
12
|
+
s.date = %q{2009-10-12}
|
13
13
|
s.description = %q{ Treat your dataset like a:
|
14
14
|
|
15
15
|
* stream of lines when it’s efficient to process by lines
|
@@ -25,31 +25,74 @@ Gem::Specification.new do |s|
|
|
25
25
|
"README.textile"
|
26
26
|
]
|
27
27
|
s.files = [
|
28
|
-
"
|
29
|
-
"
|
30
|
-
"
|
31
|
-
"
|
32
|
-
"
|
33
|
-
"
|
34
|
-
"
|
35
|
-
"
|
36
|
-
"
|
37
|
-
"
|
38
|
-
"
|
39
|
-
"
|
40
|
-
"
|
41
|
-
"
|
42
|
-
"
|
43
|
-
"
|
44
|
-
"
|
45
|
-
"
|
46
|
-
"
|
47
|
-
"
|
48
|
-
"
|
49
|
-
"
|
50
|
-
"
|
28
|
+
"INSTALL.textile",
|
29
|
+
"LICENSE.textile",
|
30
|
+
"README.textile",
|
31
|
+
"bin/cutc",
|
32
|
+
"bin/cuttab",
|
33
|
+
"bin/greptrue",
|
34
|
+
"bin/hdp-cat",
|
35
|
+
"bin/hdp-catd",
|
36
|
+
"bin/hdp-du",
|
37
|
+
"bin/hdp-get",
|
38
|
+
"bin/hdp-kill",
|
39
|
+
"bin/hdp-ls",
|
40
|
+
"bin/hdp-mkdir",
|
41
|
+
"bin/hdp-mv",
|
42
|
+
"bin/hdp-parts_to_keys.rb",
|
43
|
+
"bin/hdp-ps",
|
44
|
+
"bin/hdp-put",
|
45
|
+
"bin/hdp-rm",
|
46
|
+
"bin/hdp-sort",
|
47
|
+
"bin/hdp-stream",
|
48
|
+
"bin/hdp-stream-flat",
|
49
|
+
"bin/hdp-sync",
|
50
|
+
"bin/hdp-wc",
|
51
|
+
"bin/md5sort",
|
52
|
+
"bin/tabchar",
|
53
|
+
"bin/uniqc",
|
54
|
+
"bin/wu-hist",
|
55
|
+
"bin/wu-lign",
|
56
|
+
"bin/wu-sum",
|
57
|
+
"docpages/INSTALL.textile",
|
58
|
+
"docpages/INSTALL.textile",
|
59
|
+
"docpages/LICENSE.textile",
|
60
|
+
"docpages/LICENSE.textile",
|
61
|
+
"docpages/README-wulign.textile",
|
62
|
+
"docpages/README-wulign.textile",
|
63
|
+
"docpages/UsingWukong-part1-get_ready.textile",
|
64
|
+
"docpages/UsingWukong-part1-get_ready.textile",
|
65
|
+
"docpages/UsingWukong-part2-ThinkingBigData.textile",
|
66
|
+
"docpages/UsingWukong-part2-ThinkingBigData.textile",
|
67
|
+
"docpages/UsingWukong-part3-parsing.textile",
|
68
|
+
"docpages/UsingWukong-part3-parsing.textile",
|
69
|
+
"docpages/_config.yml",
|
70
|
+
"docpages/bigdata-tips.textile",
|
71
|
+
"docpages/bigdata-tips.textile",
|
72
|
+
"docpages/code/api_response_example.txt",
|
73
|
+
"docpages/code/parser_skeleton.rb",
|
74
|
+
"docpages/diagrams/MapReduceDiagram.graffle",
|
75
|
+
"docpages/favicon.ico",
|
76
|
+
"docpages/gem.css",
|
77
|
+
"docpages/hadoop-tips.textile",
|
78
|
+
"docpages/hadoop-tips.textile",
|
79
|
+
"docpages/index.textile",
|
80
|
+
"docpages/index.textile",
|
81
|
+
"docpages/intro.textile",
|
82
|
+
"docpages/intro.textile",
|
83
|
+
"docpages/moreinfo.textile",
|
84
|
+
"docpages/moreinfo.textile",
|
85
|
+
"docpages/news.html",
|
86
|
+
"docpages/pig/PigLatinExpressionsList.txt",
|
87
|
+
"docpages/pig/PigLatinReferenceManual.html",
|
88
|
+
"docpages/pig/PigLatinReferenceManual.txt",
|
89
|
+
"docpages/tutorial.textile",
|
90
|
+
"docpages/tutorial.textile",
|
91
|
+
"docpages/usage.textile",
|
92
|
+
"docpages/usage.textile",
|
93
|
+
"docpages/wutils.textile",
|
94
|
+
"docpages/wutils.textile",
|
51
95
|
"examples/README.txt",
|
52
|
-
"examples/and_pig/sample_queries.rb",
|
53
96
|
"examples/apache_log_parser.rb",
|
54
97
|
"examples/count_keys.rb",
|
55
98
|
"examples/count_keys_at_mapper.rb",
|
@@ -61,6 +104,7 @@ Gem::Specification.new do |s|
|
|
61
104
|
"examples/package-local.rb",
|
62
105
|
"examples/package.rb",
|
63
106
|
"examples/pagerank/README.textile",
|
107
|
+
"examples/pagerank/README.textile",
|
64
108
|
"examples/pagerank/gen_initial_pagerank_graph.pig",
|
65
109
|
"examples/pagerank/pagerank.rb",
|
66
110
|
"examples/pagerank/pagerank_initialize.rb",
|
@@ -71,28 +115,6 @@ Gem::Specification.new do |s|
|
|
71
115
|
"examples/size.rb",
|
72
116
|
"examples/word_count.rb",
|
73
117
|
"lib/wukong.rb",
|
74
|
-
"lib/wukong/and_pig.rb",
|
75
|
-
"lib/wukong/and_pig/README.textile",
|
76
|
-
"lib/wukong/and_pig/as.rb",
|
77
|
-
"lib/wukong/and_pig/data_types.rb",
|
78
|
-
"lib/wukong/and_pig/functions.rb",
|
79
|
-
"lib/wukong/and_pig/generate.rb",
|
80
|
-
"lib/wukong/and_pig/generate/variable_inflections.rb",
|
81
|
-
"lib/wukong/and_pig/junk.rb",
|
82
|
-
"lib/wukong/and_pig/operators.rb",
|
83
|
-
"lib/wukong/and_pig/operators/compound.rb",
|
84
|
-
"lib/wukong/and_pig/operators/evaluators.rb",
|
85
|
-
"lib/wukong/and_pig/operators/execution.rb",
|
86
|
-
"lib/wukong/and_pig/operators/file_methods.rb",
|
87
|
-
"lib/wukong/and_pig/operators/foreach.rb",
|
88
|
-
"lib/wukong/and_pig/operators/groupies.rb",
|
89
|
-
"lib/wukong/and_pig/operators/load_store.rb",
|
90
|
-
"lib/wukong/and_pig/operators/meta.rb",
|
91
|
-
"lib/wukong/and_pig/operators/relational.rb",
|
92
|
-
"lib/wukong/and_pig/pig_struct.rb",
|
93
|
-
"lib/wukong/and_pig/pig_var.rb",
|
94
|
-
"lib/wukong/and_pig/symbol.rb",
|
95
|
-
"lib/wukong/and_pig/utils.rb",
|
96
118
|
"lib/wukong/bad_record.rb",
|
97
119
|
"lib/wukong/boot.rb",
|
98
120
|
"lib/wukong/datatypes.rb",
|
@@ -141,7 +163,7 @@ Gem::Specification.new do |s|
|
|
141
163
|
"spec/spec_helper.rb",
|
142
164
|
"wukong.gemspec"
|
143
165
|
]
|
144
|
-
s.homepage = %q{http://github.com/
|
166
|
+
s.homepage = %q{http://mrflip.github.com/wukong}
|
145
167
|
s.rdoc_options = ["--charset=UTF-8"]
|
146
168
|
s.require_paths = ["lib"]
|
147
169
|
s.rubygems_version = %q{1.3.5}
|
@@ -149,7 +171,6 @@ Gem::Specification.new do |s|
|
|
149
171
|
s.test_files = [
|
150
172
|
"spec/bin/hdp-wc_spec.rb",
|
151
173
|
"spec/spec_helper.rb",
|
152
|
-
"examples/and_pig/sample_queries.rb",
|
153
174
|
"examples/apache_log_parser.rb",
|
154
175
|
"examples/count_keys.rb",
|
155
176
|
"examples/count_keys_at_mapper.rb",
|
@@ -173,8 +194,17 @@ Gem::Specification.new do |s|
|
|
173
194
|
s.specification_version = 3
|
174
195
|
|
175
196
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
197
|
+
s.add_runtime_dependency(%q<addressable>, [">= 0"])
|
198
|
+
s.add_runtime_dependency(%q<extlib>, [">= 0"])
|
199
|
+
s.add_runtime_dependency(%q<htmlentities>, [">= 0"])
|
176
200
|
else
|
201
|
+
s.add_dependency(%q<addressable>, [">= 0"])
|
202
|
+
s.add_dependency(%q<extlib>, [">= 0"])
|
203
|
+
s.add_dependency(%q<htmlentities>, [">= 0"])
|
177
204
|
end
|
178
205
|
else
|
206
|
+
s.add_dependency(%q<addressable>, [">= 0"])
|
207
|
+
s.add_dependency(%q<extlib>, [">= 0"])
|
208
|
+
s.add_dependency(%q<htmlentities>, [">= 0"])
|
179
209
|
end
|
180
210
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Philip (flip) Kromer
|
@@ -9,10 +9,39 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-10-
|
12
|
+
date: 2009-10-12 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: addressable
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: extlib
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: htmlentities
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
16
45
|
description: " Treat your dataset like a:\n\n * stream of lines when it\xE2\x80\x99s efficient to process by lines\n * stream of field arrays when it\xE2\x80\x99s efficient to deal directly with fields\n * stream of lightweight objects when it\xE2\x80\x99s efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
|
17
46
|
email: flip@infochimps.org
|
18
47
|
executables:
|
@@ -48,31 +77,60 @@ extra_rdoc_files:
|
|
48
77
|
- LICENSE.textile
|
49
78
|
- README.textile
|
50
79
|
files:
|
51
|
-
-
|
52
|
-
-
|
53
|
-
-
|
54
|
-
-
|
55
|
-
-
|
56
|
-
-
|
57
|
-
-
|
58
|
-
-
|
59
|
-
-
|
60
|
-
-
|
61
|
-
-
|
62
|
-
-
|
63
|
-
-
|
64
|
-
-
|
65
|
-
-
|
66
|
-
-
|
67
|
-
-
|
68
|
-
-
|
69
|
-
-
|
70
|
-
-
|
71
|
-
-
|
72
|
-
-
|
73
|
-
-
|
80
|
+
- INSTALL.textile
|
81
|
+
- LICENSE.textile
|
82
|
+
- README.textile
|
83
|
+
- bin/cutc
|
84
|
+
- bin/cuttab
|
85
|
+
- bin/greptrue
|
86
|
+
- bin/hdp-cat
|
87
|
+
- bin/hdp-catd
|
88
|
+
- bin/hdp-du
|
89
|
+
- bin/hdp-get
|
90
|
+
- bin/hdp-kill
|
91
|
+
- bin/hdp-ls
|
92
|
+
- bin/hdp-mkdir
|
93
|
+
- bin/hdp-mv
|
94
|
+
- bin/hdp-parts_to_keys.rb
|
95
|
+
- bin/hdp-ps
|
96
|
+
- bin/hdp-put
|
97
|
+
- bin/hdp-rm
|
98
|
+
- bin/hdp-sort
|
99
|
+
- bin/hdp-stream
|
100
|
+
- bin/hdp-stream-flat
|
101
|
+
- bin/hdp-sync
|
102
|
+
- bin/hdp-wc
|
103
|
+
- bin/md5sort
|
104
|
+
- bin/tabchar
|
105
|
+
- bin/uniqc
|
106
|
+
- bin/wu-hist
|
107
|
+
- bin/wu-lign
|
108
|
+
- bin/wu-sum
|
109
|
+
- docpages/INSTALL.textile
|
110
|
+
- docpages/LICENSE.textile
|
111
|
+
- docpages/README-wulign.textile
|
112
|
+
- docpages/UsingWukong-part1-get_ready.textile
|
113
|
+
- docpages/UsingWukong-part2-ThinkingBigData.textile
|
114
|
+
- docpages/UsingWukong-part3-parsing.textile
|
115
|
+
- docpages/_config.yml
|
116
|
+
- docpages/bigdata-tips.textile
|
117
|
+
- docpages/code/api_response_example.txt
|
118
|
+
- docpages/code/parser_skeleton.rb
|
119
|
+
- docpages/diagrams/MapReduceDiagram.graffle
|
120
|
+
- docpages/favicon.ico
|
121
|
+
- docpages/gem.css
|
122
|
+
- docpages/hadoop-tips.textile
|
123
|
+
- docpages/index.textile
|
124
|
+
- docpages/intro.textile
|
125
|
+
- docpages/moreinfo.textile
|
126
|
+
- docpages/news.html
|
127
|
+
- docpages/pig/PigLatinExpressionsList.txt
|
128
|
+
- docpages/pig/PigLatinReferenceManual.html
|
129
|
+
- docpages/pig/PigLatinReferenceManual.txt
|
130
|
+
- docpages/tutorial.textile
|
131
|
+
- docpages/usage.textile
|
132
|
+
- docpages/wutils.textile
|
74
133
|
- examples/README.txt
|
75
|
-
- examples/and_pig/sample_queries.rb
|
76
134
|
- examples/apache_log_parser.rb
|
77
135
|
- examples/count_keys.rb
|
78
136
|
- examples/count_keys_at_mapper.rb
|
@@ -94,28 +152,6 @@ files:
|
|
94
152
|
- examples/size.rb
|
95
153
|
- examples/word_count.rb
|
96
154
|
- lib/wukong.rb
|
97
|
-
- lib/wukong/and_pig.rb
|
98
|
-
- lib/wukong/and_pig/README.textile
|
99
|
-
- lib/wukong/and_pig/as.rb
|
100
|
-
- lib/wukong/and_pig/data_types.rb
|
101
|
-
- lib/wukong/and_pig/functions.rb
|
102
|
-
- lib/wukong/and_pig/generate.rb
|
103
|
-
- lib/wukong/and_pig/generate/variable_inflections.rb
|
104
|
-
- lib/wukong/and_pig/junk.rb
|
105
|
-
- lib/wukong/and_pig/operators.rb
|
106
|
-
- lib/wukong/and_pig/operators/compound.rb
|
107
|
-
- lib/wukong/and_pig/operators/evaluators.rb
|
108
|
-
- lib/wukong/and_pig/operators/execution.rb
|
109
|
-
- lib/wukong/and_pig/operators/file_methods.rb
|
110
|
-
- lib/wukong/and_pig/operators/foreach.rb
|
111
|
-
- lib/wukong/and_pig/operators/groupies.rb
|
112
|
-
- lib/wukong/and_pig/operators/load_store.rb
|
113
|
-
- lib/wukong/and_pig/operators/meta.rb
|
114
|
-
- lib/wukong/and_pig/operators/relational.rb
|
115
|
-
- lib/wukong/and_pig/pig_struct.rb
|
116
|
-
- lib/wukong/and_pig/pig_var.rb
|
117
|
-
- lib/wukong/and_pig/symbol.rb
|
118
|
-
- lib/wukong/and_pig/utils.rb
|
119
155
|
- lib/wukong/bad_record.rb
|
120
156
|
- lib/wukong/boot.rb
|
121
157
|
- lib/wukong/datatypes.rb
|
@@ -163,10 +199,8 @@ files:
|
|
163
199
|
- spec/bin/hdp-wc_spec.rb
|
164
200
|
- spec/spec_helper.rb
|
165
201
|
- wukong.gemspec
|
166
|
-
- LICENSE.textile
|
167
|
-
- README.textile
|
168
202
|
has_rdoc: true
|
169
|
-
homepage: http://github.com/
|
203
|
+
homepage: http://mrflip.github.com/wukong
|
170
204
|
licenses: []
|
171
205
|
|
172
206
|
post_install_message:
|
@@ -196,7 +230,6 @@ summary: Wukong makes Hadoop so easy a chimpanzee can use it.
|
|
196
230
|
test_files:
|
197
231
|
- spec/bin/hdp-wc_spec.rb
|
198
232
|
- spec/spec_helper.rb
|
199
|
-
- examples/and_pig/sample_queries.rb
|
200
233
|
- examples/apache_log_parser.rb
|
201
234
|
- examples/count_keys.rb
|
202
235
|
- examples/count_keys_at_mapper.rb
|