wukong 0.1.4 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/INSTALL.textile +89 -0
  2. data/README.textile +41 -74
  3. data/docpages/INSTALL.textile +94 -0
  4. data/{doc → docpages}/LICENSE.textile +0 -0
  5. data/{doc → docpages}/README-wulign.textile +6 -0
  6. data/docpages/UsingWukong-part1-get_ready.textile +17 -0
  7. data/{doc/overview.textile → docpages/UsingWukong-part2-ThinkingBigData.textile} +8 -24
  8. data/{doc → docpages}/UsingWukong-part3-parsing.textile +8 -2
  9. data/docpages/_config.yml +39 -0
  10. data/{doc/tips.textile → docpages/bigdata-tips.textile} +71 -44
  11. data/{doc → docpages}/code/api_response_example.txt +0 -0
  12. data/{doc → docpages}/code/parser_skeleton.rb +0 -0
  13. data/{doc/intro_to_map_reduce → docpages/diagrams}/MapReduceDiagram.graffle +0 -0
  14. data/docpages/favicon.ico +0 -0
  15. data/docpages/gem.css +16 -0
  16. data/docpages/hadoop-tips.textile +83 -0
  17. data/docpages/index.textile +90 -0
  18. data/docpages/intro.textile +8 -0
  19. data/docpages/moreinfo.textile +174 -0
  20. data/docpages/news.html +24 -0
  21. data/{doc → docpages}/pig/PigLatinExpressionsList.txt +0 -0
  22. data/{doc → docpages}/pig/PigLatinReferenceManual.html +0 -0
  23. data/{doc → docpages}/pig/PigLatinReferenceManual.txt +0 -0
  24. data/docpages/tutorial.textile +283 -0
  25. data/docpages/usage.textile +195 -0
  26. data/docpages/wutils.textile +263 -0
  27. data/wukong.gemspec +80 -50
  28. metadata +87 -54
  29. data/doc/INSTALL.textile +0 -41
  30. data/doc/README-tutorial.textile +0 -163
  31. data/doc/README-wutils.textile +0 -128
  32. data/doc/TODO.textile +0 -61
  33. data/doc/UsingWukong-part1-setup.textile +0 -2
  34. data/doc/UsingWukong-part2-scraping.textile +0 -2
  35. data/doc/hadoop-nfs.textile +0 -51
  36. data/doc/hadoop-setup.textile +0 -29
  37. data/doc/index.textile +0 -124
  38. data/doc/links.textile +0 -42
  39. data/doc/usage.textile +0 -102
  40. data/doc/utils.textile +0 -48
  41. data/examples/and_pig/sample_queries.rb +0 -128
  42. data/lib/wukong/and_pig.rb +0 -62
  43. data/lib/wukong/and_pig/README.textile +0 -12
  44. data/lib/wukong/and_pig/as.rb +0 -37
  45. data/lib/wukong/and_pig/data_types.rb +0 -30
  46. data/lib/wukong/and_pig/functions.rb +0 -50
  47. data/lib/wukong/and_pig/generate.rb +0 -85
  48. data/lib/wukong/and_pig/generate/variable_inflections.rb +0 -82
  49. data/lib/wukong/and_pig/junk.rb +0 -51
  50. data/lib/wukong/and_pig/operators.rb +0 -8
  51. data/lib/wukong/and_pig/operators/compound.rb +0 -29
  52. data/lib/wukong/and_pig/operators/evaluators.rb +0 -7
  53. data/lib/wukong/and_pig/operators/execution.rb +0 -15
  54. data/lib/wukong/and_pig/operators/file_methods.rb +0 -29
  55. data/lib/wukong/and_pig/operators/foreach.rb +0 -98
  56. data/lib/wukong/and_pig/operators/groupies.rb +0 -212
  57. data/lib/wukong/and_pig/operators/load_store.rb +0 -65
  58. data/lib/wukong/and_pig/operators/meta.rb +0 -42
  59. data/lib/wukong/and_pig/operators/relational.rb +0 -129
  60. data/lib/wukong/and_pig/pig_struct.rb +0 -48
  61. data/lib/wukong/and_pig/pig_var.rb +0 -95
  62. data/lib/wukong/and_pig/symbol.rb +0 -29
  63. data/lib/wukong/and_pig/utils.rb +0 -0
@@ -0,0 +1,263 @@
1
+ ---
2
+ layout: default
3
+ title: mrflip.github.com/wukong - wu-utils utilities
4
+ collapse: false
5
+ ---
6
+
7
+ h1(gemheader). Wukong Utility Scripts
8
+
9
+ ** "Overview of wutils":#wutils -- command listing
10
+ ** "Stupid command-line tricks":#cmdlinetricks using the wutils
11
+ ** "wu-lign":#wulign -- present a tab-separated file as aligned columns
12
+ ** Dear Lazyweb, please build this for us: "tab-oriented version of the Textutils library":#wutilsinc
13
+
14
+ <notextile><div class="toggle"></notextile>
15
+
16
+ h2(#cmdlinetricks). Stupid command-line tricks
17
+
18
+ Here are a few useful little snippets you can run from the command line:
19
+
20
+ h3. Histogram
21
+
22
+ Given data with a date column:
23
+
24
+ <pre>
25
+ message 235623 20090423012345 Now is the winter of our discontent Made glorious summer by this son of York
26
+ message 235623 20080101230900 These pretzels are making me THIRSTY!
27
+ ...
28
+ </pre>
29
+
30
+ You can calculate number of messages sent by day with
31
+
32
+ <pre>
33
+ cat messages | cuttab 3 | cutc 8 | sort | uniq -c
34
+ </pre>
35
+
36
+ (see the wu-hist command, below.)
37
+
38
+ h3. Simple intersection, union, etc
39
+
40
+ For two datasets (batch_1 and batch_2) with unique entries (no repeated lines),
41
+
42
+ * Their union is simple:
43
+
44
+ <pre>
45
+ cat batch_1 batch_2 | sort -u
46
+ </pre>
47
+
48
+ * To find their intersection, concatenate the two sets and filter out everything that only occurred once.
49
+
50
+ <pre>
51
+ cat batch_1 batch_2 | sort | uniq -c | egrep -v '^ *1 '
52
+ </pre>
53
+
54
+ * For the complement of the intersection, use @... | egrep '^ *1 '@
55
+
56
+ * In both cases, if the files are each internally sorted, the commandline sort takes a --merge flag:
57
+
58
+ <pre>
59
+ sort --merge -u batch_1 batch_2
60
+ </pre>
61
+
62
+ <notextile></div><div class="toggle"></notextile>
63
+
64
+ h2(#wutils). Wutils Command Listing
65
+
66
+ h3. cutc
67
+
68
+ @cutc [colnum]@
69
+
70
+ Ex.
71
+
72
+ @echo -e 'foo\tbar\tbaz' | cutc 6@
73
+ @foo ba@
74
+
75
+ Cuts from beginning of line to given column (default 200). A tab is one character, so right margin can still be ragged.
76
+
77
+ h3. cuttab
78
+
79
+ @cuttab [colspec]@
80
+
81
+ Cuts given tab-separated columns. You can give a comma-separated list of numbers
82
+ or ranges (e.g. 1-4). Columns are numbered from 1.
83
+
84
+ Ex.
85
+
86
+ <pre>
87
+ echo -e 'foo\tbar\tbaz' | cuttab 1,3
88
+ foo baz
89
+ </pre>
90
+
91
+ h3. hdp-*
92
+
93
+ These perform the corresponding commands on the HDFS filesystem. In general,
94
+ where they accept command-line flags, they go with the GNU-style ones, not the
95
+ hadoop-style: so, @hdp-du -s dir@ or @hdp-rm -r foo/@
96
+
97
+ * @hdp-cat@
98
+ * @hdp-catd@ -- cats the files that don't start with '_' in a directory. Use this for a pile of @.../part-00000@ files
99
+ * @hdp-du@
100
+ * @hdp-get@
101
+ * @hdp-kill@
102
+ * @hdp-ls@
103
+ * @hdp-mkdir@
104
+ * @hdp-mv@
105
+ * @hdp-ps@
106
+ * @hdp-put@
107
+ * @hdp-rm@
108
+ * @hdp-sync@
109
+
110
+ h3. hdp-sort, hdp-stream, hdp-stream-flat
111
+
112
+ * @hdp-sort@
113
+ * @hdp-stream@
114
+ * @hdp-stream-flat@
115
+
116
+ <pre><code>
117
+ hdp-stream input_filespec output_file map_cmd reduce_cmd num_key_fields
118
+ </code></pre>
119
+
120
+ h3. tabchar
121
+
122
+ Outputs a single tab character.
123
+
124
+ h3. wu-hist
125
+
126
+ Occasionally useful to gather a lexical histogram of a single column:
127
+
128
+ Ex.
129
+
130
+ <pre><code>
131
+ $ echo -e 'foo\nbar\nbar\nfoo\nfoo\nfoo\n7' | ./wu-hist
132
+ 4 foo
133
+ 2 bar
134
+ 1 7
135
+ </code></pre>
136
+
137
+ (the output will have a tab between the first and second column, for further processing.)
138
+
139
+ h3. wu-lign
140
+
141
+ Intelligently format a tab-separated file into aligned columns (while remaining tab-separated for further processing). See "below":#wulign.
142
+
143
+ h3. hdp-parts_to_keys.rb
144
+
145
+ A *very* clumsy script to rename reduced hadoop output files by their initial key.
146
+
147
+ If your output file has an initial key in the first column and you pass it through hdp-sort, its records will be distributed across reducers and thus across output files. (Because of the way hadoop hashes the keys, there's no guarantee that each file will get a distinct key. You could have 2 keys with a million entries and they could land sequentially on the same reducer, always fun.)
148
+
149
+ If you're willing to roll the dice, this script will rename files according to the first key in the first line.
150
+
151
+ **Do you have or know of a native hadoop utility to do this?** If so, please get in touch!
152
+
153
+ <notextile></div><div class="toggle"></notextile>
154
+
155
+ h2(#wulign). wu-lign -- format a tab-separated file as aligned columns
156
+
157
+ wu-lign will intelligently reformat a tab-separated file into a tab-separated, space-aligned file that is still suitable for further processing. For example, given the log-file input
158
+
159
+ <pre><code>
160
+ 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
161
+ 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
162
+ 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
163
+ 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
164
+ 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
165
+ 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
166
+ 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
167
+ </code></pre>
168
+
169
+ wu-lign will reformat it to read
170
+
171
+ <pre><code>
172
+ 2009-07-21T21:39:40 day 65536 3.154790000 68750 1171316
173
+ 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
174
+ 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
175
+ 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
176
+ 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
177
+ 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
178
+ 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
179
+ </code></pre>
180
+
181
+ The fields are still tab-delimited by exactly one tab -- only spaces are used to pad out fields. You can still use cuttab and friends to manipulate columns.
182
+
183
+ wu-lign isn't intended to be smart, or correct, or reliable -- only to be useful for previewing and organizing tab-formatted files. In general @wu-lign(foo).split("\t").map(&:strip)@ *should* give output semantically equivalent to its input. (That is, the only changes should be insertion of spaces and re-formatting of numerics.) But still -- reserve its use for human inspection only.
184
+
185
+ (Note: tab characters in this source code file have been converted to spaces; replace whitespace with tab in the first example if you'd like to play along at home.)
186
+
187
+ h3. How it works
188
+
189
+ Wu-Lign takes the first 1000 lines, splits by TAB characters into fields, and tries to guess the format -- int, float, or string -- for each. It builds a consensus of the width and type for corresponding columns in the chunk. If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
190
+
191
+ h3. Command-line arguments
192
+
193
+ You can give sprintf-style positional arguments on the command line that will be applied to the corresponding columns. (Blank args are used for placeholding and auto-formatting is still applied). So with the example above,
194
+
195
+ @cat foo | wu-lign '' '' '' '%8.4e'@
196
+
197
+ will format the fourth column with "%8.4e", while the first three columns and fifth-and-higher columns are formatted as usual.
198
+
199
+ <pre><code>
200
+ ...
201
+ 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
202
+ 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
203
+ 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
204
+ ....
205
+ </code></pre>
206
+
207
+ h3. Notes
208
+
209
+ * It has no knowledge of header rows. An all-text first line will screw everything up.
210
+ * It also requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
211
+ * It won't set columns wider than 70 chars -- this allows for the occasional super-wide column without completely breaking your screen.
212
+ * For :float values, wu-lign tries to guess at the right number of significant digits to the left and right of the decimal point.
213
+ * wu-lign does not parse 'TSV files' in their strict sense -- there is no quoting or escaping; every tab delimits a field, every newline a record.
214
+
215
+ h2(#wutilsinc). Dear Lazyweb, please build this
216
+
217
+ * uniq - report or filter out repeated lines in a file
218
+ ** -c produces line<tab>count
219
+ ** --ignore f1,f2,... discards given fields from consideration. field syntax same as for cut, etc.
220
+
221
+ * sort - sort lines of text files
222
+ ** columns indexed as tab-separated
223
+ ** can specify any column order, uses same field spec as cut
224
+ * tsort - topological sort of a directed graph
225
+
226
+ * cut - select portions of each line of a file
227
+ ** can reorder columns
228
+ * nl - line numbering filter
229
+ ** takes prefix, suffix
230
+ ** count \t line -OR- line \t count
231
+
232
+ * wc - word, line, character, and byte count
233
+ ** field count (tab-separated fields)
234
+ * paste - merge corresponding or subsequent lines of files
235
+ * expand, unexpand - expand tabs to spaces, and vice versa
236
+ * seq
237
+ * simple row, column sums
238
+ * join - relational database operator
239
+ * tac
240
+
241
+ * cat - concatenate and print files
242
+ * head - display first lines of a file
243
+ * tail - display the last part of a file
244
+ * shuf
245
+ * split - split a file into pieces
246
+ * csplit - split files based on context
247
+ * tee - pipe fitting
248
+
249
+ * ls - list directory contents.
250
+ * df - display free disk space
251
+ * du - display disk usage statistics
252
+ ** tab-delimited, space aligned
253
+
254
+ * od - octal, decimal, hex, ASCII dump
255
+ * printf - formatted output
256
+ * cksum, sum - display file checksums and block counts
257
+ * md5sum
258
+
259
+ * diff
260
+ * comm
261
+
262
+
263
+ <notextile></div></notextile>
data/wukong.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "0.1.4"
8
+ s.version = "1.4.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2009-10-05}
12
+ s.date = %q{2009-10-12}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it’s efficient to process by lines
@@ -25,31 +25,74 @@ Gem::Specification.new do |s|
25
25
  "README.textile"
26
26
  ]
27
27
  s.files = [
28
- "doc/INSTALL.textile",
29
- "doc/LICENSE.textile",
30
- "doc/README-tutorial.textile",
31
- "doc/README-wulign.textile",
32
- "doc/README-wutils.textile",
33
- "doc/TODO.textile",
34
- "doc/UsingWukong-part1-setup.textile",
35
- "doc/UsingWukong-part2-scraping.textile",
36
- "doc/UsingWukong-part3-parsing.textile",
37
- "doc/code/api_response_example.txt",
38
- "doc/code/parser_skeleton.rb",
39
- "doc/hadoop-nfs.textile",
40
- "doc/hadoop-setup.textile",
41
- "doc/index.textile",
42
- "doc/intro_to_map_reduce/MapReduceDiagram.graffle",
43
- "doc/links.textile",
44
- "doc/overview.textile",
45
- "doc/pig/PigLatinExpressionsList.txt",
46
- "doc/pig/PigLatinReferenceManual.html",
47
- "doc/pig/PigLatinReferenceManual.txt",
48
- "doc/tips.textile",
49
- "doc/usage.textile",
50
- "doc/utils.textile",
28
+ "INSTALL.textile",
29
+ "LICENSE.textile",
30
+ "README.textile",
31
+ "bin/cutc",
32
+ "bin/cuttab",
33
+ "bin/greptrue",
34
+ "bin/hdp-cat",
35
+ "bin/hdp-catd",
36
+ "bin/hdp-du",
37
+ "bin/hdp-get",
38
+ "bin/hdp-kill",
39
+ "bin/hdp-ls",
40
+ "bin/hdp-mkdir",
41
+ "bin/hdp-mv",
42
+ "bin/hdp-parts_to_keys.rb",
43
+ "bin/hdp-ps",
44
+ "bin/hdp-put",
45
+ "bin/hdp-rm",
46
+ "bin/hdp-sort",
47
+ "bin/hdp-stream",
48
+ "bin/hdp-stream-flat",
49
+ "bin/hdp-sync",
50
+ "bin/hdp-wc",
51
+ "bin/md5sort",
52
+ "bin/tabchar",
53
+ "bin/uniqc",
54
+ "bin/wu-hist",
55
+ "bin/wu-lign",
56
+ "bin/wu-sum",
57
+ "docpages/INSTALL.textile",
58
+ "docpages/INSTALL.textile",
59
+ "docpages/LICENSE.textile",
60
+ "docpages/LICENSE.textile",
61
+ "docpages/README-wulign.textile",
62
+ "docpages/README-wulign.textile",
63
+ "docpages/UsingWukong-part1-get_ready.textile",
64
+ "docpages/UsingWukong-part1-get_ready.textile",
65
+ "docpages/UsingWukong-part2-ThinkingBigData.textile",
66
+ "docpages/UsingWukong-part2-ThinkingBigData.textile",
67
+ "docpages/UsingWukong-part3-parsing.textile",
68
+ "docpages/UsingWukong-part3-parsing.textile",
69
+ "docpages/_config.yml",
70
+ "docpages/bigdata-tips.textile",
71
+ "docpages/bigdata-tips.textile",
72
+ "docpages/code/api_response_example.txt",
73
+ "docpages/code/parser_skeleton.rb",
74
+ "docpages/diagrams/MapReduceDiagram.graffle",
75
+ "docpages/favicon.ico",
76
+ "docpages/gem.css",
77
+ "docpages/hadoop-tips.textile",
78
+ "docpages/hadoop-tips.textile",
79
+ "docpages/index.textile",
80
+ "docpages/index.textile",
81
+ "docpages/intro.textile",
82
+ "docpages/intro.textile",
83
+ "docpages/moreinfo.textile",
84
+ "docpages/moreinfo.textile",
85
+ "docpages/news.html",
86
+ "docpages/pig/PigLatinExpressionsList.txt",
87
+ "docpages/pig/PigLatinReferenceManual.html",
88
+ "docpages/pig/PigLatinReferenceManual.txt",
89
+ "docpages/tutorial.textile",
90
+ "docpages/tutorial.textile",
91
+ "docpages/usage.textile",
92
+ "docpages/usage.textile",
93
+ "docpages/wutils.textile",
94
+ "docpages/wutils.textile",
51
95
  "examples/README.txt",
52
- "examples/and_pig/sample_queries.rb",
53
96
  "examples/apache_log_parser.rb",
54
97
  "examples/count_keys.rb",
55
98
  "examples/count_keys_at_mapper.rb",
@@ -61,6 +104,7 @@ Gem::Specification.new do |s|
61
104
  "examples/package-local.rb",
62
105
  "examples/package.rb",
63
106
  "examples/pagerank/README.textile",
107
+ "examples/pagerank/README.textile",
64
108
  "examples/pagerank/gen_initial_pagerank_graph.pig",
65
109
  "examples/pagerank/pagerank.rb",
66
110
  "examples/pagerank/pagerank_initialize.rb",
@@ -71,28 +115,6 @@ Gem::Specification.new do |s|
71
115
  "examples/size.rb",
72
116
  "examples/word_count.rb",
73
117
  "lib/wukong.rb",
74
- "lib/wukong/and_pig.rb",
75
- "lib/wukong/and_pig/README.textile",
76
- "lib/wukong/and_pig/as.rb",
77
- "lib/wukong/and_pig/data_types.rb",
78
- "lib/wukong/and_pig/functions.rb",
79
- "lib/wukong/and_pig/generate.rb",
80
- "lib/wukong/and_pig/generate/variable_inflections.rb",
81
- "lib/wukong/and_pig/junk.rb",
82
- "lib/wukong/and_pig/operators.rb",
83
- "lib/wukong/and_pig/operators/compound.rb",
84
- "lib/wukong/and_pig/operators/evaluators.rb",
85
- "lib/wukong/and_pig/operators/execution.rb",
86
- "lib/wukong/and_pig/operators/file_methods.rb",
87
- "lib/wukong/and_pig/operators/foreach.rb",
88
- "lib/wukong/and_pig/operators/groupies.rb",
89
- "lib/wukong/and_pig/operators/load_store.rb",
90
- "lib/wukong/and_pig/operators/meta.rb",
91
- "lib/wukong/and_pig/operators/relational.rb",
92
- "lib/wukong/and_pig/pig_struct.rb",
93
- "lib/wukong/and_pig/pig_var.rb",
94
- "lib/wukong/and_pig/symbol.rb",
95
- "lib/wukong/and_pig/utils.rb",
96
118
  "lib/wukong/bad_record.rb",
97
119
  "lib/wukong/boot.rb",
98
120
  "lib/wukong/datatypes.rb",
@@ -141,7 +163,7 @@ Gem::Specification.new do |s|
141
163
  "spec/spec_helper.rb",
142
164
  "wukong.gemspec"
143
165
  ]
144
- s.homepage = %q{http://github.com/mrflip/wukong}
166
+ s.homepage = %q{http://mrflip.github.com/wukong}
145
167
  s.rdoc_options = ["--charset=UTF-8"]
146
168
  s.require_paths = ["lib"]
147
169
  s.rubygems_version = %q{1.3.5}
@@ -149,7 +171,6 @@ Gem::Specification.new do |s|
149
171
  s.test_files = [
150
172
  "spec/bin/hdp-wc_spec.rb",
151
173
  "spec/spec_helper.rb",
152
- "examples/and_pig/sample_queries.rb",
153
174
  "examples/apache_log_parser.rb",
154
175
  "examples/count_keys.rb",
155
176
  "examples/count_keys_at_mapper.rb",
@@ -173,8 +194,17 @@ Gem::Specification.new do |s|
173
194
  s.specification_version = 3
174
195
 
175
196
  if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
197
+ s.add_runtime_dependency(%q<addressable>, [">= 0"])
198
+ s.add_runtime_dependency(%q<extlib>, [">= 0"])
199
+ s.add_runtime_dependency(%q<htmlentities>, [">= 0"])
176
200
  else
201
+ s.add_dependency(%q<addressable>, [">= 0"])
202
+ s.add_dependency(%q<extlib>, [">= 0"])
203
+ s.add_dependency(%q<htmlentities>, [">= 0"])
177
204
  end
178
205
  else
206
+ s.add_dependency(%q<addressable>, [">= 0"])
207
+ s.add_dependency(%q<extlib>, [">= 0"])
208
+ s.add_dependency(%q<htmlentities>, [">= 0"])
179
209
  end
180
210
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Philip (flip) Kromer
@@ -9,10 +9,39 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-05 00:00:00 -05:00
12
+ date: 2009-10-12 00:00:00 -05:00
13
13
  default_executable:
14
- dependencies: []
15
-
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: addressable
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: extlib
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: htmlentities
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
16
45
  description: " Treat your dataset like a:\n\n * stream of lines when it\xE2\x80\x99s efficient to process by lines\n * stream of field arrays when it\xE2\x80\x99s efficient to deal directly with fields\n * stream of lightweight objects when it\xE2\x80\x99s efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
17
46
  email: flip@infochimps.org
18
47
  executables:
@@ -48,31 +77,60 @@ extra_rdoc_files:
48
77
  - LICENSE.textile
49
78
  - README.textile
50
79
  files:
51
- - doc/INSTALL.textile
52
- - doc/LICENSE.textile
53
- - doc/README-tutorial.textile
54
- - doc/README-wulign.textile
55
- - doc/README-wutils.textile
56
- - doc/TODO.textile
57
- - doc/UsingWukong-part1-setup.textile
58
- - doc/UsingWukong-part2-scraping.textile
59
- - doc/UsingWukong-part3-parsing.textile
60
- - doc/code/api_response_example.txt
61
- - doc/code/parser_skeleton.rb
62
- - doc/hadoop-nfs.textile
63
- - doc/hadoop-setup.textile
64
- - doc/index.textile
65
- - doc/intro_to_map_reduce/MapReduceDiagram.graffle
66
- - doc/links.textile
67
- - doc/overview.textile
68
- - doc/pig/PigLatinExpressionsList.txt
69
- - doc/pig/PigLatinReferenceManual.html
70
- - doc/pig/PigLatinReferenceManual.txt
71
- - doc/tips.textile
72
- - doc/usage.textile
73
- - doc/utils.textile
80
+ - INSTALL.textile
81
+ - LICENSE.textile
82
+ - README.textile
83
+ - bin/cutc
84
+ - bin/cuttab
85
+ - bin/greptrue
86
+ - bin/hdp-cat
87
+ - bin/hdp-catd
88
+ - bin/hdp-du
89
+ - bin/hdp-get
90
+ - bin/hdp-kill
91
+ - bin/hdp-ls
92
+ - bin/hdp-mkdir
93
+ - bin/hdp-mv
94
+ - bin/hdp-parts_to_keys.rb
95
+ - bin/hdp-ps
96
+ - bin/hdp-put
97
+ - bin/hdp-rm
98
+ - bin/hdp-sort
99
+ - bin/hdp-stream
100
+ - bin/hdp-stream-flat
101
+ - bin/hdp-sync
102
+ - bin/hdp-wc
103
+ - bin/md5sort
104
+ - bin/tabchar
105
+ - bin/uniqc
106
+ - bin/wu-hist
107
+ - bin/wu-lign
108
+ - bin/wu-sum
109
+ - docpages/INSTALL.textile
110
+ - docpages/LICENSE.textile
111
+ - docpages/README-wulign.textile
112
+ - docpages/UsingWukong-part1-get_ready.textile
113
+ - docpages/UsingWukong-part2-ThinkingBigData.textile
114
+ - docpages/UsingWukong-part3-parsing.textile
115
+ - docpages/_config.yml
116
+ - docpages/bigdata-tips.textile
117
+ - docpages/code/api_response_example.txt
118
+ - docpages/code/parser_skeleton.rb
119
+ - docpages/diagrams/MapReduceDiagram.graffle
120
+ - docpages/favicon.ico
121
+ - docpages/gem.css
122
+ - docpages/hadoop-tips.textile
123
+ - docpages/index.textile
124
+ - docpages/intro.textile
125
+ - docpages/moreinfo.textile
126
+ - docpages/news.html
127
+ - docpages/pig/PigLatinExpressionsList.txt
128
+ - docpages/pig/PigLatinReferenceManual.html
129
+ - docpages/pig/PigLatinReferenceManual.txt
130
+ - docpages/tutorial.textile
131
+ - docpages/usage.textile
132
+ - docpages/wutils.textile
74
133
  - examples/README.txt
75
- - examples/and_pig/sample_queries.rb
76
134
  - examples/apache_log_parser.rb
77
135
  - examples/count_keys.rb
78
136
  - examples/count_keys_at_mapper.rb
@@ -94,28 +152,6 @@ files:
94
152
  - examples/size.rb
95
153
  - examples/word_count.rb
96
154
  - lib/wukong.rb
97
- - lib/wukong/and_pig.rb
98
- - lib/wukong/and_pig/README.textile
99
- - lib/wukong/and_pig/as.rb
100
- - lib/wukong/and_pig/data_types.rb
101
- - lib/wukong/and_pig/functions.rb
102
- - lib/wukong/and_pig/generate.rb
103
- - lib/wukong/and_pig/generate/variable_inflections.rb
104
- - lib/wukong/and_pig/junk.rb
105
- - lib/wukong/and_pig/operators.rb
106
- - lib/wukong/and_pig/operators/compound.rb
107
- - lib/wukong/and_pig/operators/evaluators.rb
108
- - lib/wukong/and_pig/operators/execution.rb
109
- - lib/wukong/and_pig/operators/file_methods.rb
110
- - lib/wukong/and_pig/operators/foreach.rb
111
- - lib/wukong/and_pig/operators/groupies.rb
112
- - lib/wukong/and_pig/operators/load_store.rb
113
- - lib/wukong/and_pig/operators/meta.rb
114
- - lib/wukong/and_pig/operators/relational.rb
115
- - lib/wukong/and_pig/pig_struct.rb
116
- - lib/wukong/and_pig/pig_var.rb
117
- - lib/wukong/and_pig/symbol.rb
118
- - lib/wukong/and_pig/utils.rb
119
155
  - lib/wukong/bad_record.rb
120
156
  - lib/wukong/boot.rb
121
157
  - lib/wukong/datatypes.rb
@@ -163,10 +199,8 @@ files:
163
199
  - spec/bin/hdp-wc_spec.rb
164
200
  - spec/spec_helper.rb
165
201
  - wukong.gemspec
166
- - LICENSE.textile
167
- - README.textile
168
202
  has_rdoc: true
169
- homepage: http://github.com/mrflip/wukong
203
+ homepage: http://mrflip.github.com/wukong
170
204
  licenses: []
171
205
 
172
206
  post_install_message:
@@ -196,7 +230,6 @@ summary: Wukong makes Hadoop so easy a chimpanzee can use it.
196
230
  test_files:
197
231
  - spec/bin/hdp-wc_spec.rb
198
232
  - spec/spec_helper.rb
199
- - examples/and_pig/sample_queries.rb
200
233
  - examples/apache_log_parser.rb
201
234
  - examples/count_keys.rb
202
235
  - examples/count_keys_at_mapper.rb