wukong 3.0.0.pre → 3.0.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,90 +0,0 @@
1
- job_201006200508_0002 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b 100.00%
2
- s3 => hdfs bz2 parser, cond_em empty (?)
3
- 201006200508_0002 35mins, 34sec 1 1812031232 0 12495736645 7240978546 8180472 388863907 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
4
-
5
- job_201006200508_0003 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes 100.00%
6
- s3 => hdfs bz2 parser, cond_em duplicate
7
- 201006200508_0003 15mins, 50sec 1 1812031232 0 11877866580 7240978546 8180472 383928615 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
8
-
9
- job_201006200508_0004 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2 100.00%
10
- hdfs => hdfs bz2 parser, cond_em empty
11
- 201006200508_0004 36mins, 56sec 1 1812031232 13334645497 7240978546 8180472 395564272 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
12
-
13
- job_201006200508_0005 NORMAL flip parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em 100.00%
14
- hdfs => hdfs bz2 parser, no_cond_em --
15
- 201006200508_0005 35mins, 23sec 1 1812031232 13479823318 7240978546 8180472 396757046 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
16
-
17
- job_201006200508_0006 NORMAL flip hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111 100.00%
18
- hdfs => hdfs bz2 `which cat`
19
- 201006200508_0006 1mins, 10sec 1 1812031232 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
20
-
21
- job_201006200508_0007 NORMAL flip hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n 100.00%
22
- s3 => hdfs bz2 `which cat`
23
- 201006200508_0007 1mins, 55sec 1 1812031232 0 7240978549 7240978546 8180472 8180472 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
24
-
25
- job_201006200508_0008 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db 100.00%
26
- hdfs => hdfs flat parser no cond_em no db
27
- 201006200508_0008 10mins, 59sec 1 7240978549 13545881166 7240978549 8180472 397172723 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
28
-
29
- job_201006200508_0015 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db 100.00%
30
- hdfs => hdfs flat parser cond_em on users only no DB
31
- 201006200508_0015 23mins, 48sec 1 7240978549 13415414554 7240978549 8180472 396101235 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
32
-
33
- job_201006200508_0016 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-nodupes 100.00%
34
- hdfs => hdfs flat parser cond_em on users only - vanished saving id/sn to DB
35
- 201006200508_0016 28mins, 7sec 1 0 7240978549 13414285504 7240978549 8180472 396091251 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
36
-
37
- job_201006200508_0017 NORMAL flip parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes 100.00%
38
- hdfs => hdfs flat parser cond_em on users only - duped saving id/sn to DB
39
- 201006200508_0017 11mins, 51sec 1 0 7240978549 12221205449 7240978549 8180472 386114331 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
40
-
41
- ===========================================================================
42
- == Parse
43
- ==
44
-
45
- job_201006200508_0018 NORMAL flip parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056 100.00%
46
- 201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 141729936525 128606199040 14198839 3918844056 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
47
-
48
- for foo in 0016 0017 0018 ; do echo $foo ; ~/ics/hadoop/chimpmark/bin/elephantscat.rb job_201006200508_$foo ; done
49
- cat ~/timings/job/201006200508/*/*.tsv | wu-lign
50
-
51
- job_id scraped_at run_time succ? s3n_in hdfs_in file_in hdfs_out file_out map_in map_out map_recs_in map_recs_out red_recs_in red_recs_out job_name
52
- 201006200508_0002 35mins, 34sec 1 1812031232 0 0 12495736645 0 7240978546 0 8180472 388863907 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b
53
- 201006200508_0003 15mins, 50sec 1 1812031232 0 0 11877866580 0 7240978546 0 8180472 383928615 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-dupes
54
- 201006200508_0004 36mins, 56sec 1 1812031232 0 13334645497 0 7240978546 0 8180472 395564272 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2
55
- 201006200508_0005 35mins, 23sec 1 1812031232 0 13479823318 0 7240978546 0 8180472 396757046 0 0 parse_twitter_api_requests.rb---/data/ripd/com.tw/com.twitter/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-from_hdfs_bz2_no_cond_em
56
- 201006200508_0006 1mins, 10sec 1 1812031232 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111
57
- 201006200508_0007 1mins, 55sec 1 1812031232 0 0 7240978549 0 7240978546 0 8180472 8180472 0 0 hdp-stream-flat-/bin/cat-s3n://monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2009111*-/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111-from_s3n
58
- 201006200508_0008 10mins, 59sec 1 7240978549 0 13545881166 0 7240978549 0 8180472 397172723 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-no_cond_em-no_db
59
- 201006200508_0015 23mins, 48sec 1 7240978549 0 13415414554 0 7240978549 0 8180472 396101235 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-no_db
60
- 201006200508_0016 28mins, 7sec 1 7240978549 0 13414285504 0 7240978549 0 8180472 396091251 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
61
- 201006200508_0017 11mins, 51sec 1 7240978549 0 12221205449 0 7240978549 0 8180472 386114331 0 0 parse_twitter_api_requests.rb---/data/sn/tw/ripd/com.tw/com.twitter-flat/2009111---/data/sn/tw/rawd/parsed/20091110-20091119b-hdfs-flat-cond_em_users-yes_db-dupes
62
- 201006200508_0018 11hrs, 12mins, 43sec 1 25560337747 0 0 141729936525 0 128606199040 0 14198839 3918844056 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/2010056---/data/sn/tw/rawd/parsed/2010056
63
- 201006200508_0021 8hrs, 50mins, 52sec 1 141779023755 62208536220 24722859867 73825391771 141729936525 189098533358 3918844056 3918844056 155139258 155139258 Unsplicer
64
- 201006200508_0029 1mins, 20sec 1 1763173995 0 1762322014 0 1762322014 0 22764940 22764940 0 0 hdp-stream-flat-/bin/cat-/data/sn/tw/rawd/unspliced/twitter_user-/tmp/foo
65
- 201006200508_0031 3hrs, 48mins, 6sec 1 14930014182 0 0 48106164389 0 113092707367 0 8408164 753481311 0 0 parse_twitter_api_requests.rb---s3n//monkeyshines.infochimps.org/data/ripd/com.tw/com.twitter/201004---/data/sn/tw/rawd/parsed/api/201004
66
- 201006200508_0034 30mins, 46sec 1 7170990599 2203578261 8389754083 5031160348 7170990599 7170990510 143461243 143461241 143461241 67443309 bulk_load_conversation.rb---/data/sn/tw/fixd/objects/a_replies_b---/data/sn/tw/fixd/apeyeye/conversation/a_replies_b_json
67
-
68
- Identity mapper Wukong `which cat` pig
69
- Identity reducer wukong `which cat` pig
70
- * no skew
71
- * data/reducer > ram
72
-
73
- Do a sort|uniq on 150GB
74
-
75
-
76
- * 1.8 GB bz2, S3 => HDFS 1m55s
77
- * 1.8 GB bz2, HDFS => HDFS 1m10s
78
-
79
- TokyoTyrant, 1 node => 4 m1.large (Balancer) 15_000 inserts/sec
80
- TokyoTyrant, 20 tasks => 4 m1.large (Balancer) 2_000 inserts/sec
81
-
82
- ===========================================================================
83
-
84
- Parse:
85
-
86
- hdp-du s3n://monkeyshines.infochimps.org/data/ripd/com.tw/\*/ > /mnt/tmp/ripd_com.tw-du.tsv
87
-
88
-
89
-
90
- 1050 entries 448483502374 417.7 GB
@@ -1,65 +0,0 @@
1
- ---
2
- layout: default
3
- title: mrflip.github.com/wukong - wu-lign utility
4
- collapse: false
5
- ---
6
-
7
- h1. wu-lign -- format a tab-separated file as aligned columns
8
-
9
- wu-lign will intelligently reformat a tab-separated file into a tab-separated, space-aligned file that is still suitable for further processing. For example, given the log-file input
10
-
11
- <pre><code>
12
- 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
13
- 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
14
- 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
15
- 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
16
- 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
17
- 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
18
- 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
19
- </code></pre>
20
-
21
- wu-lign will reformat it to read
22
-
23
- <pre><code>
24
- 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
25
- 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
26
- 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
27
- 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
28
- 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
29
- 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
30
- 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
31
- </code></pre>
32
-
33
- The fields are still tab-delimited by exactly one tab -- only spaces are used to pad out fields. You can still use cuttab and friends to manipulate columns.
34
-
35
- wu-lign isn't intended to be smart, or correct, or reliable -- only to be useful for previewing and organizing tab-formatted files. In general @wu-lign(foo).split("\t").map(&:strip)@ *should* give output semantically equivalent to its input. (That is, the only changes should be insertion of spaces and re-formatting of numerics.) But still -- reserve its use for human inspection only.
36
-
37
- (Note: tab characters in this source code file have been converted to spaces; replace whitespace with tab in the first example if you'd like to play along at home.)
38
-
39
- h2. How it works
40
-
41
- Wu-Lign takes the first 500ish lines, splits into fields on TAB characters, and tries to guess the format (int, float, or string) for each. It builds a consensus of the width and type for corresponding columns in the chunk. If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
42
-
43
- h2. Command-line arguments
44
-
45
- You can give sprintf-style positional arguments on the command line that will be applied to the corresponding columns. (Blank args are used for placeholding and auto-formatting is still applied). So with the example above,
46
-
47
- @cat foo | wu-lign '' '' '' '%8.4e'@
48
-
49
- will format the fourth column with "%8.4e", while the first three columns and fifth-and-higher columns are formatted as usual.
50
-
51
- <pre><code>
52
- ...
53
- 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
54
- 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
55
- 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
56
- ....
57
- </code></pre>
58
-
59
- h2. Notes
60
-
61
- * Header rows: the first line is used for width alignment but not for type detection. This means that an initial row of text headers will inform column spacing but still allow a column of floats (say) to be properly aligned as floats.
62
- * It requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
63
- * It won't set columns wider than 100 chars -- this allows for the occasional super-wide column without completely breaking your screen.
64
- * For :float values, wu-lign tries to guess at the right number of significant digits to the left and right of the decimal point.
65
- * wu-lign parses only plain-jane 'TSV files': no quoting or escaping; every tab delimits a field, every newline a record.
@@ -1,17 +0,0 @@
1
- ---
2
- layout: default
3
- title: mrflip.github.com/wukong - Using Wukong and Wuclan, Part 1 - Setup
4
- collapse: false
5
- ---
6
-
7
- h1. Using Wukong and Wuclan, Part 0 - Setup
8
-
9
- Please follow the "installation and setup directions":setup.html for wukong, hadoop and a compute cluster.
10
-
11
- h1. Using Wukong and Wuclan, Part 1 - Scraping
12
-
13
- This part needs writing.
14
-
15
- Later, it will tell you how to get a large corpus of data to use in part 2.
16
-
17
- In the meantime check out http://mrflip.github.com/monkeyshines/ and http://mrflip.github.com/wuclan/ -- in particular the "Twitter Search Scraper":http://github.com/mrflip/wuclan/tree/master/examples/twitter/scrape_twitter_search/ example. We use this in production to gather and analyze tens of gigabytes of twitter conversations.
@@ -1,75 +0,0 @@
1
- ---
2
- layout: default
3
- title: mrflip.github.com/wukong - Overview
4
- collapse: false
5
- ---
6
-
7
- h1. Thinking Big Data
8
-
9
- h2. There's lots of data, Wukong and Hadoop can help
10
-
11
-
12
- There are two disruptive trends:
13
-
14
- * We're instrumenting every realm of human activity
15
- ** Conversation
16
- ** Relationships
17
- **
18
-
19
- * We have linearly scaling multiprocessing
20
- ** Old frontier computing: expensive, N log N, SUUUUUUCKS
21
- ** It's cheap, it's scalable and it's fun
22
-
23
- h2. == Map|Reduce ==
24
-
25
- h3. cat input.tsv | mapper.sh | sort | reducer.sh > output.tsv
26
-
27
- * Bobo histogram:
28
-
29
- cat twitter_users.tsv | cuttab 3 | cutc 1-6 | sort | uniq -c > histogram.tsv
30
-
31
- cat twitter_users.tsv | \
32
- cuttab 3 | # extract the date column \
33
- cutc 1-6 | # chop off all but the yearmonth \
34
- sort | # sort, to ensure locality \
35
- uniq -c > # roll up lines, along with their count \
36
- histogram.tsv # save into output file
37
-
38
-
39
- h3. Word Count
40
-
41
- mapper:
42
-
43
- # output each word on its own line
44
- @readlines.each{|line| puts line.split(/[^\w]+/) }@
45
-
46
- reducer:
47
-
48
- # every word is _guaranteed_ to land in the same place and next to its
49
- # friends, so we can just output the repetition count for each
50
- # distinct line.
51
- uniq -c
52
-
53
-
54
- h3. Word Count by Person
55
-
56
- * Partition Keys vs. Reduce Keys
57
-
58
- - reduce by [word, <total>, count] and [word, user_id, count]
59
-
60
-
61
- h2. == Global Structure ==
62
-
63
- h3. Enumerating neighborhood
64
-
65
- * adjacency list
66
-
67
- * join on center link
68
-
69
- * list of 3-paths ==
70
-
71
- h2. == Mechanics, HDFS ==
72
-
73
-
74
- x M _
75
- _ M y
@@ -1,138 +0,0 @@
1
- ---
2
- layout: default
3
- title: mrflip.github.com/wukong - Using Wukong and Wuclan, Part 3 - Parsing
4
- collapse: false
5
- ---
6
-
7
- h1. Using Wukong and Wuclan - Parsing
8
-
9
- In part 1 we began a scraper to trawl our desired part of the social web. Now
10
- we're ready to start using Wukong to process the files.
11
-
12
- Files come off the wire as
13
-
14
- :url :scraped_at :response_code :response_message :contents
15
- String DateTime (flat) Integer String String (JSON-formatted, tab&newline-munged)
16
-
17
- The contents field is a JSON-formatted mix of records:
18
-
19
- * TwitterFollowersRequest and TwitterFriendsRequest yield an @Array[Hash{user => raw_tweet}]@. We want to extract a stream of AFollowsB (with the request user as user_a for a friends request and user_b for a followers request) along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
20
- * TwitterFavoritesRequest yields an array of @Array[Hash{tweet_hash => user_hash}]@. We want to extract a stream of AFavoritesB along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
21
- * TwitterUser yields a single @user_hash@ making one each of TwitterUser, TwitterUserProfile and TwitterUserStyle.
22
- * UserTimelineRequest and PublicTimelineRequest yield an Array[Hash{tweet => user}]. We want to extract the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
23
- * TwitterFollowerIdsRequest and TwitterFriendIdsRequest return an Array[user_ids] (each user_id is a simple Integer). We extract a series of AFollowsB (using the request's user_id as user_a_id or user_b_id)
24
-
25
- We want to split each API response into a stream of those TwitterUser, Tweet, etc. records.
26
-
27
- # Stream in each line (each line holds one request)
28
- # turn the line into the corresponding TwitterRequest
29
- # have the TwitterRequest parse its JSON contents and construct the TwitterUser, Tweet, etc.
30
- # serialize those records back out as tab-separated lines suitable for further processing with Wukong
31
-
32
- h4. The basics of StructStreamer
33
-
34
- Wukong handles the first and last steps through its StructStreamer and the standard .to_flat method. So the actual structure is really simple:
35
-
36
- #
37
- # Instantiate each incoming request.
38
- # Stream out the contained classes it generates.
39
- #
40
- class TwitterRequestParser < Wukong::Streamer::StructStreamer
41
- def process request
42
- request.parse do |obj|
43
- yield obj
44
- end
45
- end
46
- end
47
-
48
- # This makes the script go.
49
- Wukong::Script.new(TwitterRequestParser, nil).run
50
-
51
- In practice, all you need to know is that a StructStreamer gets a stream of objects to parse. Here's an outline of its internals. The Wukong StructStreamer:
52
-
53
- # takes each flattened line:
54
-
55
- "twitter_friends_request http://.... 20090701123456 ...fields... [{...}, {...}, ...json..., {...}]"
56
-
57
- # splits by tabs to create an array of fields
58
-
59
- ["twitter_friends_request", "http://...", ... "[{...}, {...}, ...json..., {...}]"]
60
-
61
- # constructs the class name indicated in the first field,
62
- using the values extracted from the remaining fields.
63
-
64
- TwitterFriendsRequest.new "http://...", "20090701123456", ... "[{...}, {...}, ...json..., {...}]"
65
-
66
- The last (contents) field is still just a string: there's nothing special about it to Wukong.
67
-
68
- h4. Parsing
69
-
70
- Since each request's contents are handled in a slightly (and brittle-ly) different manner, we just ask each request object to parse itself and feed out all the TwitterXXXX objects it generates.
71
-
72
- class TwitterFollowersRequest
73
- # ...
74
-
75
- def parse &block
76
- return unless healthy?
77
- # for each raw user/tweet pair in the parsed JSON contents,
78
- parsed_contents.each do |hsh|
79
- json_obj = JsonUserWithTweet.new(hsh, 'scraped_at' => scraped_at)
80
- next unless json_obj && json_obj.healthy?
81
- # Extract user, tweet and relationship
82
- yield AFollowsB.new(json_obj.user.id, self.twitter_user_id) if json_obj.user
83
- json_obj.each(&block)
84
- end
85
- end
86
-
87
- # ...
88
- end
89
-
90
- The TwitterXXXRequest objects consist of one or many hashes with (a raw user hash, and possibly its latest raw tweet hash) or (a raw tweet hash and its raw user hash). The user hash might have only the fields for a TwitterPartialUser or it might have the fields for a full set of TwitterUser, TwitterUserProfile, TwitterUserStyle. Besides which, the fields themselves need some massaging to be compatible with Wukong and other tools in our Map/Reduce toolkit (details explained in a later section).
91
-
92
- The fiddly little details are handled by a JsonUserWithTweet or JsonTweetWithUser (as appropriate) adapter pattern:
93
-
94
- class JsonUserTweetPair
95
- def initialize raw, moreinfo
96
- # clean up fields in entries (flatten date, true/false -> 1/0, etc)
97
- fix_raw_user!
98
- fix_raw_tweet!
99
- end
100
-
101
- # generate all the contained TwitterXXX objects
102
- def each
103
- #
104
- end
105
-
106
- # create TwitterUser object from raw info
107
- def user
108
- end
109
- # create Tweet object from raw tweet hash
110
- def tweet
111
- end
112
- # ... and so forth
113
- end
114
-
115
- I'll ignore the gory details; view the source if you're interested.
116
-
117
-
118
- h4. Running the script
119
-
120
- Here, again, is the code (in full!) for the twitter_request_parser.rb script.
121
-
122
- #
123
- # Instantiate each incoming request.
124
- # Stream out the contained classes it generates.
125
- #
126
- class TwitterRequestParser < Wukong::Streamer::StructStreamer
127
- def process request
128
- request.parse do |obj|
129
- yield obj
130
- end
131
- end
132
- end
133
-
134
- # This makes the script go.
135
- Wukong::Script.new(TwitterRequestParser, nil).run
136
-
137
- That last line is the runner: it makes this a Wukong script with a map phase only. (We'll add in a reducer later on.)
138
-
@@ -1,39 +0,0 @@
1
- ---
2
- permalink: ":year-:month/:title.html"
3
- markdown: rdiscount
4
- pygments: true
5
- auto: true
6
- server: true
7
- server_port: 4000
8
- maruku:
9
- use_tex: false
10
- use_divs: false
11
- png_dir: images/latex
12
- png_url: /images/latex
13
-
14
- header_ref: '.html' # .html for subdirs, / for main.
15
- assets_path: '/' # http://github.mrflip.com
16
-
17
- gemuser: mrflip
18
- gemname: wukong
19
- gemversion: 0.1.1
20
- title: mrflip.github.com/wukong
21
-
22
- keywords: [ 'wukong,hadoop,ruby,mrflip,infochimps,map,reduce,streaming,dumbo,happy,mrtoolkit,script,simple' ]
23
- description: "Wukong: Hadoop made so easy a Chimpanzee could run it."
24
- header_files:
25
- - INSTALL
26
- - LICENSE
27
- - usage
28
- - wutils
29
- - moreinfo
30
- - tutorial
31
-
32
- credits:
33
- <p>Wukong image courtesy
34
- <a href="http://www.curtbusse.com/okavango/page1/oka1.html">Curt Busse</a> under
35
- an <a href="http://www.curtbusse.com/copyright.html">open license</a>.
36
- It's a Chacma Baboon from the Okavango site. Make sure to read the
37
- <a href="http://www.curtbusse.com/okavango/page1/oka1.html#note3">story at the bottom of that page</a>.
38
- </p>
39
-
@@ -1,56 +0,0 @@
1
- * Spec: http://avro.apache.org/docs/current/spec.html
2
- * Jira: https://issues.apache.org/jira/browse/AVRO
3
- * Wiki: https://cwiki.apache.org/confluence/display/AVRO/Index
4
-
5
- * http://github.com/phunt/avro-rpc-quickstart
6
-
7
- * http://lucene.apache.org/java/2_4_0/fileformats.html#VInt -- types
8
- * http://code.google.com/apis/protocolbuffers/docs/encoding.html#types -- a good reference
9
- * Avro + Eventlet (Python evented code): http://unethicalblogger.com/node/282
10
-
11
-
12
-
13
- Cassandra + Avro
14
-
15
- * Make bulk loading into Cassandra less crappy, more pluggable https://issues.apache.org/jira/browse/CASSANDRA-1278
16
- * Refactor Streaming: https://issues.apache.org/jira/browse/CASSANDRA-1189
17
- * Increment Counters: https://issues.apache.org/jira/browse/CASSANDRA-1072
18
-
19
- == From hammer's avro tools:
20
-
21
- #! /usr/bin/env python
22
-
23
- import sys
24
- from avro import schema
25
- from avro.genericio import DatumReader
26
- from avro.io import DataFileReader
27
-
28
- if __name__ == "__main__":
29
- if len(sys.argv) < 2:
30
- print "Need to at least specify an Avro file."
31
- outfile_name = sys.argv[1]
32
-
33
- message_schema = None
34
- if len(sys.argv) > 2:
35
- message_schema = schema.parse(schema.parse(sys.argv[2].encode("utf-8")))
36
-
37
- r = file(outfile_name, 'r')
38
- dr = DatumReader(expected = message_schema)
39
- dfr = DataFileReader(r, dr)
40
- for record in dfr:
41
- print record
42
- dfr.close()
43
-
44
- from binascii import hexlify
45
-
46
- def avro_hexlify(reader):
47
- """Return the hex value, as a string, of a binary-encoded int or long."""
48
- bytes = []
49
- current_byte = reader.read(1)
50
- bytes.append(hexlify(current_byte))
51
- while (ord(current_byte) & 0x80) != 0:
52
- current_byte = reader.read(1)
53
- bytes.append(hexlify(current_byte))
54
- return ' '.join(bytes)
55
-
56
-