wukong 3.0.0.pre → 3.0.0.pre2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,283 +0,0 @@
1
- ---
2
- layout: default
3
- title: mrflip.github.com/wukong - Tutorial
4
- collapse: false
5
- ---
6
-
7
- h1(gemheader). Tutorial by Examples
8
-
9
-
10
- <notextile><div class="toggle"></notextile>
11
-
12
- h2(#wordcount). Count Words
13
-
14
- Here's a script to count words in a text stream:
15
-
16
- {% highlight ruby %}
17
- require 'wukong'
18
- module WordCount
19
- class Mapper < Wukong::Streamer::LineStreamer
20
- # Emit each word in the line.
21
- def process line
22
- words = line.strip.split(/\W+/).reject(&:blank?)
23
- words.each{|word| yield [word, 1] }
24
- end
25
- end
26
-
27
- class Reducer < Wukong::Streamer::ListReducer
28
- def finalize
29
- yield [ key, values.map(&:last).map(&:to_i).sum ]
30
- end
31
- end
32
- end
33
-
34
- Wukong::Script.new(
35
- WordCount::Mapper,
36
- WordCount::Reducer
37
- ).run # Execute the script
38
- {% endhighlight %}
39
-
40
- The first class, the Mapper, eats lines and craps @[word, count]@ records. Here
41
- the /key/ is the word, and the /value/ is its count.
42
-
43
- The second class is an example of an accumulated list reducer. The values for
44
- each key are stacked up into a list; then the record(s) yielded by @#finalize@
45
- are emitted.
46
-
47
- Here's another way to write the Reducer: accumulate the count of each line, then
48
- yield the sum in @#finalize@:
49
-
50
- {% highlight ruby %}
51
- class Reducer2 < Wukong::Streamer::AccumulatingReducer
52
- attr_accessor :key_count
53
- def start! *args
54
- self.key_count = 0
55
- end
56
- def accumulate(word, count)
57
- self.key_count += count.to_i
58
- end
59
- def finalize
60
- yield [ key, key_count ]
61
- end
62
- end
63
- {% endhighlight %}
64
-
65
- Of course you can be really lazy (i.e. smart) and write your script as
66
-
67
- {% highlight ruby %}
68
- class Script < Wukong::Script
69
- def reducer_command
70
- 'uniq -c'
71
- end
72
- end
73
- {% endhighlight %}
74
-
75
- h2(#structstream). Structured data
76
-
77
- The previous example dealt with unstructured data. Wukong also lets you view your data as a stream of structured objects.
78
-
79
- Let's say you have a blog; its records look like
80
-
81
- {% highlight ruby %}
82
- Post = Struct.new( :id, :created_at, :user_id, :title, :body, :link )
83
- Comment = Struct.new( :id, :created_at, :post_id, :user_id, :body )
84
- User = Struct.new( :id, :username, :fullname, :homepage, :description )
85
- UserLoc = Struct.new( :user_id, :text, :lat, :lng )
86
- {% endhighlight %}
87
-
88
- You've been using "twitter":http://twitter.com for a long time, and you've written something that from now on will inject all your tweets as Posts, and all replies to them as Comments (by a common 'twitter_bot' account on your blog). What about the past two years' worth of tweets? Let's assume you're so chatty that a Map/Reduce script is warranted to handle the volume. (Actually, wukong makes a really nice ETL package, so this may be convenient even at small scale).
89
-
90
- Cook up something that scrapes your tweets and all replies to your tweets:
91
-
92
- {% highlight ruby %}
93
- Tweet = Struct.new( :id, :created_at, :twitter_user_id,
94
- :in_reply_to_user_id, :in_reply_to_status_id, :text )
95
- TwitterUser = Struct.new( :id, :username, :fullname,
96
- :homepage, :location, :description )
97
- {% endhighlight %}
98
-
99
- Now we'll just process all those in a big pile, converting to Posts, Comments and Users as appropriate. Serialize your scrape results so that each Tweet and each TwitterUser is a single line containing first the class name ('tweet' or 'twitter_user') followed by its constituent fields, in order, separated by tabs.
100
-
101
- The RecordStreamer takes each such line, constructs its corresponding class, and instantiates it with the remaining fields:
102
-
103
- {% highlight ruby %}
104
- require 'wukong'
105
- require 'my_blog' #defines the blog models
106
- module TwitBlog
107
- class Mapper < Wukong::Streamer::RecordStreamer
108
- # Watch for tweets by me
109
- MY_USER_ID = 24601
110
- # structs for our input objects
111
- Tweet = Struct.new( :id, :created_at, :twitter_user_id,
112
- :in_reply_to_user_id, :in_reply_to_status_id, :text )
113
- TwitterUser = Struct.new( :id, :username, :fullname,
114
- :homepage, :location, :description )
115
- #
116
- # If this is a tweet by me, convert it to a Post.
117
- #
118
- # If it is a tweet not by me, convert it to a Comment that
119
- # will be paired with the correct Post.
120
- #
121
- # If it is a TwitterUser, convert it to a User record and
122
- # a user_location record
123
- #
124
- def process record
125
- case record
126
- when TwitterUser
127
- user = MyBlog::User.new.merge(record) # grab the fields in common
128
- user_loc = MyBlog::UserLoc.new(record.id, record.location, nil, nil)
129
- yield user
130
- yield user_loc
131
- when Tweet
132
- if record.twitter_user_id == MY_USER_ID
133
- post = MyBlog::Post.new.merge record
134
- post.link = "http://twitter.com/statuses/show/#{record.id}"
135
- post.body = record.text
136
- post.title = record.text[0..65] + "..."
137
- yield post
138
- else
139
- comment = MyBlog::Comment.new.merge record
140
- comment.body = record.text
141
- comment.post_id = record.in_reply_to_status_id
142
- yield comment
143
- end
144
- end
145
- end
146
- end
147
- end
148
- Wukong::Script.new( TwitBlog::Mapper, nil ).run # identity reducer
149
- {% endhighlight %}
150
-
151
- h2(#accumulators). Accumulators
152
-
153
- h3(#uniqifying). A Uniqifying Accumulator
154
-
155
-
156
- The script above uses the identity reducer: every record from the mapper is sent
157
- to the output. But what if you had grabbed the replying user's record every time you saw a reply?
158
-
159
- You'd like to just pass it through @uniq@. But if something has changed in the interim, or if you record a timestamp for each sample, you won't be able to use the simple @uniq@ command. You'd like to just get one example for each key!
160
-
161
- Wukong includes just such a reducer, the UniqByLastReducer:
162
-
163
- {% highlight ruby %}
164
- #
165
- # UniqByLastReducer accepts all records for a given key and emits only the
166
- # last-seen.
167
- #
168
- # It acts like an insecure high-school kid: for each record of a given key
169
- # it discards whatever record it's holding and adopts this new value. When a
170
- # new key comes on the scene it emits the last record, like an older brother
171
- # handing off his Depeche Mode collection.
172
- #
173
- # For example, to extract the *latest* value for each property, emit your
174
- # records as
175
- #
176
- # [resource_type, key, timestamp, ... fields ...]
177
- #
178
- # then set :sort_fields to 3 and :partition_fields to 2.
179
- #
180
- class UniqByLastReducer < Wukong::Streamer::AccumulatingReducer
181
- attr_accessor :final_value
182
-
183
- #
184
- # Use first two fields as keys by default
185
- #
186
- def get_key *vals
187
- vals[0..1]
188
- end
189
-
190
- #
191
- # Adopt each value in turn: the last one's the one you want.
192
- #
193
- def accumulate *vals
194
- self.final_value = vals
195
- end
196
-
197
- #
198
- # Emit the last-seen value
199
- #
200
- def finalize
201
- yield final_value if final_value
202
- end
203
-
204
- #
205
- # Clear state on reset
206
- #
207
- def start! *args
208
- self.final_value = nil
209
- end
210
- end
211
- {% endhighlight %}
212
-
213
- h3(#groupby). A GroupBy Accumulator
214
-
215
- Wukong has a good collection of map/reduce patterns. For example, it's quite common to accumulate all records for a given key and emit some result based on the whole group.
216
-
217
- The AccumulatingReducer calls start! on the first record for each key, calls accumulate() on every example for that key (including the first), and calls finalize() once the last record for that key is seen.
218
-
219
- Here's an AccumulatingReducer that takes a long list of key-value pairs and emits, for each key, all its corresponding values in one line.
220
-
221
- {% highlight ruby %}
222
- #
223
- # Roll up all values for each key into a single line
224
- #
225
- class GroupByReducer < Wukong::Streamer::AccumulatingReducer
226
- attr_accessor :values
227
-
228
- # Start with an empty list
229
- def start! *args
230
- self.values = []
231
- end
232
-
233
- # Aggregate each value in turn
234
- def accumulate key, value
235
- self.values << value
236
- end
237
-
238
- # Emit the key and all values, tab-separated
239
- def finalize
240
- yield [key, values].flatten
241
- end
242
- end
243
- {% endhighlight %}
244
-
245
- So given adjacency pairs for the following directed friend graph:
246
-
247
- <pre>
248
- @jerry @elaine
249
- @elaine @jerry
250
- @jerry @kramer
251
- @kramer @jerry
252
- @kramer @bobsacamato
253
- @kramer @newman
254
- @jerry @superman
255
- @newman @kramer
256
- @newman @elaine
257
- @newman @jerry
258
- </pre>
259
-
260
- You'd end up with
261
-
262
- <pre> @elaine @jerry
263
- @jerry @elaine @kramer @superman
264
- @kramer @bobsacamato @jerry @newman
265
- @newman @elaine @jerry @kramer
266
- </pre>
267
-
268
-
269
- h2. A note about keys
270
-
271
- Now we're going to write this using the synthetic keys already extant in the
272
- twitter records, making the unwarranted assumption that they won't collide with
273
- the keys in your database.
274
-
275
- The map/reduce paradigm does badly with synthetic keys. Synthetic keys demand
276
- locality, and map/reduce's remarkable scaling comes from not assuming
277
- locality. In general, write your map/reduce scripts to use natural keys (the screen name, for example) rather than synthetic ones.
278
-
279
- h2. More...
280
-
281
- There are many useful examples (including an actually-useful version of this
282
- WordCount script) in the "examples/ directory.":http://github.com/mrflip/wukong/tree/master/examples
283
-
@@ -1,195 +0,0 @@
1
- ---
2
- layout: default
3
- title: Usage notes
4
- ---
5
-
6
- h1(gemheader). {{ site.gemname }} %(small):: usage%
7
-
8
- ** "How to run a Wukong script":#running
9
- ** "How to test your scripts":#testing
10
- ** "Wukong Plays nicely with others":#playnice
11
- ** "Schema export":#schema_export to Pig or SQL
12
- ** "Wukong's internal workflow":#workflow
13
- ** "Using wukong with internal streaming":#stayinruby
14
- ** "Using wukong to Batch-Process ActiveRecord Objects":#activerecord
15
-
16
-
17
- <notextile><div class="toggle"></notextile>
18
-
19
- h2(#running). How to run a Wukong script
20
-
21
- To run your script using local files and no connection to a hadoop cluster,
22
-
23
- pre. your/script.rb --run=local path/to/input_files path/to/output_dir
24
-
25
- To run the command across a Hadoop cluster,
26
-
27
- pre. your/script.rb --run=hadoop path/to/input_files path/to/output_dir
28
-
29
- You can set the default in the config/wukong-site.yaml file, and then just use @--run@ instead of @--run=something@ -- it will just use the default run mode.
30
-
31
- If you're running @--run=hadoop@, all file paths are HDFS paths. If you're running @--run=local@, all file paths are local paths. (The your/script path, of course, lives on the local filesystem.)
32
-
33
- You can supply arbitrary command line arguments (they wind up as key-value pairs in the options hash your mapper and reducer receive), and you can use the hadoop syntax to specify more than one input file:
34
-
35
- pre. ./path/to/your/script.rb --any_specific_options --options=can_have_vals \
36
- --run "input_dir/part_*,input_file2.tsv,etc.tsv" path/to/output_dir
37
-
38
- Note that all @--options@ must precede (in any order) all non-options.
39
-
40
- <notextile></div><div class="toggle"></notextile>
41
-
42
- h2(#testing). How to test your scripts
43
-
44
- To run the mapper on its own:
45
-
46
- pre. cat ./local/test/input.tsv | ./examples/word_count.rb --map | more
47
-
48
- or if your test data lies on the HDFS,
49
-
50
- pre. hdp-cat test/input.tsv | ./examples/word_count.rb --map | more
51
-
52
- Next graduate to running @--run=local@ mode so you can inspect the reducer.
53
-
54
- <notextile></div><div class="toggle"></notextile>
55
-
56
- h2(#playnice). Wukong Plays nicely with others
57
-
58
- Wukong is friends with "Hadoop":http://hadoop.apache.org/core the elephant, "Pig":http://hadoop.apache.org/pig/ the query language, and the @cat@ on your command line. It even has limited support for "martinis":http://datamapper.org (Datamapper) and "express trains":http://wiki.rubyonrails.org/rails/pages/ActiveRecord (ActiveRecord).
59
-
60
- * "Export Wukong classes to SQL or Pig":#schema_export -- easily bulk-load and define SQL tables, or kickstart your pig scripts
61
- * "Batch-Process records from ActiveRecord":#activerecord (the datamapper case is similar)
62
- * Cascade Mappers and Reducers "purely in ruby":#stayinruby -- reportedly useful in an "ETL":http://en.wikipedia.org/wiki/Extract,_transform,_load context.
63
-
64
- h3(#schema_export). Schema export to Pig or SQL
65
-
66
- There is preliminary support for dumping wukong classes as schemata for other tools. For example, given the following:
67
-
68
- {% highlight ruby %}
69
- require "wukong" ;
70
- require "wukong/schema"
71
- User = TypedStruct.new(
72
- [:id, Integer],
73
- [:scraped_at, Bignum],
74
- [:screen_name, String],
75
- [:followers_count, Integer],
76
- [:created_at, Bignum]
77
- );
78
- {% endhighlight %}
79
-
80
- You can make a snippet for loading into pig with @puts User.load_pig@:
81
-
82
- <pre> LOAD users.tsv AS ( rsrc:chararray, id: int, scraped_at: long, screen_name: chararray, followers_count: int, created_at: long )</pre>
83
-
84
- Export to SQL with @puts User.sql_create_table ; puts User.sql_load_mysql@:
85
-
86
- {% highlight sql %}
87
- CREATE TABLE `users` (
88
- `id` INT,
89
- `scraped_at` BIGINT,
90
- `screen_name` VARCHAR(255) CHARACTER SET ASCII,
91
- `followers_count` INT,
92
- `created_at` BIGINT
93
- ) ;
94
- ALTER TABLE `user` DISABLE KEYS;
95
- LOAD DATA LOCAL INFILE 'user.tsv'
96
- REPLACE INTO TABLE `user`
97
- COLUMNS
98
- TERMINATED BY '\t'
99
- OPTIONALLY ENCLOSED BY ''
100
- ESCAPED BY ''
101
- LINES STARTING BY 'user'
102
- ( @dummy,
103
- `id`, `scraped_at`, `screen_name`, `followers_count`, `created_at`
104
- );
105
- ALTER TABLE `user` ENABLE KEYS ;
106
- SELECT 'user', NOW(), COUNT(*) FROM `user`;
107
- {% endhighlight %}
108
-
109
- <notextile></div><div class="toggle"></notextile>
110
-
111
- h2(#workflow). Wukong's internal workflow
112
-
113
- Here's a somewhat detailed overview of a wukong script's internal workflow.
114
-
115
- # You call @./myscript.rb --run infile outfile@
116
- # Execution begins in the run method of the Script class (@wukong/script.rb@). It launches (depending on whether you're local or remote) one of
117
- ** @cat infile | ./myscript.rb --map | sort | ./myscript.rb --reduce > outfile@
118
- ** @hadoop [a_crapton_of_streaming_args] -mapper './myscript.rb --map' -reducer './myscript.rb --reduce' @
119
- # In either case, the effect is to spawn the exact same script you ran at the command line: one or more times with the --map command in place of the --run command, and one or more times with the --reduce command in place of the --run command. %(quiet)(well, unless you specify no reducers or a :map_command or something)%
120
-
121
- # With the @--map@ or @--reduce@ flag given, the Script class turns over control to the corresponding class: either @mapper_klass.new(self.options).stream@ or @reducer_klass.new(self.options).stream@
122
-
123
- When in @--map@ or @--reduce@ mode (we'll just use @--map@ as an example):
124
-
125
- # The mapper_klass is usually a subclass of @Streamer::Base@, but in actual fact it can be anything that initializes from a hash of options and responds to #stream.
126
- # The default #stream method
127
- ** calls the before_stream hook
128
- ** reads each line from stdin ; #recordizes it ; passes it (if non-nil) to #process ; and emits each object yielded by #process
129
- ** calls its after_stream hook
130
- # You typically leave #stream alone and just override #process.
131
- # The accumulator classes build on these patterns (they're proper subclasses of Streamer::Base), but are used differently. With an accumulator, you should implement some or all of
132
- ** #start! -- called at the start of each accumulation, passing in the first record for that key
133
- ** #accumulate -- called on each record (including that first one)
134
- ** #finalize -- called when the last record of this accumulation is seen.
135
- ** #get_key -- called on each record to recover its key.
136
-
137
-
138
- h3(#stayinruby). Using wukong with internal streaming
139
-
140
- If you're using wukong in local mode, you may not want to spawn new processes all over the place. Or your records may arrive not from the command line but from, say, a database call.
141
-
142
- In that case, just override #stream. The original:
143
-
144
- {% highlight ruby %}
145
- #
146
- # Pass each record to +#process+
147
- #
148
- def stream
149
- before_stream
150
- $stdin.each do |line|
151
- record = recordize(line.chomp)
152
- next unless record
153
- process(*record) do |output_record|
154
- emit output_record
155
- end
156
- end
157
- after_stream
158
- end
159
- {% endhighlight %}
160
-
161
- h3(#activerecord). Using wukong to Batch-Process ActiveRecord Objects
162
-
163
- Here's a stream method, overridden to batch-process ActiveRecord objects (untested sample code):
164
-
165
- {% highlight ruby %}
166
- class Mapper < Wukong::Streamer
167
- # Set record_klass to the ActiveRecord class you'd like to batch process
168
- cattr_accessor :record_klass
169
- # Size of each batch to pull from the database
170
- cattr_accessor :batch_size
171
-
172
- #
173
- # Grab records from the database in batches,
174
- # pass each record to +#process+
175
- #
176
- # Everything downstream of this is agnostic of the fact that
177
- # records are coming from the database and not $stdin
178
- #
179
- def stream
180
- before_stream
181
- record_klass.find_in_batches(:batch_size => batch_size ) do |record_batch|
182
- record_batch.each do |record|
183
- process(record.id, record) do |output_record|
184
- emit output_record
185
- end
186
- end
187
- end
188
- after_stream
189
- end
190
-
191
- # ....
192
- end
193
- {% endhighlight %}
194
-
195
- <notextile></div></notextile>