wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,60 @@
1
class String
  # Returns an Array holding the MatchData object of every successive,
  # non-overlapping match of +regex+ in the receiver (empty when nothing
  # matches). Unlike #scan, this keeps offsets and named captures.
  def match_all(regex)
    found = []
    scan(regex) { found << Regexp.last_match }
    found
  end
end
6
+
7
module Wukong
  module FlatPack

    # Creates a 'simple' token from the supplied string and position.
    #
    # str      - one token of a format string, e.g. "i3" or "s+".
    # position - offset of the token inside the format string.
    #
    # The token class is looked up from the type indicator; the token's
    # length is set when a numeric length was given, its modifier when a
    # modifier was given (the other one stays nil).
    def self.simple_token_from_string(str, position)
      pieces = str.match(Language::NAMED_SIMPLE_TYPE_RE)
      # Tokens lives in this namespace (Wukong::FlatPack::Tokens).
      token = Tokens.token_for_indicator(pieces[:type])
      token.position = position
      token.length   = pieces[:length] && pieces[:length].to_i
      token.modifier = pieces[:modifier]
      token
    end

    # Creates a fixed point token. Strict input formatting is
    # enforced if the strict param is true.
    #
    # The token's power is nil when the format has no exponent part.
    def self.fixed_point_token_from_string(str, position, strict=true)
      pieces = str.match(Language::NAMED_FIXED_POINT_RE)
      token = Tokens::FixedPointToken.new
      token.position = position
      token.strict   = strict
      token.power    = pieces[:power] && pieces[:power].to_i
      token.length   = pieces[:length].to_i
      token
    end

    # Validates the supplied format string and creates a parser from it.
    #
    # str                - the format string.
    # delimiter_width    - when non-zero, an ignore token of this width is
    #                      placed between consecutive fields.
    # strict_fixed_point - passed on to every fixed point token.
    #
    # Returns nil when str is not in the format language, otherwise a Parser.
    #
    # NOTE: date tokens are recognised but not implemented yet: no token is
    # emitted for them, although the delimiter after them still is.
    def self.create_parser(str, delimiter_width=0, strict_fixed_point=true)
      return nil unless Language.string_in_lang str
      delimited = delimiter_width != 0
      lang = []
      str.match_all(Language::CAPTURE_TOKEN_RE).each do |match|
        token_str = match[0]
        case token_str
        when Language::TOTAL_SIMPLE_TYPE_RE
          lang << simple_token_from_string(token_str, match.begin(0))
        when Language::TOTAL_FIXED_POINT_RE
          lang << fixed_point_token_from_string(token_str, match.begin(0), strict_fixed_point)
        when Language::TOTAL_DATE_RE
          #TODO: Implement
        end
        next unless delimited
        separator = Tokens::IgnoreToken.new
        separator.position = -1
        separator.length = delimiter_width
        lang << separator
      end
      lang.pop if delimited # drop the delimiter after the last field
      Parser.new(lang)
    end
  end
end
@@ -0,0 +1,4 @@
1
+ require 'flat'
2
+ require 'lang'
3
+ require 'parser'
4
+ require 'tokens'
@@ -0,0 +1,46 @@
1
module Wukong
  module FlatPack
    # Grammar of the format strings understood by FlatPack, expressed as
    # regular-expression fragments (Strings) and compiled Regexps.
    module Language

      #language definition

      # One-letter indicators of the simple field types.
      SIMPLE_TYPES = %w{i f s b _}.freeze
      SIMPLE_TYPE_RE = "[#{SIMPLE_TYPES.join}]".freeze

      # Repetition modifiers that may follow a simple type instead of a length.
      MODIFIERS = %w{+ *}.freeze
      MODIFIER_RE = "[#{MODIFIERS.join}]".freeze

      # A simple type optionally followed by a modifier or a numeric length.
      SIMPLE_TOKEN_RE = "#{SIMPLE_TYPE_RE}(?:#{MODIFIER_RE}|[0-9]+)?".freeze

      # Letters allowed between the two '%' of a date token
      # (presumably strftime-style directives -- confirm).
      DATE_TYPES = %w{a A b B c d H I j m M p S U w W x X Y Z}.freeze
      DATE_TYPES_RE = "[#{DATE_TYPES.join} ]".freeze # the extra space is supposed to be there
      DATE_TOKEN_RE = "%#{DATE_TYPES_RE}*%".freeze

      # Fixed point: 'D', a length, and an optional 'e' followed by a power.
      FIXED_POINT_TYPE = 'D'.freeze
      FIXED_POINT_SEP = 'e'.freeze
      FIXED_POINT_TOKEN_RE = "#{FIXED_POINT_TYPE}\\d+(?:#{FIXED_POINT_SEP}\\d+)?".freeze

      TOKENS = [SIMPLE_TOKEN_RE, DATE_TOKEN_RE, FIXED_POINT_TOKEN_RE].freeze
      TOKEN_RE = TOKENS.join('|').freeze
      CAPTURE_TOKEN_RE = /(#{TOKEN_RE})/

      # A whole format string: one or more tokens, each optionally followed by
      # spaces. Anchored with \A / \Z (not ^ / $) so a multi-line string cannot
      # pass on the strength of a single valid line; \Z still tolerates one
      # trailing newline.
      LANGUAGE_RE = /\A(?:(#{TOKEN_RE}) *)+\Z/

      #total regexes, i.e. regexes that must match the whole string
      TOTAL_SIMPLE_TYPE_RE = /\A#{SIMPLE_TOKEN_RE}\z/
      TOTAL_FIXED_POINT_RE = /\A#{FIXED_POINT_TOKEN_RE}\z/
      TOTAL_DATE_RE        = /\A#{DATE_TOKEN_RE}\z/

      #named regexes used for parsing tokens
      NAMED_SIMPLE_TYPE_RE = /(?<type>#{SIMPLE_TYPE_RE})(?:(?<length>[0-9]+)|(?<modifier>#{MODIFIER_RE}))?/
      NAMED_FIXED_POINT_RE = /#{FIXED_POINT_TYPE}(?<length>\d+)(?:#{FIXED_POINT_SEP}(?<power>\d+))?/
      # The format group repeats, as in DATE_TOKEN_RE, so every date token
      # accepted by the language can also be taken apart here.
      NAMED_DATE_RE        = /%(?<format>#{DATE_TYPES_RE}*)%/

      # Returns true if the supplied string is in
      # Flat's formatting language, as determined
      # by the LANGUAGE_RE regex. Returns false for nil.
      def self.string_in_lang(str)
        !(str =~ LANGUAGE_RE).nil?
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
module Wukong
  module FlatPack
    # Parses flat (fixed-layout) lines. A parser is built from an ordered list
    # of tokens; each token supplies a regex fragment (#re) and a conversion
    # (#translate). The fragments are concatenated into one anchored regex
    # with one capture group per token.
    class Parser
      attr_accessor :re
      attr_accessor :lang

      # lang - ordered Array of token objects responding to #re and #translate.
      def initialize(lang)
        @lang = lang
        @re = re_from_language @lang
      end

      # returns true if the supplied string is in the parser's language
      def string_in_lang? str
        !(str =~ @re).nil?
      end

      # Creates a regular expression from the supplied language: every
      # token's fragment becomes one capture group, in order.
      def re_from_language lang
        groups = lang.map { |token| "(#{token.re})" }
        Regexp.new("^#{groups.join}$")
      end

      # Splits str into its fields and converts each one with the matching
      # token's #translate.
      #
      # str  - the line to parse.
      # trim - when true, String fields are stripped of surrounding whitespace.
      #
      # Returns an Array of converted fields with every :ignore value removed,
      # or nil when str does not match the parser's regex.
      def parse(str,trim=false)
        matched = @re.match(str) # single match serves as both check and capture
        return nil if matched.nil?
        fields = matched.captures.each_with_index.map do |val, index|
          field = lang[index].translate(val)
          field.strip! if trim && field.is_a?(String)
          field
        end
        fields - [:ignore]
      end

      # Converts every line of in_filename to TSV and appends the result to
      # out_filename. Both files are closed (and the output flushed) when the
      # method returns or raises.
      #
      # Raises ArgumentError (from #line_to_tsv) on the first line that does
      # not match the parser's format.
      def file_to_tsv(in_filename,out_filename,trim=true)
        File.open(in_filename,'r') do |infile|
          File.open(out_filename,'a') do |outfile|
            infile.each_line do |line|
              outfile.write(line_to_tsv(line,trim))
            end
          end
        end
      end

      # Parses one line and returns its fields joined by tabs, followed by a
      # newline.
      #
      # Raises ArgumentError when the line does not match the parser's format
      # (previously this surfaced as a NoMethodError on nil).
      def line_to_tsv(line,trim=true)
        fields = parse(line,trim)
        if fields.nil?
          raise ArgumentError, "line does not match the parser's format: #{line.inspect}"
        end
        fields.join("\t") + "\n"
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
module Wukong
  module FlatPack
    # Token classes for the flat-file format language. Each token knows the
    # regex fragment that matches its field (+re+) and how to convert the
    # matched text into a Ruby value (+translate+).
    module Tokens
      # Maps an indicator string (e.g. 'i', 's') to the token class that
      # registered it through Token.indicator=.
      TOKEN_CLASSES = {}

      # Builds a new token for the given indicator.
      #
      # @param indicator [String] an indicator registered by a token class
      # @raise [ArgumentError] if no token class registered the indicator
      #   (previously this surfaced as a NoMethodError on nil)
      def self.token_for_indicator(indicator)
        token_class = TOKEN_CLASSES.fetch(indicator) do
          raise ArgumentError, "unknown token indicator: #{indicator.inspect}"
        end
        token_class.new
      end

      # Base class for all tokens.
      class Token
        attr_accessor :position
        attr_accessor :length
        attr_writer :indicator

        class << self
          # The indicator this class registered itself under, if any.
          attr_reader :indicator
        end

        # Registers the receiving class under +indicator+ in TOKEN_CLASSES.
        def self.indicator=(indicator)
          TOKEN_CLASSES[indicator] = self
          @indicator = indicator
        end

        # The indicator explicitly assigned to this instance, falling back to
        # the one its class registered (the value is stored on the class, so
        # without the fallback this reader would always be nil).
        def indicator
          @indicator || self.class.indicator
        end
      end

      # Fixed-point number: +length+ characters wide, scaled down by
      # 10**power when translated.
      class FixedPointToken < Token
        attr_accessor :power
        attr_accessor :strict

        self.indicator = 'D'

        #TODO: Allow negative powers
        # In strict mode only an optionally signed run of digits is accepted;
        # otherwise any +length+ characters match.
        def re
          strict ? "(?:(?:\\+|-)\\d{#{@length-1}}|\\d{#{@length}})" : ".{#{@length}}"
        end

        # @return [Float, nil] nil for a blank field, otherwise the value
        #   divided by 10**power (a nil power is treated as 0)
        def translate(str)
          return nil if str.strip == ""
          str.to_f / (10**(@power || 0))
        end
      end

      # Token whose width is given either by a fixed +length+ or by a regex
      # quantifier +modifier+.
      class BasicToken < Token
        attr_accessor :modifier

        # @param token [String] regex fragment matching a single character
        # @return [String] the fragment repeated per length or modifier
        def re(token = '.')
          if !@length.nil?
            "#{token}{#{@length}}"
          elsif !@modifier.nil?
            "#{token}#{@modifier}"
          else
            token
          end
        end
      end

      # Optionally signed integer.
      class IntToken < BasicToken
        self.indicator = 'i'
        RE = '(?:\+|-)?\\d'

        # With a fixed length the sign, when present, counts towards it.
        def re
          if !@length.nil?
            "(?:(?:\\+|-)\\d{#{@length-1}}|\\d{#{@length}})"
          elsif !@modifier.nil?
            "#{RE}#{@modifier}"
          else
            RE
          end
        end

        # Parses in base 10 explicitly: Integer("010") would otherwise read a
        # zero-padded field as octal and return 8.
        #
        # @return [Integer] the parsed value; text Integer() rejects falls
        #   back to String#to_i
        def translate(str)
          Integer(str, 10)
        rescue ArgumentError
          str.to_i
        end
      end

      # Raw string, passed through unchanged.
      class StringToken < BasicToken
        self.indicator = 's'

        def translate(str)
          str
        end
      end

      # Placeholder for floats; not implemented yet.
      class FloatToken < BasicToken
        self.indicator = 'f'
        #TODO: Implement floats

        def get_re
          #TODO: Implement
        end

        # NOTE(review): takes no argument, unlike every other token's
        # translate, so it cannot be called as translate(value) as-is.
        def translate
          #TODO: Implement
        end
      end

      # Single-character boolean flag.
      class BoolToken < BasicToken
        self.indicator = 'b'
        TRUE_TOKENS = ['t','y','1']
        FALSE_TOKENS = ['f','n','0']

        #TODO: Add back multi-char options and think through allowing padding
        #TODO: Allow users to override true and false

        # Alternation of every accepted flag in lower and upper case.
        def re
          alternatives = TRUE_TOKENS + TRUE_TOKENS.map(&:upcase) +
                         FALSE_TOKENS + FALSE_TOKENS.map(&:upcase)
          "(?:#{alternatives.join('|')})"
        end

        # @return [true, false, nil] nil when +str+ is not a known flag
        def translate(str)
          flag = str.downcase
          if TRUE_TOKENS.include?(flag)
            true
          elsif FALSE_TOKENS.include?(flag)
            false
          end
        end
      end

      # Field that is matched but left out of the parsed output.
      class IgnoreToken < BasicToken
        self.indicator = '_'

        # ignore symbols are removed from the final output
        def translate(str)
          :ignore
        end
      end
    end
  end
end
@@ -1,142 +1,88 @@
1
- require 'vayacondios-client'
2
-
3
- Settings.define :monitor_interval, :default => 50_000, :type => Integer
1
+ require 'log4r'
4
2
 
5
3
  module Wukong
6
- class ProcessorError < StandardError ; end
7
-
8
- class Processor < Hanuman::Action
9
- include Hanuman::IsOwnInputSlot
10
- include Hanuman::IsOwnOutputSlot
4
+ # Stage builder specialised for processors (subclass of Hanuman::StageBuilder).
+ class ProcessorBuilder < Hanuman::StageBuilder
5
+ # Picks the namespace: the first argument when it is a Class,
+ # otherwise Wukong::Processor.
+ def namespace(*args)
6
+ args.first.is_a?(Class) ? args.first : Wukong::Processor
7
+ end
8
+ end
11
9
 
12
- field :name, Symbol, :default => ->{ self.class.handle }
13
- field :count, Integer, doc: 'Number of records seen this run', default: 0
10
+ # The Processor is the basic unit of computation in Wukong. A
11
+ # processor can be thought of as an arbitrary function that takes
12
+ # certain inputs and produces certain (or no) outputs.
13
+ #
14
+ # A Processor can be written and tested purely in Ruby and on your
15
+ # local machine. You can glue processors together
16
+ class Processor < Hanuman::Stage
17
+
18
+ field :action, Whatever
19
+ field :log, Whatever, :default => -> { log = Log4r::Logger.new(self.class.to_s) ; log.outputters = Log4r::StdoutOutputter.new('stdout', formatter: Log4r::PatternFormatter.new(pattern: "%d [%l] %c: %m")) ; log }
20
+ field :notifier, Vayacondios::NotifierFactory, :default => Vayacondios.default_notifier
14
21
 
15
- # override this in your subclass
16
- def process(record)
22
+ def self.describe desc
23
+ @description = desc
17
24
  end
18
25
 
19
- # passes a record on down the line
20
- def emit(record)
21
- self.count += 1
22
- if (count % Settings.monitor_interval.to_i == 0)
23
- log.info "emit\t%-23s\t%-47s\t%s" % [self.class, self.inspect, record.inspect]
24
- end
25
- output.process(record)
26
- rescue Wukong::ProcessorError
27
- raise
28
- rescue StandardError => err
29
- next_block = output.name rescue "(bad stage)"
30
- log.warn "#{self}: error emitting #{next_block}: #{err.message}"
31
- raise Wukong::ProcessorError, err.message, err.backtrace
26
+ def self.description
27
+ @description
32
28
  end
33
29
 
34
- def bad_record(*args)
35
- BadRecord.make(*args)
30
+ def self.consumes label
36
31
  end
37
32
 
38
- def self.register_processor(name=nil, &block)
39
- register_action(name, &block)
33
+ def self.produces label
40
34
  end
35
+
36
+ # This is a placeholder method intended to be overridden
37
+ def perform_action(*args) ; end
41
38
 
42
- include Vayacondios::Notifications
43
-
44
- class_attribute :log
45
- self.log = Log
46
-
47
- config :error_handler, Vayacondios::NotifierFactory, :default => ->{ Vayacondios::NotifierFactory.receive(type: 'log', log: self.log) }
48
-
49
- def bad_record(record, options = {})
50
- error_handler.notify(record, options.merge(level: 'error'))
39
+ # The action attribute is turned into the perform action method
40
+ def receive_action(action)
41
+ self.define_singleton_method(:perform_action, &action)
51
42
  end
52
- end
53
43
 
54
- class AsIs < Processor
55
- # accepts records, emits as-is
56
- def process(*args)
57
- emit(*args)
44
+ # Valid notifier types are currently :http or :log
45
+ # This processor's log is passed to vayacondios
46
+ def receive_notifier(type)
47
+ if type.is_a?(Hash)
48
+ @notifier = Vayacondios::NotifierFactory.receive({type: 'log'}.merge(type))
49
+ else
50
+ @notifier = Vayacondios::NotifierFactory.receive(type: type, log: log)
51
+ end
58
52
  end
59
- register_processor
60
- end
61
-
62
- class Null < Processor
63
- self.register_processor
64
53
 
65
- # accepts records, emits none
66
- def process(*)
67
- # ze goggles... zey do nussing!
54
+ # Send information to Vayacondios; data goes in, the right thing happens
55
+ def notify(topic, cargo)
56
+ notifier.notify(topic, cargo)
68
57
  end
69
- end
70
58
 
71
- #
72
- # Foreach calls a block on every record, and depends on the block to call
73
- # emit. You can emit one record, many records, or no records, and with any
74
- # contents. If you'll always emit exactly one record out per record in,
75
- # you may prefer Wukong::Widget::Map.
76
- #
77
- # @example regenerate a wordbag with counts matching the original
78
- # foreach{|rec| rec.count.times{ emit(rec.word) } }
79
- #
80
- # @see Project
81
- # @see Map
82
- class Foreach < Processor
83
- self.register_processor
84
-
85
- # @param [Proc] proc used for body of process method
86
- # @yield ... or supply it as a &block arg.
87
- def initialize(prc=nil, &block)
88
- prc ||= block or raise "Please supply a proc or a block to #{self.class}.new"
89
- define_singleton_method(:process, prc)
90
- end
91
-
92
- def self.make(workflow, *args, &block)
93
- obj = new(*args, &block)
94
- workflow.add_stage obj
95
- obj
59
+ # This method is called after the processor class has been instantiated
60
+ # but before any records are given to it to process
61
+ def setup
96
62
  end
97
- end
98
-
99
- #
100
- # Evaluates the block and emits the result if non-nil
101
- #
102
- # @example turn a record into a tuple
103
- # map{|rec| rec.attributes.values }
104
- #
105
- # @example pass along first matching term, drop on the floor otherwise
106
- # map{|str| str[/\b(love|hate|happy|sad)\b/] }
107
- #
108
- class Map < Processor
109
- self.register_processor
110
- attr_reader :blk
111
63
 
112
- # @param [Proc] proc to delegate for call
113
- # @yield if proc is omitted, block must be supplied
114
- def initialize(blk=nil, &block)
115
- @blk = blk || block or raise "Please supply a proc or a block to #{self.class}.new"
64
+ # This method is called once per record
65
+ # Override this in your subclass
66
+ def process(record, &emit)
67
+ yield record
116
68
  end
117
69
 
118
- def process(*args)
119
- result = blk.call(*args)
120
- emit result unless result.nil?
70
+ # This method is called to signal the last record has been
71
+ # received but that further processing may still be done, events
72
+ # still be yielded, &c.
73
+ #
74
+ # This can be used within an aggregating processor (like a reducer
75
+ # in a map/reduce job) to start processing the final aggregate of
76
+ # records since the "last record" has already been received.
77
+ def finalize
121
78
  end
122
79
 
123
- def self.make(workflow, *args, &block)
124
- obj = new(*args, &block)
125
- workflow.add_stage obj
126
- obj
80
+ # This method is called after all records have been processed. It
81
+ # signals that processing should stop.
82
+ #
83
+ # By default it does nothing; override it in your subclass.
84
+ def stop
127
85
  end
128
- end
129
86
 
130
- #
131
- # Flatten emits each item in an enumerable as its own record
132
- #
133
- # @example turn a document into all its words
134
- # input > map{|line| line.split(/\W+/) } > flatten > output
135
- class Flatten < Processor
136
- self.register_processor
137
-
138
- def process(iter)
139
- iter.each{|*args| emit(*args) }
140
- end
141
87
  end
142
88
  end