wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,61 @@
1
+ require_relative("accumulator")
2
+
3
+ module Wukong
4
+ class Processor
5
+
6
+ # A processor which counts the total number of its input records.
7
+ #
8
+ # On its own, this widget is really just a poor man's `wc -l`.
9
+ # It's really intended to serve as a superclass for more complex
10
+ # accumulators.
11
+ #
12
+ # @example Count the total number of input records on the command-line.
13
+ #
14
+ # $ wc -l input
15
+ # 283 input
16
+ # $ cat input | wu-local count
17
+ # 283
18
+ class Count < Accumulator
19
+
20
+ # The total size of the input records.
21
+ attr_accessor :size
22
+
23
+ # Initializes the count to 0.
24
+ def setup
25
+ super()
26
+ @size = 0
27
+ end
28
+
29
+ # Accumulate a `record` by incrementing the total size.
30
+ #
31
+ # @param [Object] record
32
+ def accumulate record
33
+ self.size += 1
34
+ end
35
+
36
+ # Keeps all records in the same group so that one count is
37
+ # emitted at the end.
38
+ #
39
+ # Overriding this method and returning different keys for
40
+ # different records is the beginning of constructing a "group
41
+ # by" type widget.
42
+ #
43
+ # @param [Object] record
44
+ # @return [:__first_group__]
45
+ # @see Group
46
+ def get_key record
47
+ :__first_group__
48
+ end
49
+
50
+ # Yields the total size.
51
+ #
52
+ # @yield [size]
53
+ # @yieldparam [Integer] size
54
+ def finalize
55
+ yield self.size
56
+ end
57
+
58
+ register
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,85 @@
1
+ require_relative("../utils")
2
+ require_relative("count")
3
+
4
+ module Wukong
5
+ class Processor
6
+
7
+ # Groups sorted input records and emits each group with a count.
8
+ #
9
+ # Allows you to use several ways of extracting the key that
10
+ # defines the group.
11
+ #
12
+ # **Note:** The input records must be previously sorted by the
13
+ # same key used for grouping in order to ensure that groups are
14
+ # not split up.
15
+ #
16
+ # @example Group simple string values on the command-line.
17
+ #
18
+ # $ cat input
19
+ # apple
20
+ # cat
21
+ # banana
22
+ # apple
23
+ # ...
24
+ # $ cat input | wu-local sort | wu-local group
25
+ # apple 4
26
+ # banana 2
27
+ # cat 5
28
+ # ...
29
+ #
30
+ # @example Group using a nested key within a JSON string on the command-line
31
+ #
32
+ # $ cat input
33
+ # {"id": 1, "word": "apple" }
34
+ # {"id": 2, "word": "cat" }
35
+ # {"id": 3, "word": "banana"}
36
+ # ...
37
+ # $ cat input | wu-local sort --on=word | wu-local group --by=word
38
+ # apple 4
39
+ # banana 2
40
+ # cat 5
41
+ # ...
42
+ #
43
+ # A group fits nicely at the end of a dataflow. Since it requires
44
+ # a sort, it is blocking.
45
+ #
46
+ # @example Using a group at the end of a dataflow
47
+ #
48
+ # Wukong.dataflow(:makes_groups) do
49
+ # ... | sort(on: 'field') | group(by: 'field')
50
+ # end
51
+ #
52
+ # @see Sort
53
+ class Group < Count
54
+
55
+ include DynamicGet
56
+ field :by, Whatever
57
+
58
+ # Get the key which defines the group for this `record`.
59
+ #
60
+ # @param [Object] record
61
+ # @return [Object]
62
+ def get_key(record)
63
+ get(self.by, record)
64
+ end
65
+
66
+ # Reset the size counter for a new group.
67
+ #
68
+ # @param [Object] record
69
+ def start record
70
+ self.size = 0
71
+ end
72
+
73
+ # Yields the current group along with its size
74
+ #
75
+ # @yield [key, size]
76
+ # @yieldparam [Object] key the key defining the group
77
+ # @yieldparam [Integer] size the size of the group
78
+ def finalize
79
+ yield [key, size].map(&:to_s).join("\t")
80
+ end
81
+
82
+ register
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,70 @@
1
+ require_relative("group")
2
+
3
+ module Wukong
4
+ class Processor
5
+
6
+ # Concatenates the elements of a group, yielding the group key,
7
+ # the count, and its members.
8
+ #
9
+ # @example Concatenating elements of a group on the command-line.
10
+ #
11
+ # $ cat input
12
+ # {"id": 1, "parent_id": 4}
13
+ # {"id": 2, "parent_id": 3}
14
+ # {"id": 3, "parent_id": 3}
15
+ # ...
16
+ # $ cat input | wu-local group_concat --by=parent_id
17
+ # 4 1 {"id": 1, "parent_id": 4}
18
+ # 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
19
+ # ...
20
+ #
21
+ # GroupConcat takes all the same options as Group.
22
+ #
23
+ # @see Group
24
+ class GroupConcat < Group
25
+
26
+ # The members of the current group.
27
+ attr_accessor :members
28
+
29
+ # Initializes the empty members array.
30
+ def setup
31
+ super()
32
+ @members = []
33
+ end
34
+
35
+ # Resets the members array at the start of a new group.
36
+ #
37
+ # @param [Object] record
38
+ def start record
39
+ super(record)
40
+ self.members = []
41
+ end
42
+
43
+ # Accumulate each record, adding it to the current members.
44
+ #
45
+ # @param [Object] record
46
+ def accumulate record
47
+ super(record)
48
+ self.members << record
49
+ end
50
+
51
+ # Yields the group, including its key, its size, and each
52
+ # member.
53
+ #
54
+ # @yield [key, size, *members]
55
+ # @yieldparam [Object] key the key defining the group
56
+ # @yieldparam [Integer] size the number of members in the group
57
+ # @yieldparam [Array<Object>] members the members of the group
58
+ def finalize
59
+ group = [key, size]
60
+ group.concat(members)
61
+ yield group.map(&:to_s).join("\t")
62
+ end
63
+
64
+ register
65
+ end
66
+ end
67
+ end
68
+
69
+
70
+
@@ -0,0 +1,72 @@
1
+ require_relative("group")
2
+
3
+ module Wukong
4
+ class Processor
5
+ class Moments < Group
6
+
7
+ field :group_by, Whatever
8
+
9
+ attr_accessor :measurements
10
+
11
+ field :of, Array, :default => []
12
+ field :std_dev, :boolean, :default => true
13
+
14
+ def get_key record
15
+ super(record) unless (self.group_by || self.by)
16
+ get(self.group_by || self.by, record)
17
+ end
18
+
19
+ def receive_of o
20
+ @of = case o
21
+ when String then o.split(',')
22
+ when Array then o
23
+ else []
24
+ end
25
+ end
26
+
27
+ def start record
28
+ super(record)
29
+ @measurements = {}.tap do |m|
30
+ self.of.each do |property|
31
+ m[property] = []
32
+ end
33
+ end
34
+ end
35
+
36
+ def accumulate record
37
+ super(record)
38
+ self.of.each do |property|
39
+ if raw = get(property, record)
40
+ self.measurements[property] << (raw.to_f rescue next)
41
+ end
42
+ end
43
+ end
44
+
45
+ def results
46
+ {}.tap do |r|
47
+ measurements.each_pair do |property, values|
48
+ r[property] = {}
49
+ next if values.empty?
50
+ count = values.size.to_f
51
+ r[property][:count] = count.to_i
52
+
53
+ mean = values.inject(0.0) { |sum, value| sum += value } / count
54
+ r[property][:mean] = mean
55
+ if std_dev
56
+ variance = values.inject(0.0) { |sum, value| diff = (value - mean) ; sum += diff * diff } / count
57
+ std = Math.sqrt(variance)
58
+ r[property][:std_dev] = std
59
+ end
60
+ end
61
+ end
62
+ end
63
+
64
+ def finalize
65
+ yield({:group => key, :count => size}.merge(:results => results))
66
+ end
67
+
68
+ register
69
+ end
70
+ end
71
+ end
72
+
@@ -0,0 +1,130 @@
1
+ require_relative("accumulator")
2
+ require_relative("../utils")
3
+
4
+ module Wukong
5
+ class Processor
6
+
7
+ # Sorts input records.
8
+ #
9
+ # For many use cases you're better off using native tools like
10
+ # `/bin/sort` because they are faster and already do what you
11
+ # need.
12
+ #
13
+ # @example When /bin/sort is more than enough on the command-line
14
+ #
15
+ # $ cat input
16
+ # 1 apple
17
+ # 2 banana
18
+ # 3 cat
19
+ # 4 banana
20
+ # ...
21
+ # $ cat input | sort -k2
22
+ # 1 apple
23
+ # 2 banana
24
+ # 4 banana
25
+ # 3 cat
26
+ # ...
27
+ #
28
+ # Other times, you need something that can introspect more on its
29
+ # input:
30
+ #
31
+ # @example When you may prefer the sort widget on the command-line
32
+ #
33
+ # $ cat input
34
+ # {"id": 1, "word": "apple" }
35
+ # {"id": 2, "word": "cat" }
36
+ # {"id": 3, "word": "banana"}
37
+ # ...
38
+ # $ cat input | wu-local sort --on word
39
+ # {"id": 1, "word": "apple" }
40
+ # {"id": 3, "word": "banana"}
41
+ # {"id": 2, "word": "cat" }
42
+ # ...
43
+ #
44
+ # The sort widget is useful for modeling Hadoop jobs, but don't
45
+ # forget that [Hadoop does its own
46
+ # sorting](http://hadoop.apache.org/docs/r0.20.2/mapred_tutorial.html#Sort),
47
+ # so the sort widget doesn't belong in your map/reduce jobs.
48
+ #
49
+ # @example The wrong way to model a Hadoop map/reduce job
50
+ #
51
+ # Wukong.dataflow(:my_incorrect_job_dataflow) do
52
+ # parse | extract(part: 'country') | sort | group
53
+ # end
54
+ #
55
+ # @example The right way to model a Hadoop map/reduce job
56
+ #
57
+ # Wukong.dataflow(:mapper) do
58
+ # parse | extract(part: 'country')
59
+ # end
60
+ #
61
+ # Wukong.dataflow(:reducer) do
62
+ # group
63
+ # end
64
+ class Sort < Accumulator
65
+
66
+ include DynamicGet
67
+ field :on, Whatever
68
+ field :reverse, :boolean, :default => false
69
+ field :numeric, :boolean, :default => false
70
+
71
# Initializes the array that will hold every record until it is time to sort.
def setup
  super()
  @records = Array.new
end
76
+
77
# Places every record in one and the same group so that they can all
# be sorted together.
#
# @param [Object] record ignored
# @return [:__first_group__]
def get_key(record)
  :__first_group__
end
84
+
85
# Keeps hold of `record` so it can be sorted later.
#
# @param [Object] record
def accumulate(record)
  @records.push(record)
end
91
+
92
# Sorts all the stored records and yields them one at a time, in
# reverse order when the `reverse` setting is on.
#
# @yield [record] each record in correct sort order
# @yieldparam [Object] record
def finalize
  ordered = @records.sort { |left, right| compare(left, right) }
  ordered = ordered.reverse if reverse
  ordered.each { |record| yield record }
end
102
+
103
# Pulls out of `record` the part named by the `on` setting.
#
# @param [Object] record
# @return [Object] the part of the record to sort on
def sortable(record)
  sort_field = self.on
  get(sort_field, record)
end
110
+
111
# Compares records `x` and `y` using their sortable parts.
#
# A record without a sortable part (nil or false) is ordered before
# one that has it; two such records compare as equal.  Numeric
# sorting (via `to_f`) is used when the `numeric` setting is on.
#
# @param [Object] x
# @param [Object] y
# @return [1,0,-1] depends on which of x or y is considered greater
def compare(x, y)
  a = sortable(x)
  b = sortable(y)
  # Both parts missing: report equality so that compare(x, y) and
  # compare(y, x) can never both claim "less than".
  return 0 unless a || b
  return -1 unless a
  return 1 unless b
  if numeric
    a = a.to_f
    b = b.to_f
  end
  a <=> b
end
126
+
127
+ register
128
+ end
129
+ end
130
+ end
@@ -0,0 +1,287 @@
1
+ module Wukong
2
+ class Processor
3
+
4
# Base class shared by every serializer widget; it adds no behavior of its own.
class Serializer < Processor; end
7
+
8
+ # A widget for serializing inputs to JSON.
9
+ #
10
+ # @example Serializing to JSON at the end of a data flow
11
+ #
12
+ # Wukong.dataflow(:emits_json) do
13
+ # ... | to_json
14
+ # end
15
+ #
16
+ # @see FromJson
17
+ class ToJson < Serializer
18
# Serializes the input `record` to JSON and yields the result.
#
# A record that cannot be serialized is dropped: nothing is yielded.
#
# @param [Object] record
# @yield [json] the serialized json output
# @yieldparam [String] json
def process(record)
  json = ::MultiJson.dump(record)
rescue StandardError
  # FIXME -- should we log here or what?
  nil
else
  # Yield outside the rescue so downstream errors still propagate.
  yield json
end
32
+ register
33
+ end
34
+
35
+ # A widget for deserializing inputs from JSON.
36
+ #
37
+ # @example Deserializing from JSON at the beginning of a data flow
38
+ #
39
+ # Wukong.dataflow(:consumes_json) do
40
+ # from_json | ...
41
+ # end
42
+ #
43
+ # @see ToJson
44
+ class FromJson < Serializer
45
# Parses the input `json` string and yields the resulting object.
#
# Input that cannot be parsed is dropped: nothing is yielded.
#
# @param [String] json
# @yield [obj] the deserialized object
# @yieldparam [Object] obj
def process(json)
  obj = ::MultiJson.load(json)
rescue StandardError
  # FIXME -- should we log here or what?
  nil
else
  # Yield outside the rescue so downstream errors still propagate.
  yield obj
end
59
+ register
60
+ end
61
+
62
+ # A widget for serializing inputs to TSV.
63
+ #
64
+ # @example Serializing to TSV at the end of a data flow
65
+ #
66
+ # Wukong.dataflow(:emits_tsv) do
67
+ # ... | to_tsv
68
+ # end
69
+ #
70
+ # @see FromTsv
71
+ class ToTsv < Serializer
72
# Serializes the input `record` as one tab-separated line and yields it.
#
# Every element is converted with `to_s`.  A record that cannot be
# converted this way is dropped: nothing is yielded.
#
# @param [Object] record
# @yield [tsv] the serialized TSV output
# @yieldparam [String] tsv
def process(record)
  tsv = record.map(&:to_s).join("\t")
rescue StandardError
  # FIXME -- should we log here or what?
  nil
else
  # Yield outside the rescue so downstream errors still propagate.
  yield tsv
end
86
+ register
87
+ end
88
+
89
+ # A widget for deserializing inputs from TSV.
90
+ #
91
+ # @example Deserializing from TSV at the beginning of a data flow
92
+ #
93
+ # Wukong.dataflow(:consumes_tsv) do
94
+ # from_tsv | ...
95
+ # end
96
+ #
97
+ # @see ToTsv
98
+ class FromTsv < Serializer
99
# Splits the input `tsv` line on tab characters and yields the fields.
#
# Input that cannot be split is dropped: nothing is yielded.
#
# @param [String] tsv
# @yield [obj] the deserialized object
# @yieldparam [Object] obj
def process(tsv)
  fields = tsv.split(/\t/)
rescue StandardError
  # FIXME -- should we log here or what?
  nil
else
  # Yield outside the rescue so downstream errors still propagate.
  yield fields
end
113
+ register
114
+ end
115
+
116
+ # A widget for serializing inputs to CSV.
117
+ #
118
+ # @example Serializing to CSV at the end of a data flow
119
+ #
120
+ # Wukong.dataflow(:emits_csv) do
121
+ # ... | to_csv
122
+ # end
123
+ #
124
+ # @see FromCsv
125
+ class ToCsv < Serializer
126
# Serializes the input `record` as one comma-separated line and yields it.
#
# Every element is converted with `to_s` and joined with commas; no
# quoting or escaping is applied.  A record that cannot be converted
# this way is dropped: nothing is yielded.
#
# @param [Object] record
# @yield [csv] the serialized CSV output
# @yieldparam [String] csv
def process(record)
  csv = record.map(&:to_s).join(",")
rescue StandardError
  # FIXME -- should we log here or what?
  nil
else
  # Yield outside the rescue so downstream errors still propagate.
  yield csv
end
140
+ register
141
+ end
142
+
143
+ # A widget for deserializing inputs from CSV.
144
+ #
145
+ # @example Deserializing from CSV at the beginning of a data flow
146
+ #
147
+ # Wukong.dataflow(:consumes_csv) do
148
+ # from_csv | ...
149
+ # end
150
+ #
151
+ # @see ToCsv
152
+ class FromCsv < Serializer
153
# Splits the input `csv` line on commas and yields the fields.
#
# The split is purely textual: quoted fields are not recognized.
# Input that cannot be split is dropped: nothing is yielded.
#
# @param [String] csv
# @yield [obj] the deserialized object
# @yieldparam [Object] obj
def process(csv)
  fields = csv.split(/,/)
rescue StandardError
  # FIXME -- should we log here or what?
  nil
else
  # Yield outside the rescue so downstream errors still propagate.
  yield fields
end
167
+ register
168
+ end
169
+
170
+ # A widget for serializing inputs to a delimited format.
171
+ #
172
+ # @example Serializing to a delimited format at the end of a data flow
173
+ #
174
+ # Wukong.dataflow(:emits_delimited) do
175
+ # ... | to_delimited(delimiter: "--")
176
+ # end
177
+ #
178
+ # @see FromDelimited
179
+ class ToDelimited < Serializer
180
+ field :delimiter, String, :default => "\t"
181
# Serializes the input `record` as one line whose fields are separated
# by the `delimiter` setting, and yields it.
#
# Every element is converted with `to_s`.  A record that cannot be
# converted this way is dropped: nothing is yielded.
#
# @param [Object] record
# @yield [delimited] the serialized delimited output
# @yieldparam [String] delimited
def process(record)
  delimited = record.map(&:to_s).join(delimiter)
rescue StandardError
  # FIXME -- should we log here or what?
  nil
else
  # Yield outside the rescue so downstream errors still propagate.
  yield delimited
end
195
+ register
196
+ end
197
+
198
+ # A widget for deserializing inputs from a delimited format.
199
+ #
200
+ # @example Deserializing from a delimited format at the beginning of a data flow
201
+ #
202
+ # Wukong.dataflow(:consumes_delimited) do
203
+ # from_delimited(delimiter: "--") | ...
204
+ # end
205
+ #
206
+ # @see ToDelimited
207
+ class FromDelimited < Serializer
208
+ field :delimiter, String, :default => "\t"
209
# Splits the input `delimited` line on the `delimiter` setting and
# yields the fields.
#
# Input that cannot be split is dropped: nothing is yielded.
#
# @param [String] delimited
# @yield [obj] the deserialized object
# @yieldparam [Object] obj
def process(delimited)
  fields = delimited.split(delimiter)
rescue StandardError
  # FIXME -- should we log here or what?
  nil
else
  # Yield outside the rescue so downstream errors still propagate.
  yield fields
end
223
+ register
224
+ end
225
+
226
+ # A widget for serializing inputs to Ruby's `inspect` format.
227
+ #
228
+ # @example Serializing to Ruby's inspect format at the end of a data flow
229
+ #
230
+ # Wukong.dataflow(:emits_inspected) do
231
+ # ... | to_inspect
232
+ # end
233
+ class ToInspect < Serializer
234
# Yields the input passed through Ruby's `inspect`.
#
# A single argument is inspected on its own; any other number of
# arguments is inspected as an Array.
#
# @param [Array<Object>] args
# @yield [inspected]
# @yieldparam [String] inspected
def process(*args)
  inspected =
    if args.size == 1
      args.first.inspect
    else
      args.inspect
    end
  yield inspected
end
242
+ register
243
+ end
244
+
245
+ # A widget for pretty printing input records.
246
+ #
247
+ # @example Pretty printing JSON on the command-line
248
+ #
249
+ # $ cat input
250
+ # {"id": 1, "word": "apple" }
251
+ # $ cat input | wu-local pretty
252
+ # {
253
+ # "id":1,
254
+ # "word":"apple"
255
+ # }
256
+ class Pretty < Serializer
257
# Yields a pretty-printed version of `record` when it is a String
# that looks like a JSON object, and `record.to_s` otherwise.
#
# @param [Object] record
# @yield [pretty]
# @yieldparam [String] pretty the pretty-printed record
def process(record)
  looks_like_json = record.is_a?(String) && record =~ /^\s*\{/
  text = looks_like_json ? pretty_json(record) : record.to_s
  yield text
end
269
+
270
# Re-serializes the given `json` with pretty-printing turned on.
# Should parsing or serializing fail, the input is handed back as is.
#
# @param [String] json ugly JSON
# @return [String] prettier JSON
def pretty_json(json)
  MultiJson.dump(MultiJson.load(json), :pretty => true)
rescue StandardError
  json
end
282
+
283
+ register
284
+ end
285
+
286
+ end
287
+ end