wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
data/Rakefile CHANGED
@@ -1,12 +1,28 @@
1
- require 'bundler' ; Bundler::GemHelper.install_tasks
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
2
3
 
3
4
  require 'rspec/core/rake_task'
5
+ RSpec::Core::RakeTask.new(:specs)
6
+
4
7
  require 'yard'
8
+ YARD::Rake::YardocTask.new
5
9
 
6
- RSpec::Core::RakeTask.new(:specs) do |spec|
7
- spec.pattern = 'spec/**/*_spec.rb'
10
+ desc 'Run RSpec with code coverage'
11
+ task :cov do
12
+ ENV['WUKONG_COV'] = true
13
+ Rake::Task[:specs].execute
8
14
  end
9
15
 
10
- YARD::Rake::YardocTask.new
11
-
12
16
  task :default => :specs
17
+
18
+ desc "Create a TAGS file for this project"
19
+ task :tags do
20
+ files = [%w[Gemfile Guardfile Rakefile README.md].map { |b| File.join(File.dirname(__FILE__), b) }]
21
+ %w[bin examples lib spec].each do |dir|
22
+ files << Dir[File.join(File.dirname(__FILE__), "#{dir}/**/*.rb")]
23
+ end
24
+ files.each do |arry|
25
+ sh "etags", '-a', *arry unless arry.empty?
26
+ end
27
+ end
28
+
data/TODO.md CHANGED
@@ -4,21 +4,21 @@
4
4
  - hooks up source to flow,
5
5
  - if iterated source, drives it, otherwise sits in the flow
6
6
 
7
- * these set the contract for the inbound resources
7
+ * these set the contract for the inbound products
8
8
 
9
9
  ### slots
10
10
 
11
11
  Typical case: one input, `:input`, one output `:output`
12
12
 
13
- * there are as many resources as
13
+ * there are as many products as
14
14
  - the total number of action stage outputs
15
- - the concrete input resources
15
+ - the concrete input products
16
16
  * The number of rsrc->action edges is at most the total number of input slots
17
- - (you cannot wire multiple resources to the same input slot)
17
+ - (you cannot wire multiple products to the same input slot)
18
18
 
19
19
 
20
20
  1. action stage B wires up to an action stage A (which really means "the full set of A's outputs")
21
- 2. I wire action A's output as production resource X
21
+ 2. I wire action A's output as production product X
22
22
  3.
23
23
 
24
24
  4. How do I address other stages?
@@ -71,7 +71,7 @@ __________________________________________________________________________
71
71
  | foo |
72
72
  ----------
73
73
 
74
- create a resource with no action? action with anonymous resource, wired up later?
74
+ create a product with no action? action with anonymous product, wired up later?
75
75
 
76
76
 
77
77
  * connections:
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding:UTF-8
3
+
4
+ if ARGV.include?('--help')
5
+ puts <<USAGE
6
+ wu-clean-encoding cleans malformed characters from stdin.
7
+
8
+ If a character is malformed, as defined by valid_encoding?,
9
+ it is replaced with a '�'.
10
+
11
+ wu-clean-encoding was built to work with UTF-8, and no
12
+ guarantees are provided for other encodings.
13
+ USAGE
14
+ exit(0)
15
+ end
16
+
17
+ ARGF.each do |line|
18
+ if line.valid_encoding?
19
+ $stdout.write line
20
+ else
21
+ repaired_line = []
22
+ line.each_char do |char|
23
+ if char.valid_encoding?
24
+ repaired_line << char
25
+ else
26
+ repaired_line << "�"
27
+ end
28
+ end
29
+ $stdout.write repaired_line.join
30
+ end
31
+ end
@@ -161,8 +161,8 @@ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type,
161
161
  case type
162
162
  when :mixed, nil then lambda{|s| "%-#{width}s" % s }
163
163
  when :str then lambda{|s| "%-#{width}s" % s }
164
- when :int then lambda{|s| "%#{width}d" % s.gsub(/\D+/, "").to_i }
165
- when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.to_f }
164
+ when :int then lambda{|s| "%#{width}d" % s.gsub(/[^\d\-\+]+/, "").to_i }
165
+ when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.gsub(/[^\d\.eE\-\+]+/, "").to_f }
166
166
  else raise "oops type #{type}" end
167
167
  end
168
168
 
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'wukong'
4
+
5
+ settings = Wukong::Local::Configuration
6
+ settings.use(:commandline)
7
+
8
+ def settings.usage
9
+ "usage: #{File.basename($0)} PROCESSOR|FLOW [ --param=value | -p value | --param | -p]"
10
+ end
11
+
12
+ settings.description = <<-EOF
13
+ wu-local is a tool for running Wukong processors and flows locally on
14
+ the command-line. Use wu-local by passing it a processor and feeding
15
+ in some data:
16
+
17
+ $ echo 'UNIX is Clever and Fun...' | wu-local tokenizer.rb
18
+ UNIX
19
+ is
20
+ Clever
21
+ and
22
+ Fun
23
+
24
+ If your processors have named fields you can pass them in as
25
+ arguments:
26
+
27
+ $ echo 'UNIX is clever and fun...' | wu-local tokenizer.rb --min_length=4
28
+ UNIX
29
+ Clever
30
+
31
+ You can chain processors and calls to wu-local together:
32
+
33
+ $ echo 'UNIX is clever and fun...' | wu-local tokenizer.rb --min_length=4 | wu-local downcaser.rb
34
+ unix
35
+ clever
36
+
37
+ Which is a good way to develop a combined data flow which you can
38
+ again test locally:
39
+
40
+ $ echo 'UNIX is clever and fun...' | wu-local tokenize_and_downcase_big_words.rb
41
+ unix
42
+ clever
43
+ EOF
44
+
45
+ settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of the given path.", flag: 'r'
46
+
47
+ require 'wukong/boot' ; Wukong.boot!(settings)
48
+
49
+ thing = settings.rest.first
50
+ case
51
+ when thing.nil?
52
+ settings.dump_help
53
+ exit(1)
54
+ when Wukong.registry.registered?(thing.to_sym)
55
+ processor = thing.to_sym
56
+ when File.exist?(thing)
57
+ load thing
58
+ processor = settings.run || File.basename(thing, '.rb')
59
+ else
60
+ settings.dump_help
61
+ exit(2)
62
+ end
63
+ # p settings
64
+ begin
65
+ Wukong::LocalDriver.run(processor.to_sym, settings)
66
+ rescue Wukong::Error => e
67
+ $stderr.puts e.message
68
+ exit(3)
69
+ end
@@ -0,0 +1,70 @@
1
#!/usr/bin/env ruby
require 'configliere'
require 'wukong'
require 'eventmachine'
require 'em-synchrony'
require 'multi_json'

# Default listen address; the :commandline mixin is enabled below so these
# can be supplied as flags.
Settings({
  host: "localhost",
  port: 9500
})

Settings.use :commandline
Settings.resolve!

# Load the file on the command line. Fail with a usage message instead of
# letting `load nil` raise a TypeError when the argument is missing.
wu_file = ARGV.shift
abort "usage: #{File.basename($0)} PROCESSOR_FILE" if wu_file.nil?
abort "#{File.basename($0)}: no such file: #{wu_file}" unless File.exist?(wu_file)
load wu_file

# The processor is named after the file's basename; it is read later in this
# script when connections are set up.
$processor = File.basename(wu_file, '.rb').to_sym
20
+
21
# Wraps a single processor in a :server dataflow and collects what it emits.
class Wukong::Server
  # Builds the :server dataflow around a processor and redirects the
  # processor's emitter into an in-memory buffer.
  #
  # @param options [Hash] :processor is the name of the processor to wrap;
  #   falls back to the $processor global when absent.
  def prepare(options = {})
    processor_name = options[:processor] || $processor

    Wukong.dataflow(:server){ send(processor_name) }
    flow_builder = Wukong.registry.retrieve(:server)
    flow         = flow_builder.build(processor: processor_name)

    @buffer    = []
    # process is invoked via EM.defer (see WukongMachine#receive_line), so
    # calls may overlap; the lock keeps each call's emitted values separate.
    @lock      = Mutex.new
    @processor = flow.stages.values.first
    @processor.emitter = ->(value){ @buffer << value }
  end

  # Feeds one record to the processor.
  #
  # @param record the decoded input record
  # @return [Array] the values the processor emitted for this record
  def process(record)
    @lock.synchronize do
      @buffer.clear
      @processor.process(record)
      @buffer.dup
    end
  end

  # Intentionally a no-op.
  def cleanup
  end
end


# EventMachine server

# One instance per client connection. Each input line is decoded as JSON and
# run through the processor; the emitted values are written back as a single
# JSON array followed by a newline.
class WukongMachine < EM::Protocols::LineAndTextProtocol
  def post_init
    puts "[server] Client connected"
    @wukong = Wukong::Server.new
    @wukong.prepare(processor: $processor)
  end

  # Called by LineAndTextProtocol once per complete line, so a request that
  # arrives split across packets (or several requests in one packet) is still
  # decoded one JSON document at a time.
  #
  # @param line [String] one request line, expected to be a JSON document
  def receive_line(line)
    input = MultiJson.load(line)

    op       = proc { @wukong.process(input) }
    # EM.defer hands the operation's return value to the callback.
    callback = proc { |emitted| send_data MultiJson.dump(emitted) + "\n" }
    EM.defer(op, callback)
  rescue MultiJson::DecodeError
    STDERR.puts "[server] Dropped: Malformed request"
  end

  def unbind
    puts "[server] Client disconnected."
  end
end
66
+
67
# Start the reactor and listen on the configured host and port, handling each
# connection with WukongMachine.
EM.run do
  host = Settings[:host]
  port = Settings[:port]
  EM.start_server(host, port, WukongMachine)
  puts "Listening on #{host}:#{port}"
end
@@ -0,0 +1,38 @@
1
# The :rubygems symbol shorthand is deprecated; name the source explicitly.
source 'https://rubygems.org'

gem "configliere",       '~> 0.4'
gem "multi_json",        '>= 1.3.6'
gem "vayacondios-client", '>= 0.0.3'
gem "gorillib",          '>= 0.4.2'
gem "uuidtools"
gem "eventmachine"
gem "log4r"

# Previously listed separately under both :examples and :docs; declared once
# for both groups so the gem is not duplicated.
group :examples, :docs do
  gem "addressable"
end

group :examples do
  gem "forgery"              # was listed twice in this group
  gem "nokogiri"
  # gem "sanitize"
  gem "crack"
  gem "oj"
  gem "activesupport"
end

group :development do
  gem "bundler",     '~> 1.1'
  gem "rake",        '>= 0.9'
  gem "rspec",       '>= 2.8'
  gem "guard",       '>= 1.0'
  gem "guard-rspec", '>= 0.6'
  gem "simplecov",   '>= 0.5'
  gem "pry"
  gem "ap"
end

group :docs do
  gem "yard"
  gem "redcarpet"
  gem "htmlentities"
end
@@ -0,0 +1,9 @@
1
+
2
+
3
+ ### Geographic data
4
+
5
+
6
+
7
+ ### Munging
8
+
9
+
@@ -1,23 +1,30 @@
1
+ #
2
+ # Parses logs in either the [Apache Common Log Format](http://en.wikipedia.org/wiki/Common_Log_Format)
3
+ # or [Apache Combined Log Format](http://httpd.apache.org/docs/2.2/logs.html#combined)
4
+ #
5
+ # Common: `%h %l %u %t "%r" %>s %b`
6
+ # Combined: `%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"`
7
+ #
1
8
  class ApacheLogLine
2
9
  include Gorillib::Model
3
10
 
4
- field :ip_address, IpAddress
5
- field :junk_1, String
6
- field :junk_2, String
11
+ field :client, Hostname
12
+ field :rfc_1413, String
13
+ field :userid, String
7
14
  field :log_timestamp, Time
8
15
  field :http_method, String
9
- field :path, String
16
+ field :rsrc, String
10
17
  field :protocol, String
11
18
  field :response_code, Integer
12
19
  field :size, Integer
13
- field :referer, String
14
- field :user_agent, String
20
+ # field :referer, String
21
+ # field :user_agent, String
15
22
 
16
23
  def page_type
17
24
  case
18
- when path =~ /\.(css|js)$/ then :asset
19
- when path =~ /\.(png|gif|ico)$/ then :image
20
- when path =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
25
+ when rsrc =~ /\.(css|js)$/ then :asset
26
+ when rsrc =~ /\.(png|gif|ico)$/ then :image
27
+ when rsrc =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
21
28
  else :other
22
29
  end
23
30
  end
@@ -25,37 +32,69 @@ class ApacheLogLine
25
32
  #
26
33
  # Regular expression to parse an apache log line.
27
34
  #
35
+ # local - - [24/Oct/1994:13:43:13 -0600] "GET index.html HTTP/1.0" 200 3185
28
36
  # 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
37
+ # whidbey.whidbey.com - - [04/Sep/1995:00:30:18 -0400] "GET /pub/sshay/images/btthumb.jpg" 200 4624
38
+ # jgbustam-ppp.clark.net - - [04/Sep/1995:00:00:28 -0400] "GET /pub/jgbustam/famosos/alpha.html HTTP/1.0" 304 -
29
39
  #
30
- LOG_RE = Regexp.compile(%r{\A
31
- (\S+) # ip 83.240.154.3
32
- \s(\S+) # j1 -
33
- \s(\S+) # j2 -
34
- \s\[([\w\:\+\ \/]+)\] # date part [07/Jun/2008:20:37:11 +0000]
40
+ COMMON_LOG_RE = Regexp.compile(%r{\A
41
+ (\S+) # client 83.240.154.3
42
+ \s(\S+) # rfc_1413 -
43
+ \s(\S+) # userid -
44
+ \s\[([\w\:\+\-\ \/]+)\] # date part [07/Jun/2008:20:37:11 +0000]
35
45
  \s\"(?:(\S+) # http_method "GET
36
- \s(\S+) # path /faq
37
- \s(\S+)|-)" # protocol HTTP/1.1"
38
- \s(\d+) # response_code 200
39
- \s(\d+) # size 569
40
- \s\"([^\"]*)\" # referer "http://infochimps.org/search?query=CAC"
41
- \s\"([^\"]*)\" # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
42
- \z}x)
46
+ \s(.+?) # rsrc /faq
47
+ (?:\s(HTTP/\d+\.\d+))?|-)\" # protocol HTTP/1.1"
48
+ \s(\d+|-) # response_code 200
49
+ \s(\d+|-) # size 569
50
+ \z
51
+ }x)
43
52
 
44
- MONTHS = { 'Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12', }
53
+ COMBINED_LOG_RE = Regexp.compile(%r{\A
54
+ (\S+) # client 83.240.154.3
55
+ \s(\S+) # rfc_1413 -
56
+ \s(\S+) # userid -
57
+ \s\[([\w\:\+\-\ \/]+)\] # date part [07/Jun/2008:20:37:11 +0000]
58
+ \s\"(?:(\S+) # http_method "GET
59
+ \s([^\"]+?) # rsrc /faq
60
+ (?:\s(HTTP/\d+\.\d+))?|-)\" # protocol HTTP/1.1"
61
+ \s(\d+|-) # response_code 200
62
+ \s(\d+|-) # size 569
63
+ (?:\s\"([^\"]*)\") # referer "http://infochimps.org/search?query=CAC"
64
+ (?:\s\"([^\"]*)\") # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
65
+ \z
66
+ }x)
67
+
68
+ # LOG_RE = Regexp.compile(%r{\A(\S+)\s})
69
+
70
+ MONTHS = { 'Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10, 'Nov' => 11, 'Dec' => 12, }
45
71
 
46
72
  # Converts a time like `10/Apr/2007:10:58:27 +0300` to something parseable
47
73
  def receive_log_timestamp(raw_ts)
74
+ return super(nil) if raw_ts.nil?
48
75
  match = %r{(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)\s([\+\-\w]+)}.match(raw_ts)
49
- warn "Can't parse date #{raw_ts}" unless match
76
+ unless match then warn "Can't parse date #{raw_ts}" ; return super(nil) ; end
77
+ #
50
78
  day, month_name, year, hour, min, sec, tz = match.captures
51
79
  month = MONTHS[month_name]
52
- super "#{year}-#{month}-#{day} #{hour}:#{min}:#{sec} #{tz}"
80
+ tz.insert(3, ':') # -0600 to -06:00
81
+ #
82
+ # super "#{year}-#{month}-#{day} #{hour}:#{min}:#{sec} #{tz}"
83
+ super Time.new(year.to_i, month, day.to_i, hour.to_i, min.to_i, sec.to_i, tz)
84
+ end
85
+
86
+ # @returns the log_timestamp in the common log format
87
+ def unparsed_log_timestamp
88
+ return if log_timestamp.blank?
89
+ log_timestamp.strftime("%d/%b/%Y:%H:%M:%S %z")
53
90
  end
54
91
 
55
92
  # Use the regex to break line into fields
56
93
  # Emit each record as flat line
57
94
  def self.make(line)
58
- m = LOG_RE.match(line.chomp) or return
95
+ m = COMMON_LOG_RE.match(line) or return
59
96
  from_tuple *m.captures
97
+ rescue ArgumentError => err
98
+ raise unless err.message =~ /invalid byte sequence in UTF-8/
60
99
  end
61
100
  end
@@ -0,0 +1,101 @@
1
+ require 'wukong/widget/many_to_many'
2
+ require 'gorillib/enumerable/sum'
3
+
4
+ #
5
+ # An example dataflow --
6
+ #
7
+
8
# Processor that holds records in a FIFO queue and only starts emitting the
# oldest one once more than `delay` records are queued.
Wukong.processor(:delay_buffer) do
  attr_accessor :queue
  field :delay, Integer, position: 0, doc: "number of records to hold in buffer"

  # Enqueues the record, then emits the oldest queued record if the queue
  # has grown past the delay.
  def process(rec)
    queue.push(rec)
    return unless ready?
    emit(next_item)
  end

  # Removes and returns the oldest queued record.
  def next_item
    queue.shift
  end

  # true if there are records at the end of the delay stage; warns when the
  # queue holds more than delay+1 records.
  def ready?
    backlog = queue.size
    warn "Hmm, too many records in queue: #{queue}" if backlog > delay+1
    backlog > delay
  end

  # resets to an empty state
  def setup(*)
    super
    @queue = []
  end

  # emits all remaining elements of the queue, oldest first
  def stop
    queue.each { |held| emit(held) }
    super
  end
end
39
+
40
# Collects records per input channel and emits one batch (an array holding
# the oldest record from each channel's queue) whenever every input slot has
# at least one record waiting.
class Wukong::Batcher < Wukong::Processor
  register_action
  include Hanuman::Slottable
  include Hanuman::OutputSlotted

  attr_accessor :queues
  consume :n_1, Integer, doc: "n-1'th value: the one just emitted"
  consume :tictoc, Integer, doc: "input to drive flow"
  consume :n_2, Integer, doc: "n-2'nd value: the one before the one just emitted"

  # resets to an empty state, calls super
  def initialize(*)
    super
    # autovivifying: an unseen channel gets its own fresh queue on first access
    @queues = Hash.new { |table, channel| table[channel] = [] }
  end

  # Queues the record on its channel, then emits a batch if one is complete.
  def process_input(channel, rec)
    queues[channel].push(rec)
    return unless ready?
    emit(next_item)
  end

  # Takes the oldest record off every queue, in the queues' insertion order.
  def next_item
    queues.values.map(&:shift)
  end

  # true if there is at least one record in each queue
  def ready?
    inslots.values.none? { |inslot| queues[inslot.name].empty? }
  end
end
70
+
71
# Example chain wiring a feedback loop: the batcher collects the two previous
# values plus a tick, `summer` adds them, and the result is fed back both
# directly (n_1) and through a one-record delay (n_2).
Wukong.chain(:fibonacci_series) do

  delay_buffer(1, name: :my_delay)

  # * I don't want to have to name everything
  #   - are few/some/most things named?
  # * I must be able to have the same stage type on a graph more than once
  # * If naming things is a general case, I want it to
  #   - be clean, and for it to
  #   - not cause a ruckus when stage type has its own args
  #

  batcher(name: :feedback) >
    map(name: :summer, &:sum) >
    many_to_many(name: :fibonacci_n)

  spew(6, item: 0, name: :ticker) > feedback.tictoc

  fibonacci_n > feedback.n_1
  fibonacci_n > output
  # The delay stage is declared above as :my_delay; no stage named :delay
  # exists in this chain, so refer to it by the name it was given.
  # NOTE(review): assumes named stages are reachable as bare identifiers,
  # as `feedback` and `fibonacci_n` are above -- confirm against the DSL.
  fibonacci_n > my_delay > feedback.n_2

  # preload the feedback buffer
  feedback.n_1.process(0)
  feedback.n_2.process(0)
  feedback.n_2.process(1)
end
98
+
99
+ # Wukong.dataflow(:dump) do
100
+ # stdout << Wukong.dataflow(:fibonacci_series).out
101
+ # end