wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -1,53 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong/script'
4
-
5
- #
6
- # Bigram counts
7
- #
8
- # head -n 100 /usr/share/dict/words | ./examples/corpus/words_to_bigrams.rb | sort | /tmp/words_to_bigrams.rb
9
- #
10
-
11
-
12
- #
13
- # Kludge to work in Elastic map reduce:
14
- #
15
- # If your script is ./examples/corpus/words_to_bigrams.rb, make symlinks
16
- # to it from ./examples/corpus/words_to_bigrams__map.rb and
17
- # ./examples/corpus/words_to_bigrams__reduce.rb
18
- #
19
- if $0 =~ /__(map|reduce)\.rb$/
20
- Settings[$1.to_sym] = true
21
- end
22
-
23
-
24
- #
25
- # given one word per line
26
- # emits all successive pairs of characters in that word
27
- # eg 'boooo-urns' yields
28
- # bo oo oo oo o- -u ur rn ns
29
- #
30
- class WordNGrams < Wukong::Streamer::Base
31
- def process word
32
- word[0..-2].chars.zip(word[1..-1].chars).each do |ngram_2|
33
- yield ngram_2.join('')
34
- end
35
- end
36
- end
37
-
38
- #
39
- # number of unique keys in a row
40
- #
41
- class KeyCountStreamer < Wukong::Streamer::AccumulatingReducer
42
- def start! *args
43
- @count = 0
44
- end
45
- def accumulate *args
46
- @count += 1
47
- end
48
- def finalize
49
- yield [key, @count]
50
- end
51
- end
52
-
53
- Wukong::Script.new(WordNGrams, KeyCountStreamer).run
@@ -1,110 +0,0 @@
1
- h1. Using Elastic Map-Reduce in Wukong
2
-
3
- h2. Initial Setup
4
-
5
- # Sign up for elastic map reduce and S3 at Amazon AWS.
6
-
7
- # Download the Amazon elastic-mapreduce runner: either the official version at http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip or the infochimps fork (which has support for Ruby 1.9) at http://github.com/infochimps/elastic-mapreduce .
8
-
9
- # Create a bucket and path to hold your EMR logs, scripts and other ephemera. For instance you might choose 'emr.yourdomain.com' as the bucket and '/wukong' as a scoping path within that bucket. In that case you will refer to it with a path like s3://emr.yourdomain.com/wukong (see notes below about s3n:// vs. s3:// URLs).
10
-
11
- # Copy the contents of wukong/examples/emr/dot_wukong_dir to ~/.wukong
12
-
13
- # Edit emr.yaml and credentials.json, adding your keys where appropriate and following the other instructions. Start with a single-node m1.small cluster as you'll probably have some false starts beforethe flow of logging in, checking the logs, etc becomes clear.
14
-
15
- # You should now be good to launch a program. We'll give it the @--alive@ flag so that the machine sticks around if there were any issues:
16
-
17
- ./elastic_mapreduce_example.rb --run=emr --alive s3://emr.yourdomain.com/wukong/data/input s3://emr.yourdomain.com/wukong/data/output
18
-
19
- # If you visit the "AWS console":http://bit.ly/awsconsole you should now see a jobflow with two steps. The first sets up debugging for the job; the second is your hadoop task.
20
-
21
- # The "AWS console":http://bit.ly/awsconsole also has the public IP of the master node. You can log in to the machine directly:
22
-
23
- <pre>
24
- ssh -i /path/to/your/keypair.pem hadoop@ec2-148-37-14-128.compute-1.amazonaws.com
25
- </pre>
26
-
27
- h3. Lorkbong
28
-
29
- Lorkbong (named after the staff carried by Sun Wukong) is a very very simple example Heroku app that lets you trigger showing job status or launching a new job, either by visiting a special URL or by triggering a rake task. Get its code from
30
-
31
- http://github.com/mrflip/lorkbong
32
-
33
- h3. s3n:// vs. s3:// URLs
34
-
35
- Many external tools use a URI convention to address files in S3; they typically use the 's3://' scheme, which makes a lot of sense:
36
- s3://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
37
-
38
- Hadoop can maintain an HDFS on the Amazon S3: it uses a block structure and has optimizations for streaming, no file size limitation, and other goodness. However, only hadoop tools can interpret the contents of those blocks -- to everything else it just looks like a soup of blocks labelled block_-8675309 and so forth. Hadoop unfortunately chose the 's3://' scheme for URIs in this filesystem:
39
- s3://s3hdfs.yourcompany.com/path/to/data
40
-
41
- Hadoop is happy to read s3 native files -- 'native' as in, you can look at them with a browser and upload them an download them with any S3 tool out there. There's a 5GB limit on file size, and in some cases a performance hit (but not in our experience enough to worry about). You refer to these files with the 's3n://' scheme ('n' as in 'native'):
42
- s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-mapper.rb
43
- s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-reducer.rb
44
- s3n://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
45
-
46
- Wukong will coerce things to the right scheme when it knows what that scheme should be (eg. code should be s3n://). It will otherwise leave the path alone. Specifically, if you use a URI scheme for input and output paths you must use 's3n://' for normal s3 files.
47
-
48
- h2. Advanced Tips n' Tricks for common usage
49
-
50
- h3. Direct access to logs using your browser
51
-
52
- Each Hadoop component exposes a web dashboard for you to access. Use the following ports:
53
-
54
- * 9100: Job tracker (master only)
55
- * 9101: Namenode (master only)
56
- * 9102: Datanodes
57
- * 9103: Task trackers
58
-
59
- They will only, however, respond to web requests from within the private cluster
60
- subnet. You can browse the cluster by creating a persistent tunnel to the hadoop master node, and configuring your
61
- browser to use it as a proxy.
62
-
63
- h4. Create a tunneling proxy to your cluster
64
-
65
- To create a tunnel from your local machine to the master node, substitute the keypair and the master node's address into this command:
66
-
67
- <pre><code>
68
- ssh -i ~/.wukong/keypairs/KEYPAIR.pem -f -N -D 6666 -o StrictHostKeyChecking=no -o "ConnectTimeout=10" -o "ServerAliveInterval=60" -o "ControlPath=none" ubuntu@MASTER_NODE_PUBLIC_IP
69
- </code></pre>
70
-
71
- The command will silently background itself if it worked.
72
-
73
- h4. Make your browser use the proxy (but only for cluster machines)
74
-
75
- You can access basic information by pointing your browser to "this Proxy
76
- Auto-Configuration (PAC)
77
- file.":http://github.com/infochimps/cluster_chef/raw/master/config/proxy.pac
78
- You'll have issues if you browse around though, because many of the in-page
79
- links will refer to addresses that only resolve within the cluster's private
80
- namespace.
81
-
82
- h4. Setup Foxy Proxy
83
-
84
- To fix this, use "FoxyProxy":https://addons.mozilla.org/en-US/firefox/addon/2464
85
- It allows you to manage multiple proxy configurations and to use the proxy for
86
- DNS resolution (curing the private address problem).
87
-
88
- Once you've installed the FoxyProxy extension and restarted Firefox,
89
-
90
- * Set FoxyProxy to 'Use Proxies based on their pre-defined patterns and priorities'
91
- * Create a new proxy, called 'EC2 Socks Proxy' or something
92
- * Automatic proxy configuration URL: http://github.com/infochimps/cluster_chef/raw/master/config/proxy.pac
93
- * Under 'General', check yes for 'Perform remote DNS lookups on host'
94
- * Add the following URL patterns as 'whitelist' using 'Wildcards' (not regular expression):
95
-
96
- * <code>*.compute-*.internal*</code>
97
- * <code>*ec2.internal*</code>
98
- * <code>*domu*.internal*</code>
99
- * <code>*ec2*.amazonaws.com*</code>
100
- * <code>*://10.*</code>
101
-
102
- And this one as blacklist:
103
-
104
- * <code>https://us-*st-1.ec2.amazonaws.com/*</code>
105
-
106
-
107
- h3. Pulling to your local machine
108
-
109
- s3cmd sync s3://s3n.infinitemonkeys.info/emr/elastic_mapreduce_example/log/ /tmp/emr_log/
110
-
@@ -1,7 +0,0 @@
1
- {
2
- "key-pair": "gibbon",
3
- "key-pair-file": "/home/your/.wukong/keypairs/gibbon.pem",
4
- "access-id": "YOURACCESSID",
5
- "private-key": "YOURPRIVATEKEY",
6
- "region": "us-east-1",
7
- }
@@ -1,69 +0,0 @@
1
- #
2
- # Elastic MapReduce config in wukong
3
- #
4
-
5
- # ===========================================================================
6
- #
7
- # Infrastructure options
8
- #
9
-
10
- # == Fill all your information into yet another file with your amazon key
11
- # It needs to be in so many stupid places because nobody can agree on a
12
- # filename or format.
13
- #
14
- :emr_credentials_file: ~/.wukong/credentials.json
15
-
16
- #
17
- # == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here
18
- #
19
- # :access_key: ASDFAHKHASDF
20
- # :secret_access_key: ADSGHASDFJASDFASDF
21
-
22
- # == Path to your keypair file.
23
- #
24
- :key_pair_file: ~/.wukong/keypairs/gibbon.pem
25
-
26
- # == Keypair will be named after your file, or force the name
27
- #
28
- # :key_pair: ~
29
-
30
- # == Path to the Amazon elastic-mapreduce runner. Get a copy from
31
- # http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
32
- #
33
- :emr_runner: ~/ics/hadoop/elastic-mapreduce/elastic-mapreduce
34
-
35
- # ===========================================================================
36
- #
37
- # Remote Paths
38
- #
39
-
40
- # == Wukong is opinionated about the paths and locations of scripts and
41
- # everything. It will organize files by job name within the following path:
42
- #
43
- :emr_root: s3://s3n.infinitemonkeys.info/emr
44
-
45
- # == If you specify the :emr_data_root path, then relative pathnames -- ones that
46
- # do not look like a URI (s3://yadda/yada) and do not start with a '/' -- will
47
- # be prefixed with this path prefix.
48
- :emr_data_root: s3n://s3n.infinitemonkeys.info/data
49
-
50
-
51
- # ===========================================================================
52
- #
53
- # Cluster Config
54
- #
55
- :num_instances: 1
56
- :instance_type: m1.small
57
- :master_instance_type: ~
58
- :hadoop_version: '0.20'
59
- :availability_zone: us-east-1b
60
-
61
- # ===========================================================================
62
- #
63
- # Running and reporting options
64
- #
65
- :alive: true
66
- :enable_debugging: true
67
- :emr_runner_verbose: true
68
- :emr_runner_debug: ~
69
- :step_action: CANCEL_AND_WAIT # CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
@@ -1,33 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -x # turn on tracing
3
-
4
- # A url directory with the scripts you'd like to stuff into the machine
5
- REMOTE_FILE_URL_BASE="http://github.com/infochimps/wukong"
6
-
7
- # echo "`date` Broaden the apt universe"
8
- # sudo bash -c 'echo "deb http://ftp.us.debian.org/debian lenny multiverse restricted universe" >> /etc/apt/sources.list.d/multiverse.list'
9
-
10
- # Do a non interactive apt-get so the user is never prompted for input
11
- export DEBIAN_FRONTEND=noninteractive
12
-
13
- # Update package index and update the basic system files to newest versions
14
- echo "`date` Apt update"
15
- sudo apt-get -y update ;
16
- sudo dpkg --configure -a
17
- echo "`date` Apt upgrade, could take a while"
18
- sudo apt-get -y safe-upgrade
19
- echo "`date` Apt install"
20
- sudo apt-get -f install ;
21
-
22
- echo "`date` Installing base packages"
23
- # libopenssl-ruby1.8 ssl-cert
24
- sudo apt-get install -y unzip build-essential git-core ruby ruby1.8-dev rubygems ri irb build-essential wget git-core zlib1g-dev libxml2-dev;
25
- echo "`date` Unchaining rubygems from the tyrrany of ubuntu"
26
- sudo gem install --no-rdoc --no-ri rubygems-update --version=1.3.7 ; sudo /var/lib/gems/1.8/bin/update_rubygems; sudo gem update --no-rdoc --no-ri --system ; gem --version ;
27
-
28
- echo "`date` Installing wukong and related gems"
29
- sudo gem install --no-rdoc --no-ri addressable extlib htmlentities configliere yard wukong right_aws uuidtools cheat
30
- sudo gem list
31
-
32
- echo "`date` Wukong bootstrap complete: `date`"
33
- true
@@ -1,28 +0,0 @@
1
- #!/usr/bin/env ruby
2
- Dir[File.dirname(__FILE__)+'/vendor/**/lib'].each{|dir| $: << dir }
3
- require 'rubygems'
4
- require 'wukong/script'
5
- require 'wukong/script/emr_command'
6
-
7
- #
8
- # * Copy the emr.yaml from here into ~/.wukong/emr.yaml
9
- # and edit it to suit.
10
- # * Download the Amazon elastic-mapreduce runner. Get a copy from
11
- # http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
12
- # * Find out what breaks, fix it or ask us for help (coders@infochimps.org) and
13
- # submit a patch
14
- #
15
-
16
- class FooStreamer < Wukong::Streamer::LineStreamer
17
- def initialize *args
18
- super *args
19
- @line_no = 0
20
- end
21
-
22
- def process *args
23
- yield ["%5d" % @line_no, *args]
24
- @line_no += 1
25
- end
26
- end
27
-
28
- Wukong::Script.new(FooStreamer, FooStreamer).run
@@ -1,74 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong/script'
4
-
5
- #
6
- # Given an adjacency pairs (from \t to) representation of a directed graph:
7
- #
8
- # 1 2
9
- # 1 7
10
- # 2 7
11
- # 2 9
12
- # 7 2
13
- #
14
- # It produces an "adjacency list":http://en.wikipedia.org/wiki/Adjacency_list representation:
15
- #
16
- # 1 > 2 7
17
- # 2 > 7 9
18
- # 7 > 2
19
- # 9 >
20
- #
21
- # and
22
- #
23
- # 1 <
24
- # 2 < 1 7
25
- # 7 < 1 2
26
- # 9 < 2
27
- #
28
- # (each column is tab-separated in the actual output)
29
- #
30
- #
31
- #
32
- module Gen1HoodEdges
33
- class Mapper < Wukong::Streamer::Base
34
- def process rsrc, src, dest, *_
35
- src = src.to_i ; dest = dest.to_i
36
- yield [ src, '>', dest ]
37
- yield [ dest, '<', src ]
38
- end
39
- end
40
-
41
- #
42
- # Accumulate links onto single line.
43
- #
44
- # The reduce key is the target node and direction; we just stream through all
45
- # pairs for each target node and output its neighbor nodes on the same line.
46
- #
47
- # To control memory usage, we will print directly to the output (and not run
48
- # through the Emitter)
49
- #
50
- class Reducer < Wukong::Streamer::AccumulatingReducer
51
- # clear the list of incoming paths
52
- def start! target, dir, *args
53
- print target + "\t" + dir # start line with target and list type
54
- end
55
- def accumulate target, dir, neighbor
56
- print "\t" + neighbor # append neighbor to output, same line
57
- end
58
- def finalize
59
- puts '' # start new line
60
- end
61
- end
62
-
63
- class Script < Wukong::Script
64
- def default_options
65
- super.merge :sort_fields => 1, :partition_fields => 1
66
- end
67
- end
68
- end
69
-
70
- # Execute the script
71
- Gen1HoodEdges::Script.new(
72
- Gen1HoodEdges::Mapper,
73
- Gen1HoodEdges::Reducer
74
- ).run
@@ -1,72 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong/script'
4
-
5
- #
6
- # Use this script to do a Breadth-First Search (BFS) of a graph.
7
- #
8
- # Usage:
9
- # ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
10
- #
11
- # For example, given an edge list in the file '1path.tsv' that looks like
12
- # 1path n1 n2
13
- # 1path n1 n3
14
- # ... and so forth ...
15
- # you can run
16
- # for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
17
- # to do a 9-deep breadth-first search.
18
- #
19
- module Gen1HoodEdges
20
- class Mapper < Wukong::Streamer::RecordStreamer
21
- def initialize
22
- @head = Settings[:head]
23
- @tail = Settings[:tail]
24
- end
25
- def process rsrc, *nodes
26
- yield [ nodes.last, 'i', nodes[0..-2] ] if (rsrc == self.head)
27
- yield [ nodes.first, 'o', nodes[1..-1] ] if (rsrc == self.tail)
28
- end
29
- end
30
-
31
- #
32
- # Accumulate ( !!in memory!!) all inbound links onto middle node
33
- #
34
- # Then for each outbound link, loop over those inbound links and emit the
35
- # triple (in, mid,out)
36
- #
37
- class Reducer < Wukong::Streamer::AccumulatingReducer
38
- attr_accessor :paths_in, :out_rsrc
39
- def initialize
40
- self.out_rsrc = Settings[:out_rsrc]
41
- end
42
- # clear the list of incoming paths
43
- def start! *args
44
- self.paths_in = []
45
- end
46
- def accumulate mid, dir, *nodes
47
- case dir
48
- when 'i'
49
- self.paths_in << nodes
50
- if (self.paths_in.length % 1000 == 0) && (self.paths_in.length > 10000)
51
- $stderr.puts ["Accumulating:", mid, self.paths_in.length].join("\t")
52
- end
53
- when 'o'
54
- paths_in.each do |path_in|
55
- yield [self.out_rsrc, path_in, mid, *nodes]
56
- end
57
- end
58
- end
59
- def finalize
60
- end
61
- def get_key mid, *_
62
- mid
63
- end
64
- end
65
- end
66
-
67
- # Execute the script
68
- Wukong.run(
69
- Gen1HoodEdges::Mapper,
70
- Gen1HoodEdges::Reducer,
71
- :sort_fields => 2, :partition_fields => 1
72
- )