wukong 2.0.2 → 3.0.0.pre

Files changed (268)
  1. data/.document +5 -0
  2. data/.gitignore +46 -0
  3. data/.gitmodules +3 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +12 -0
  6. data/.yardopts +19 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +3 -0
  9. data/Guardfile +14 -0
  10. data/NOTES-travis.md +31 -0
  11. data/README.md +422 -0
  12. data/Rakefile +12 -0
  13. data/TODO.md +99 -0
  14. data/VERSION +1 -0
  15. data/bin/hdp-cp +0 -0
  16. data/bin/wu-flow +10 -0
  17. data/bin/wu-map +17 -0
  18. data/bin/wu-red +17 -0
  19. data/bin/wukong +17 -0
  20. data/data/CREDITS.md +355 -0
  21. data/data/graph/airfares.tsv +2174 -0
  22. data/data/text/gift_of_the_magi.txt +225 -0
  23. data/data/text/jabberwocky.txt +36 -0
  24. data/data/text/rectification_of_names.txt +33 -0
  25. data/{spec/data → data/twitter}/a_atsigns_b.tsv +0 -0
  26. data/{spec/data → data/twitter}/a_follows_b.tsv +0 -0
  27. data/{spec/data → data/twitter}/tweet.tsv +0 -0
  28. data/{spec/data → data/twitter}/twitter_user.tsv +0 -0
  29. data/data/wikipedia/dbpedia-sentences.tsv +1000 -0
  30. data/examples/dataflow.rb +28 -0
  31. data/examples/{server_logs/logline.rb → dataflow/apache_log_line.rb} +28 -18
  32. data/examples/dataflow/complex.rb +11 -0
  33. data/examples/dataflow/donuts.rb +13 -0
  34. data/examples/dataflow/parse_apache_logs.rb +16 -0
  35. data/examples/dataflow/pig_latinizer.rb +16 -0
  36. data/examples/dataflow/simple.rb +12 -0
  37. data/examples/dataflow/telegram.rb +45 -0
  38. data/examples/examples_helper.rb +9 -0
  39. data/examples/graph/minimum_spanning_tree.rb +73 -0
  40. data/examples/graph/union_find.rb +62 -0
  41. data/examples/text/latinize_text.rb +0 -0
  42. data/examples/text/pig_latin.rb +35 -0
  43. data/examples/tiny_count.rb +8 -0
  44. data/examples/tiny_count/jabberwocky_output.tsv +92 -0
  45. data/examples/twitter/locations.rb +29 -0
  46. data/examples/twitter/models.rb +24 -0
  47. data/examples/twitter/pt1-fiddle.pig +8 -0
  48. data/examples/twitter/pt2-simple_parse.pig +31 -0
  49. data/examples/twitter/pt2-simple_parse.rb +18 -0
  50. data/examples/twitter/pt3-join_on_zips.pig +39 -0
  51. data/examples/twitter/pt4-strong_links.rb +20 -0
  52. data/examples/twitter/pt5-lnglat_and_strong_links.pig +16 -0
  53. data/examples/twitter/states.tsv +50 -0
  54. data/examples/word_count.rb +36 -63
  55. data/examples/workflow/cherry_pie.md +104 -0
  56. data/examples/workflow/cherry_pie.rb +66 -0
  57. data/examples/workflow/fiddle.rb +24 -0
  58. data/examples/workflow/package_gem.rb +55 -0
  59. data/lib/{wukong/encoding.rb → away/escapement.rb} +0 -0
  60. data/lib/away/exe.rb +11 -0
  61. data/lib/away/experimental.rb +5 -0
  62. data/lib/away/from_file.rb +52 -0
  63. data/lib/away/job.rb +56 -0
  64. data/lib/away/job/rake_compat.rb +17 -0
  65. data/lib/away/registry.rb +79 -0
  66. data/lib/away/runner.rb +276 -0
  67. data/lib/away/runner/execute.rb +121 -0
  68. data/lib/away/script.rb +161 -0
  69. data/lib/away/script/hadoop_command.rb +240 -0
  70. data/lib/away/source/file_list_source.rb +15 -0
  71. data/lib/away/source/looper.rb +18 -0
  72. data/lib/away/task.rb +219 -0
  73. data/lib/hanuman.rb +9 -0
  74. data/lib/hanuman/action.rb +21 -0
  75. data/lib/hanuman/chain.rb +4 -0
  76. data/lib/hanuman/graph.rb +51 -0
  77. data/lib/hanuman/graphviz.rb +74 -0
  78. data/lib/hanuman/graphvizzer.rb +185 -0
  79. data/lib/hanuman/resource.rb +6 -0
  80. data/lib/hanuman/slot.rb +87 -0
  81. data/lib/hanuman/slottable.rb +220 -0
  82. data/lib/hanuman/stage.rb +51 -0
  83. data/lib/wukong.rb +31 -17
  84. data/lib/wukong/bad_record.rb +13 -16
  85. data/lib/wukong/dataflow.rb +103 -0
  86. data/lib/wukong/event.rb +44 -0
  87. data/lib/wukong/local_runner.rb +55 -0
  88. data/lib/wukong/mapred.rb +3 -0
  89. data/lib/wukong/model/faker.rb +136 -0
  90. data/lib/wukong/processor.rb +142 -0
  91. data/lib/wukong/settings.rb +0 -0
  92. data/lib/wukong/universe.rb +48 -0
  93. data/lib/wukong/version.rb +3 -0
  94. data/lib/wukong/widget/filter.rb +81 -0
  95. data/lib/wukong/widget/gibberish.rb +123 -0
  96. data/lib/wukong/widget/monitor.rb +26 -0
  97. data/lib/wukong/widget/reducer.rb +66 -0
  98. data/lib/wukong/widget/sink.rb +58 -0
  99. data/lib/wukong/widget/source.rb +120 -0
  100. data/lib/wukong/widget/stringifier.rb +50 -0
  101. data/lib/wukong/workflow.rb +22 -0
  102. data/lib/wukong/workflow/command.rb +42 -0
  103. data/old/config/emr-example.yaml +48 -0
  104. data/{examples → old/examples}/README.txt +0 -0
  105. data/{examples → old/examples}/contrib/jeans/README.markdown +0 -0
  106. data/{examples → old/examples}/contrib/jeans/data/normalized_sizes +0 -0
  107. data/{examples → old/examples}/contrib/jeans/data/orders.tsv +0 -0
  108. data/{examples → old/examples}/contrib/jeans/data/sizes +0 -0
  109. data/{examples → old/examples}/contrib/jeans/normalize.rb +0 -0
  110. data/{examples → old/examples}/contrib/jeans/sizes.rb +0 -0
  111. data/old/examples/corpus/bnc_word_freq.rb +44 -0
  112. data/{examples → old/examples}/corpus/bucket_counter.rb +0 -0
  113. data/{examples → old/examples}/corpus/dbpedia_abstract_to_sentences.rb +0 -0
  114. data/{examples → old/examples}/corpus/sentence_bigrams.rb +0 -0
  115. data/{examples → old/examples}/corpus/sentence_coocurrence.rb +0 -0
  116. data/old/examples/corpus/stopwords.rb +138 -0
  117. data/{examples → old/examples}/corpus/words_to_bigrams.rb +0 -0
  118. data/{examples → old/examples}/emr/README.textile +0 -0
  119. data/{examples → old/examples}/emr/dot_wukong_dir/credentials.json +0 -0
  120. data/{examples → old/examples}/emr/dot_wukong_dir/emr.yaml +0 -0
  121. data/{examples → old/examples}/emr/dot_wukong_dir/emr_bootstrap.sh +0 -0
  122. data/{examples → old/examples}/emr/elastic_mapreduce_example.rb +0 -0
  123. data/{examples → old/examples}/network_graph/adjacency_list.rb +0 -0
  124. data/{examples → old/examples}/network_graph/breadth_first_search.rb +0 -0
  125. data/{examples → old/examples}/network_graph/gen_2paths.rb +0 -0
  126. data/{examples → old/examples}/network_graph/gen_multi_edge.rb +0 -0
  127. data/{examples → old/examples}/network_graph/gen_symmetric_links.rb +0 -0
  128. data/{examples → old/examples}/pagerank/README.textile +0 -0
  129. data/{examples → old/examples}/pagerank/gen_initial_pagerank_graph.pig +0 -0
  130. data/{examples → old/examples}/pagerank/pagerank.rb +0 -0
  131. data/{examples → old/examples}/pagerank/pagerank_initialize.rb +0 -0
  132. data/{examples → old/examples}/pagerank/run_pagerank.sh +0 -0
  133. data/{examples → old/examples}/sample_records.rb +0 -0
  134. data/{examples → old/examples}/server_logs/apache_log_parser.rb +0 -4
  135. data/{examples → old/examples}/server_logs/breadcrumbs.rb +0 -0
  136. data/{examples → old/examples}/server_logs/nook.rb +0 -0
  137. data/{examples → old/examples}/server_logs/nook/faraday_dummy_adapter.rb +0 -0
  138. data/{examples → old/examples}/server_logs/user_agent.rb +0 -0
  139. data/{examples → old/examples}/simple_word_count.rb +0 -0
  140. data/{examples → old/examples}/size.rb +0 -0
  141. data/{examples → old/examples}/stats/avg_value_frequency.rb +0 -0
  142. data/{examples → old/examples}/stats/binning_percentile_estimator.rb +0 -0
  143. data/{examples → old/examples}/stats/data/avg_value_frequency.tsv +0 -0
  144. data/{examples → old/examples}/stats/rank_and_bin.rb +0 -0
  145. data/{examples → old/examples}/stupidly_simple_filter.rb +0 -0
  146. data/old/examples/word_count.rb +75 -0
  147. data/old/graph/graphviz_builder.rb +580 -0
  148. data/old/graph_easy/Attributes.pm +4181 -0
  149. data/old/graph_easy/Graphviz.pm +2232 -0
  150. data/old/wukong.rb +18 -0
  151. data/{lib → old}/wukong/and_pig.rb +0 -0
  152. data/old/wukong/bad_record.rb +18 -0
  153. data/{lib → old}/wukong/datatypes.rb +0 -0
  154. data/{lib → old}/wukong/datatypes/enum.rb +0 -0
  155. data/{lib → old}/wukong/datatypes/fake_types.rb +0 -0
  156. data/{lib → old}/wukong/decorator.rb +0 -0
  157. data/{lib → old}/wukong/encoding/asciize.rb +0 -0
  158. data/{lib → old}/wukong/extensions.rb +0 -0
  159. data/{lib → old}/wukong/extensions/array.rb +0 -0
  160. data/{lib → old}/wukong/extensions/blank.rb +0 -0
  161. data/{lib → old}/wukong/extensions/class.rb +0 -0
  162. data/{lib → old}/wukong/extensions/date_time.rb +0 -0
  163. data/{lib → old}/wukong/extensions/emittable.rb +0 -0
  164. data/{lib → old}/wukong/extensions/enumerable.rb +0 -0
  165. data/{lib → old}/wukong/extensions/hash.rb +0 -0
  166. data/{lib → old}/wukong/extensions/hash_keys.rb +0 -0
  167. data/{lib → old}/wukong/extensions/hash_like.rb +0 -0
  168. data/{lib → old}/wukong/extensions/hashlike_class.rb +0 -0
  169. data/{lib → old}/wukong/extensions/module.rb +0 -0
  170. data/{lib → old}/wukong/extensions/pathname.rb +0 -0
  171. data/{lib → old}/wukong/extensions/string.rb +0 -0
  172. data/{lib → old}/wukong/extensions/struct.rb +0 -0
  173. data/{lib → old}/wukong/extensions/symbol.rb +0 -0
  174. data/{lib → old}/wukong/filename_pattern.rb +0 -0
  175. data/old/wukong/helper.rb +7 -0
  176. data/old/wukong/helper/stopwords.rb +195 -0
  177. data/old/wukong/helper/tokenize.rb +35 -0
  178. data/{lib → old}/wukong/logger.rb +0 -0
  179. data/{lib → old}/wukong/periodic_monitor.rb +0 -0
  180. data/{lib → old}/wukong/schema.rb +0 -0
  181. data/{lib → old}/wukong/script.rb +0 -0
  182. data/{lib → old}/wukong/script/avro_command.rb +0 -0
  183. data/{lib → old}/wukong/script/cassandra_loader_script.rb +0 -0
  184. data/{lib → old}/wukong/script/emr_command.rb +0 -0
  185. data/{lib → old}/wukong/script/hadoop_command.rb +0 -0
  186. data/{lib → old}/wukong/script/local_command.rb +4 -1
  187. data/{lib → old}/wukong/store.rb +0 -0
  188. data/{lib → old}/wukong/store/base.rb +0 -0
  189. data/{lib → old}/wukong/store/cassandra.rb +0 -0
  190. data/{lib → old}/wukong/store/cassandra/streaming.rb +0 -0
  191. data/{lib → old}/wukong/store/cassandra/struct_loader.rb +0 -0
  192. data/{lib → old}/wukong/store/cassandra_model.rb +0 -0
  193. data/{lib → old}/wukong/store/chh_chunked_flat_file_store.rb +0 -0
  194. data/{lib → old}/wukong/store/chunked_flat_file_store.rb +0 -0
  195. data/{lib → old}/wukong/store/conditional_store.rb +0 -0
  196. data/{lib → old}/wukong/store/factory.rb +0 -0
  197. data/{lib → old}/wukong/store/flat_file_store.rb +0 -0
  198. data/{lib → old}/wukong/store/key_store.rb +0 -0
  199. data/{lib → old}/wukong/store/null_store.rb +0 -0
  200. data/{lib → old}/wukong/store/read_thru_store.rb +0 -0
  201. data/{lib → old}/wukong/store/tokyo_tdb_key_store.rb +0 -0
  202. data/{lib → old}/wukong/store/tyrant_rdb_key_store.rb +0 -0
  203. data/{lib → old}/wukong/store/tyrant_tdb_key_store.rb +0 -0
  204. data/{lib → old}/wukong/streamer.rb +8 -0
  205. data/{lib → old}/wukong/streamer/accumulating_reducer.rb +0 -0
  206. data/{lib → old}/wukong/streamer/base.rb +2 -1
  207. data/{lib → old}/wukong/streamer/counting_reducer.rb +0 -0
  208. data/{lib → old}/wukong/streamer/filter.rb +0 -0
  209. data/old/wukong/streamer/instance_streamer.rb +15 -0
  210. data/old/wukong/streamer/json_streamer.rb +21 -0
  211. data/{lib → old}/wukong/streamer/line_streamer.rb +0 -0
  212. data/{lib → old}/wukong/streamer/list_reducer.rb +0 -0
  213. data/{lib → old}/wukong/streamer/rank_and_bin_reducer.rb +0 -0
  214. data/{lib → old}/wukong/streamer/record_streamer.rb +0 -0
  215. data/{lib → old}/wukong/streamer/reducer.rb +0 -0
  216. data/{lib → old}/wukong/streamer/set_reducer.rb +0 -0
  217. data/{lib → old}/wukong/streamer/struct_streamer.rb +0 -0
  218. data/{lib → old}/wukong/streamer/summing_reducer.rb +0 -0
  219. data/{lib → old}/wukong/streamer/uniq_by_last_reducer.rb +0 -0
  220. data/{lib → old}/wukong/typed_struct.rb +0 -0
  221. data/spec/away/encoding_spec.rb +32 -0
  222. data/spec/away/exe_spec.rb +20 -0
  223. data/spec/away/flow_spec.rb +82 -0
  224. data/spec/away/graph_spec.rb +6 -0
  225. data/spec/away/job_spec.rb +15 -0
  226. data/spec/away/rake_compat_spec.rb +9 -0
  227. data/spec/away/script_spec.rb +81 -0
  228. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  229. data/spec/examples/dataflow/parsing_spec.rb +13 -0
  230. data/spec/examples/dataflow/simple_spec.rb +8 -0
  231. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  232. data/spec/examples/graph/minimum_spanning_tree_spec.rb +35 -0
  233. data/spec/examples/text/pig_latin_spec.rb +21 -0
  234. data/spec/examples/workflow/cherry_pie_spec.rb +6 -0
  235. data/spec/hanuman/graph_spec.rb +17 -0
  236. data/spec/hanuman/graphviz_spec.rb +29 -0
  237. data/spec/hanuman/slot_spec.rb +2 -0
  238. data/spec/hanuman/stage_spec.rb +12 -0
  239. data/spec/spec_helper.rb +24 -6
  240. data/spec/support/examples_helper.rb +10 -0
  241. data/spec/support/hanuman_test_helpers.rb +90 -0
  242. data/spec/support/streamer_test_helpers.rb +6 -0
  243. data/spec/support/wukong_test_helpers.rb +43 -0
  244. data/spec/support/wukong_widget_helpers.rb +66 -0
  245. data/spec/wukong/dataflow_spec.rb +65 -0
  246. data/spec/wukong/local_runner_spec.rb +31 -0
  247. data/spec/wukong/model/faker_spec.rb +131 -0
  248. data/spec/wukong/processor_spec.rb +109 -0
  249. data/spec/wukong/runner_spec.rb +12 -0
  250. data/spec/wukong/widget/filter_spec.rb +99 -0
  251. data/spec/wukong/widget/sink_spec.rb +19 -0
  252. data/spec/wukong/widget/source_spec.rb +41 -0
  253. data/spec/wukong/widget/stringifier_spec.rb +51 -0
  254. data/spec/wukong/workflow/command_spec.rb +5 -0
  255. data/wukong.gemspec +36 -277
  256. metadata +421 -165
  257. data/CHANGELOG.textile +0 -106
  258. data/INSTALL.textile +0 -89
  259. data/README.textile +0 -274
  260. data/TODO.textile +0 -11
  261. data/examples/ignore_me/counting.rb +0 -55
  262. data/examples/ignore_me/grouper.rb +0 -71
  263. data/old/cassandra_streaming/berlitz_for_cassandra.textile +0 -22
  264. data/old/cassandra_streaming/client_interface_notes.textile +0 -200
  265. data/old/cassandra_streaming/client_schema.textile +0 -318
  266. data/old/cassandra_streaming/tuning.textile +0 -73
  267. data/spec/wukong/encoding_spec.rb +0 -36
  268. data/spec/wukong/script_spec.rb +0 -80
data/.document ADDED
@@ -0,0 +1,5 @@
+ README.textile
+ lib/**/*.rb
+ bin/*
+ LICENSE
+ examples/*.rb
data/.gitignore ADDED
@@ -0,0 +1,46 @@
+
+
+
+
+
+
+ ## COMPILED
+ ## EDITORS
+ ## OS
+ ## OTHER SCM
+ ## PROJECT::GENERAL
+ ## PROJECT::SPECIFIC
+ *.log
+ *.o
+ *.pyc
+ *.rdb
+ *.so
+ *.swp
+ *.tmproj
+ *_flymake
+ *_flymake.*
+ *private*
+ *~
+ .DS_Store
+ .\#*
+ .bak
+ .bzr
+ .hg
+ .project
+ .settings
+ .svn
+ .yardoc
+ /.rbenv-version
+ Gemfile.lock
+ Icon?
+ REVISION
+ TAGS*
+ \#*
+ a.out
+ coverage
+ doc
+ nohup.out
+ pkg
+ rdoc
+ tmp/*
+ tmtags
data/.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "notes"]
+   path = notes
+   url = git://github.com/infochimps-labs/wukong.wiki.git
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format documentation
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
+ language: ruby
+ rvm:
+   - 1.9.3
+
+ before_install: "git clone -b version_1 git://github.com/infochimps-labs/gorillib.git ~/builds/infochimps-labs/gorillib"
+
+ branches:
+   only:
+     - wukong_ng
+
+ notifications:
+   email: false
data/.yardopts ADDED
@@ -0,0 +1,19 @@
+ --readme README.md
+ --markup markdown
+ -
+ VERSION
+ CHANGELOG.md
+ LICENSE.md
+ README.md
+ notes/INSTALL.md
+ notes/core_concepts.md
+ notes/knife-cluster-commands.md
+ notes/philosophy.md
+ notes/silverware.md
+ notes/style_guide.md
+ notes/tips_and_troubleshooting.md
+ notes/walkthrough-hadoop.md
+ notes/homebase-layout.txt
+
+ notes/*.md
+ notes/*.txt
data/CHANGELOG.md ADDED
@@ -0,0 +1,7 @@
+ ## Version 3: Complete rewrite
+
+ Version 3 is a complete refresh of Wukong. There will probably be a compatibility layer.
+
+ The new version is highly modularized, and built on top of the Hanuman dataflow toolkit.
+
+ The central idea is to assemble your jobs as a stack of decoupled stages. These stages are agnostic to whether they are running in a Hadoop batch job, from the command line, in a Flume decorator, or as middleware in a Hanuman request stack.
data/Gemfile ADDED
@@ -0,0 +1,3 @@
+ source :rubygems
+
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,14 @@
+ # -*- ruby -*-
+
+ format = 'progress' # 'doc' for more verbose, 'progress' for less
+ tags = %w[ ] # builder_spec model_spec
+
+ guard 'rspec', :version => 2, :cli => "--format #{format} #{ tags.map{|tag| "--tag #{tag}"}.join(' ') }" do
+   watch(%r{^spec/.+_spec\.rb$})
+   watch(%r{^examples/(\w+)/(.+)\.rb$}) { |m| "spec/examples/#{m[1]}_spec.rb" }
+   watch(%r{^examples/(\w+)\.rb$}) { |m| "spec/examples/#{m[1]}_spec.rb" }
+   watch(%r{^lib/(.+)/(.+)\.rb$}) { |m| "spec/#{m[1]}/#{m[2]}_spec.rb" }
+   watch(%r{^lib/(\w+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
+   watch('spec/spec_helper.rb') { 'spec' }
+   watch(/spec\/support\/(.+)\.rb/) { 'spec' }
+ end
data/NOTES-travis.md ADDED
@@ -0,0 +1,31 @@
+ # General
+
+ * There is an awkward and unstated need for a dataflow to set its output using `set_output(sink)`.
+   ** `Wukong::LocalRunner` has this hard-coded to `sink(:test_sink)` -- broken.
+
+ * The `map` method is used in dataflows; where does it come from? Found it in `wukong/processor.rb`; does it fit better in `wukong/widget`?
+
+ * A "null" processor AND an "as_is" processor don't both make sense. I think, conceptually, that `Wukong::Processor::Null` === `Wukong::Processor::AsIs` and we should choose only one. A "null" sink might be better suited to the purpose of "rejecting all records", and one already exists in `widget/sink.rb`.
+   ** There are also "all" and "none" filters in `widget/filter.rb`.
+
+ * `Wukong::Filter::All` and `Wukong::Filter::None` are not registered. Should they be? Do they work as they should? It appears not...
+   ** Are these necessary, since `Wukong::Processor::Null` and `Wukong::Processor::AsIs` already exist?
+
+ * In `Wukong::Runner`, why do I have to specify sinks/sources with a name? Are these ever referenced/used in any context later? It seems the runner might not need names...
+
+ * Is there a reason wukong has a config directory with a very outdated yaml file?
+
+ * DONE ~~The Guardfile has a lot of debuggy cruft. Fixit.~~
+
+ * DONE ~~Let's use Bundler to manage the gem instead of Jeweler.~~
+
+ * So many rad 80's references.
+
+ # Specs
+
+ * DONE ~~The graphviz spec could be made into an argument in spec helper, whether to run the file or not.~~
+
+ * There should be a cleanup phase after specs have run to delete artifacts.
+
+ * A lot of examples. Most don't work. Fixit.
+
data/README.md ADDED
@@ -0,0 +1,422 @@
+ # Wukong [![Build Status](https://secure.travis-ci.org/infochimps-labs/wukong.png)](http://travis-ci.org/infochimps-labs/wukong)
+
+ Wukong is a toolkit for rapid, agile development of dataflows at any scale.
+
+ (note: the syntax below is mostly false)
+
+
+ <a name="design"></a>
+ ## Design Overview
+
+ The fundamental principle of Wukong/Hanuman is *powerful black boxes, beautiful glue*. In general, they don't do the thing -- they coordinate the boxes that do the thing, to let you implement rapidly, nimbly and readably. Hanuman elegantly describes high-level data flows; Wukong is a pragmatic collection of dataflow primitives. They both emphasize scalability, readability and rapid development over performance or universality.
+
+ Wukong/Hanuman are chiefly concerned with these specific types of graphs:
+
+ * **dataflow** -- chains of simple modules to handle continuous data processing -- coordinates Flume, Unix pipes, ZeroMQ, Esper, Storm.
+ * **workflows** -- episodic job sequences, joined by dependency links -- comparable to Rake, Azkaban or Oozie.
+ * **map/reduce** -- Hadoop's standard *disordered/partitioned stream > partition, sort & group > process groups* workflow. Comparable to MRJob and Dumbo.
+ * **queue workers** -- pub/sub asynchronously triggered jobs -- comparable to Resque, RabbitMQ/AMQP, Amazon Simple Worker, Heroku workers.
+
+ In addition, wukong stages may be deployed into **http middleware**: lightweight distributed API handlers -- comparable to Rack, Goliath or Twisted.
+
+ When you're describing a Wukong/Hanuman flow, you're writing pure expressive ruby, not some hokey interpreted language or clumsy XML format. Thanks to JRuby, it can speak directly to Java-based components like Hadoop, Flume, Storm or Spark.
+
+ ## What's where
+
+ * Configliere -- Manage settings
+   - Layer - Project settings through a late-resolved stack of config objects.
+ * Gorillib
+   - Type, RecordType
+   - TypeConversion
+   - Model
+   - PathHelpers
+ * Wukong
+   - fs - Abstracts file hdfs s3n s3hdfs scp
+   - streamer - Black-box data transform
+   - job - Workflow definition
+   - flow - Dataflow definition
+   - widgets - Common data transforms
+   - RubyHadoop - Hadoop jobs using streamers
+   - RubyFlume - Flume decorators using streamers
+ * Hanuman -- Elegant small graph assembly
+ * Swineherd -- Common interface on ugly tools
+   - Turn readable hash into safe commandline (param conv, escaping)
+   - Execute command, capture stdin/stderr
+   - Summarize execution with a broham-able hash
+   - Common modules: Input/output, Java, gnu, configliere
+   - Template
+     - Hadoop, pig, flume
+   - ?? Cp, mv, rm, zip, tar, bz2, gz, ssh, scp
+   - ?? Remotely execute command
+
+ <a name="story"></a>
+ ### Story
+
+ [Narrative Method Structure](http://avdi.org/talks/confident-code-rubymidwest-2011/confident-code.html)
+
+ * Gather input
+ * Perform work
+ * Deliver results
+ * Handle failure
+
+
+ <a name="design-rules"></a>
+ ### Design Rules
+
+ * **whiteboard rule**: the user-facing conceptual model should match the picture you would draw on the whiteboard in an engineering discussion. The fundamental goal is to abstract away the necessary messiness surrounding the industrial-strength components it orchestrates while still providing their essential power.
+ * **common cases are simple, complex cases are always possible**: The code should be as simple as the story it tells. For the things you do all the time, you only need to describe how this data flow is different from all other data flows. However, at no point in the project lifecycle should Wukong/Hanuman hit a brick wall or peat bog requiring its total replacement. A complex production system may, for example, require that you replace a critical path with custom Java code -- but that's a small set of substitutions in an otherwise stable, scalable graph. In the world of web programming, Ruby on Rails passes this test; Sinatra and Drupal do not.
+ * **petabyte rule**: Wukong/Hanuman coordinate industrial-strength components that work at terabyte- and petabyte-scale. Conceptual simplicity makes it an excellent tool even for small jobs, but scalability is key. All components must assume an asynchronous, unreliable and distributed system.
+ * **laptop rule**:
+ * **no dark magick**: the core libraries provide *elegant, predictable magic or no magic at all*. We use metaprogramming heavily, but always predictably, and only in service of making common cases simple.
+   - Soupy multi-option `case` statements are a smell.
+   - Complex tasks will require code that is more explicit, but readable and organically connected to the typical usage. For example, many data flows will require a custom `Wukong::Streamer` class; but that class is no more complex than the built-in streamer models and receives all the same sugar methods they do.
+ * **get shit done**: sometimes ugly tasks require ugly solutions. Shelling out to the hadoop process monitor and parsing its output is acceptable if it is robust and obviates the need for a native protocol handler.
+ * **be clever early, boring late**: magic in service of having a terse language for assembling a graph is great. However, the assembled graph should be atomic and largely free of any conditional logic or dependencies.
+   - for example, the data flow `split` statement allows you to set a condition on each branch. The assembled graph, however, is typically a `fanout` stage followed by `filter` stages.
+   - the graph language has some helpers to refer to graph stages. The compiled graph uses explicit, mostly-readable but unambiguous static handles.
+   - some stages offer light polymorphism -- for example, `select` accepts either a regexp or block. This is handled at the factory level, and the resulting stage is free of conditional logic.
+ * **no lock-in**: needless to say, Wukong works seamlessly with the Infochimps platform, making robust, reliable massive-scale dataflows amazingly simple. However, wukong flows are not tied to the cloud: they project to Hadoop, Flume or any of the other open-source components that power our platform.
+
+ __________________________________________________________________________
+
+ <a name="stage"></a>
+ ## Stage
+
+ A graph is composed of `stage`s.
+
+ * *desc* (alias `description`)
+
+ #### Actions
+
+ each action
+
+ * the default action is `call`
+ * all stages respond to `nothing`, and like ze goggles, do `nothing`.
+
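Conceptually, then, a stage is little more than an object with a default `call` action plus a no-op `nothing` action. A hand-rolled sketch of that idea (this is only an illustration, not Hanuman's actual `Stage` class):

```ruby
# Illustration only -- Hanuman's real Stage class is richer than this.
class ToyStage
  attr_reader :desc

  def initialize(desc = nil)
    @desc = desc              # every stage can carry a description
  end

  def call(record)            # the default action: do the work
    record
  end

  def nothing(*)              # like ze goggles: do nothing
  end
end

ToyStage.new('pass-through').call('hello')  #=> "hello"
```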
+ __________________________________________________________________________
+
+ <a name="workflows"></a>
+ ## Workflows
+
+ Wukong workflows work somewhat differently than the Rake-style workflows you may be familiar with.
+
+ In wukong, a stage corresponds to a resource; you can then act on that resource.
+
+ Consider first compiling a C program:
+
+ * to build the executable, run `cc -o cake eggs.o milk.o flour.o sugar.o -I./include -L./lib`
+ * to build files like '{file}.o', run `cc -c -o {file}.o {file}.c -I./include`
+
+ In this case, you define the *steps*, implying the resources.
+
+
+ Something Rake can't do (but we should be able to): make it so I can define a dependency that runs **last**.
+
+ ### Defining jobs
+
+     Wukong.job(:launch) do
+       task :aim do
+         #...
+       end
+       task :enter do
+       end
+       task :commit do
+         # ...
+       end
+     end
+
+     Wukong.job(:recall) do
+       task :smash_with_rock do
+         #...
+       end
+       task :reprogram do
+         # ...
+       end
+     end
+
+ * stages construct resources
+   - these have default actions
+ * hanuman tracks defined order
+
+ * do steps run in order, or is dependency explicit?
+ * what about idempotency?
+
+ * `task` vs `action` vs `resource`; `job`, `task`, `group`, `namespace`.
+
+ ### documenting
+
+ Inline option (`:desc` or `:description`?)
+
+ ```ruby
+ task :foo, :description => "pity the foo" do
+   # ...
+ end
+ ```
+
+ DSL method option
+
+ ```ruby
+ task :foo do
+   description "pity the foo"
+   # ...
+ end
+ ```
+
165
+ ### actions
166
+
167
+ default action:
168
+
169
+ ```ruby
170
+ script 'nukes/launch_codes.rb' do
171
+ # ...
172
+ end
173
+ ```
174
+
175
+ define the `:undo` action:
176
+
177
+ ```ruby
178
+ script 'nukes/launch_codes.rb', :undo do
179
+ # ...
180
+ end
181
+ ```
182
+
+ <a name="file-name-templates"></a>
+ ### File name templates
+
+ * *timestamp*: timestamp of the run. Everything in this invocation will have the same timestamp.
+ * *user*: username; `ENV['USER']` by default
+ * *sources*: basenames of job inputs, minus extension, non-`\w` replaced with '_', joined by '-', max 50 chars.
+ * *job*: job flow name
+
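As a rough illustration of how such token substitution could behave, here is a small sketch; the `expand_path_template` helper, the token table, and the output shown are hypothetical, not Wukong's actual implementation:

```ruby
# Hypothetical sketch of filename-template expansion -- not Wukong's real API.
require 'time'

TOKENS = {
  'timestamp' => ->(job) { job[:timestamp] },  # identical for the whole invocation
  'user'      => ->(job) { job[:user] || ENV['USER'] },
  'sources'   => ->(job) { job[:sources].map { |s| File.basename(s, '.*').gsub(/\W+/, '_') }.join('-')[0, 50] },
  'job'       => ->(job) { job[:name] }
}

def expand_path_template(template, job)
  template.gsub(/:(\w+):/) { TOKENS.fetch($1).call(job) }
end

job = { name: 'count_followers', user: nil, sources: ['users.json'],
        timestamp: Time.now.strftime('%Y%m%d%H%M%S') }
expand_path_template(':job:-:user:-:timestamp:.tsv', job)
# e.g. "count_followers-bob-20110102120011.tsv"
```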
+ <a name="job-versioning-of-clobbered"></a>
+ ### versioning of clobbered files
+
+ * when files are generated or removed, relocate them to a timestamped location
+   - a file `/path/to/file.txt` is relocated to `~/.wukong/backups/path/to/file.txt.wukong-20110102120011`, where `20110102120011` is the [job timestamp](#file-naming)
+   - accepts a `max_size` param
+   - raises if it can't write to the directory -- you must explicitly say `--safe_file_ops=false`
+
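A rough sketch of that relocation behavior, using the backup path and `--safe_file_ops` flag described above; the helper name and error handling are assumptions, not the actual implementation:

```ruby
# Illustrative sketch only -- not the real Wukong file-ops code.
require 'fileutils'

def relocate_clobbered(path, timestamp, backup_root = File.expand_path('~/.wukong/backups'))
  backup = File.join(backup_root, "#{path}.wukong-#{timestamp}")
  FileUtils.mkdir_p(File.dirname(backup))
  FileUtils.mv(path, backup)
  backup
rescue SystemCallError
  # mirrors the rule above: refuse to continue unless --safe_file_ops=false was given
  raise "cannot back up #{path}; pass --safe_file_ops=false to skip backups"
end

# relocate_clobbered('/path/to/file.txt', '20110102120011')
#   #=> "/home/you/.wukong/backups/path/to/file.txt.wukong-20110102120011"
```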
+ <a name="job-running"></a>
+ ### running
+
+ * `clobber` -- run, but clear all dependencies
+ * `undo` --
+ * `clean` --
+
+
+ ### Utility and Filesystem tasks
+
+ The primitives correspond closely to Rake's and Chef's. However, they extend them in many ways, don't cover all of their functionality, and are incompatible in several ways.
+
+ ### Configuration
+
+
+ #### Commandline args
+
+ * handled by configliere: `nukes launch --launch_code=GLG20`
+
+ * TODO: configliere needs context-specific config vars, so I only get information about the `launch` action in the `nukes` job when I run `nukes launch --help`
+
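For flavor, this is roughly how a flag like `--launch_code` is declared and read with Configliere (a sketch; the param name and script are illustrative, not code from Wukong itself):

```ruby
# Rough Configliere usage sketch; param names here are illustrative.
require 'configliere'

Settings.use :commandline
Settings.define :launch_code, :description => 'code required to launch', :default => nil
Settings.resolve!   # parses ARGV, so `--launch_code=GLG20` lands in Settings

puts Settings[:launch_code]   #=> "GLG20"
```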
+
+
+ __________________________________________________________________________
+
+ <a name="dataflows"></a>
+ ## Dataflows
+
+
+ Data flows:
+
+ * you can have a consumer connect to a provider, or vice versa
+   - producer binds to a port, consumers connect to it: pub/sub
+   - consumers open a port, producer connects to many: megaphone
+
+ * you can bring the provider online first, and the consumers later, or vice versa.
+
+
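To make the pub/sub case concrete, here is a small sketch assuming the `ffi-rzmq` gem (one of the ZeroMQ Ruby bindings listed in the references below); the endpoint and message are made up:

```ruby
# Sketch of "producer binds, consumers connect" pub/sub with the ffi-rzmq gem (assumed).
require 'ffi-rzmq'

context = ZMQ::Context.new

# producer binds to a port...
pub = context.socket(ZMQ::PUB)
pub.bind('tcp://*:5556')

# ...consumers connect to it and subscribe
sub = context.socket(ZMQ::SUB)
sub.connect('tcp://127.0.0.1:5556')
sub.setsockopt(ZMQ::SUBSCRIBE, '')

sleep 0.1   # give the subscription a moment to register; pub/sub drops earlier messages

pub.send_string('hello, dataflow')
message = ''
sub.recv_string(message)   # fills `message` with "hello, dataflow"
```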
+ <a name="dataflow-syntax"></a>
+ ## Syntax
+
+ **note: this is a scratch pad; the actual syntax is evolving rapidly and currently looks not much like the following**
+
+     read('/foo/bar')       # source( FileSource.new('/foo/bar') )
+     writes('/foo/bar')     # sink( FileSink.new('/foo/bar') )
+
+     ... | file('/foo/bar') # this we know is a sink
+     file('/foo/bar') | ... # this we know is a source
+     file('/foo/bar')       # don't know; maybe we can guess later
+
+ Here is an example Wukong script, `count_followers.rb`:
+
+     from :json
+
+     mapper do |user|
+       year_month = Time.parse(user[:created_at]).strftime("%Y%m")
+       emit [ user[:followers_count], year_month ]
+     end
+
+     reducer do
+       start{ @count = 0 }
+
+       each do |followers_count, year_month|
+         @count += 1
+       end
+
+       finally{ emit [*@group_key, @count] }
+     end
+
+ You can run this from the commandline:
+
+     wukong count_followers.rb users.json followers_histogram.tsv
+
+ It will run in local mode, effectively doing:
+
+     cat users.json | {the map block} | sort | {the reduce block} > followers_histogram.tsv
+
+ You can instead run it in Hadoop mode, and it will launch the job across a distributed Hadoop cluster:
+
+     wukong --run=hadoop count_followers.rb users.json followers_histogram.tsv
+
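To see what that local-mode pipeline amounts to, here is a plain-Ruby rendering of the same map, sort and reduce steps (purely illustrative; it is not the wukong runner, and it assumes one JSON user per input line):

```ruby
# Illustrative stand-in for `cat users.json | map | sort | reduce > followers_histogram.tsv`.
require 'json'
require 'time'

# map: one [followers_count, year_month] pair per user
mapped = File.readlines('users.json').map do |line|
  user = JSON.parse(line, :symbolize_names => true)
  [user[:followers_count], Time.parse(user[:created_at]).strftime('%Y%m')]
end

# sort & group, then reduce: count records per group key
counts = Hash.new(0)
mapped.sort.each { |followers_count, year_month| counts[[followers_count, year_month]] += 1 }

File.open('followers_histogram.tsv', 'w') do |file|
  counts.each { |(followers_count, year_month), count| file.puts [followers_count, year_month, count].join("\t") }
end
```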
+ <a name="formatters"></a>
+ #### Data Formats (Serialization / Deserialization)
+
+ * tsv/csv
+ * json
+ * xml
+ * avro
+ * apache_log
+ * flat
+ * regexp
+ * [Tagged Netstrings](http://tnetstrings.org/)
+ * [ZeroMQ Property Language](http://rfc.zeromq.org/spec:4)
+
+ * gz/bz2/zip/snappy
+
+ <a name="data-packets"></a>
+ #### Data Packets
+
+ Data consists of:
+
+ - record
+ - schema
+ - metadata
+
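One plausible way to picture that packet shape in Ruby (hypothetical; the field contents are invented for illustration):

```ruby
# Hypothetical data-packet shape: a record plus its schema and metadata.
DataPacket = Struct.new(:record, :schema, :metadata)

packet = DataPacket.new(
  { :text => 'Twas brillig, and the slithy toves' },   # record
  :tweet,                                              # schema (here, just a name)
  { :source => 'twitter', :received_at => Time.now }   # metadata
)
packet.schema  #=> :tweet
```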
+ ## Delivery Guarantees
+
+ Most messaging systems keep metadata about what messages have been consumed on the broker. That is, as a message is handed out to a consumer, the broker records that fact locally. This is a fairly intuitive choice, and indeed for a single machine server it is not clear where else it could go. Since the data structures used for storage in many messaging systems scale poorly, this is also a pragmatic choice -- since the broker knows what is consumed it can immediately delete it, keeping the data size small.
+
+ What is perhaps not obvious is that getting the broker and consumer to come into agreement about what has been consumed is not a trivial problem. If the broker records a message as consumed immediately every time it is handed out over the network, then if the consumer fails to process the message (say because it crashes or the request times out or whatever) that message will be lost. To solve this problem, many messaging systems add an acknowledgement feature: messages are only marked as sent, not consumed, when they are handed out; the broker waits for a specific acknowledgement from the consumer to record the message as consumed. This strategy fixes the problem of losing messages, but creates new problems. First of all, if the consumer processes the message but fails before it can send an acknowledgement, the message will be consumed twice. The second problem is around performance: now the broker must keep multiple states about every single message (first to lock it so it is not given out a second time, and then to mark it as permanently consumed so that it can be removed). Tricky problems must be dealt with, like what to do with messages that are sent but never acknowledged.
+
+ So clearly there are multiple possible message delivery guarantees that could be provided:
+
+ * At most once—this handles the first case described. Messages are immediately marked as consumed, so they can't be given out twice, but many failure scenarios may lead to losing messages.
+ * At least once—this is the second case, where we guarantee each message will be delivered at least once, but in failure cases may be delivered twice.
+ * Exactly once—this is what people actually want: each message is delivered once and only once.
+
+
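A toy sketch of why the placement of the "mark consumed" step determines the guarantee; the broker here is a made-up in-memory stand-in, not any real messaging API:

```ruby
# Toy illustration: at-most-once marks before processing, at-least-once acknowledges after.
class ToyBroker
  def initialize(messages)
    @pending = messages.dup
  end

  def dequeue
    @pending.first
  end

  def mark_consumed(msg)
    @pending.delete(msg)
  end
  alias acknowledge mark_consumed
end

def at_most_once(broker)
  msg = broker.dequeue
  broker.mark_consumed(msg)   # marked first: a crash during processing loses the message
  yield msg
end

def at_least_once(broker)
  msg = broker.dequeue
  yield msg                   # a crash before the ack means the broker redelivers (and we process twice)
  broker.acknowledge(msg)
end

broker = ToyBroker.new(%w[m1 m2])
at_least_once(broker) { |m| puts "processed #{m}" }
```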
+ __________________________________________________________________________
+
+ <a name="design-questions"></a>
+ ## Design Questions
+
+ * **filename helpers**:
+   - `':data_dir:/this/that/:script:-:user:-:timestamp:.:ext:'`?
+   - `path_to(:data_dir, 'this/that', "???")`?
+
+ * `class Wukong::Foo::Base` vs `class Wukong::Foo`
+   - the latter is more natural, and still allows
+   - I'd like
+
+
+
+ __________________________________________________________________________
+ __________________________________________________________________________
+ __________________________________________________________________________
+
+
+ <a name="references"></a>
+ ## References
+
+ <a name="refs-workflow"></a>
+ ### Workflow
+
+ * **Rake**
+
+   - [Rake Docs](http://rdoc.info/gems/rake/file/README.rdoc)
+   - [Rake Tutorial](http://jasonseifer.com/2010/04/06/rake-tutorial) by Jason Seifer -- 2010, with a good overview of why Rake is useful
+   - [Rake Tutorial](http://martinfowler.com/articles/rake.html) by Martin Fowler -- from 2005, so may lack some modernities
+   - [Rake Tutorial](http://onestepback.org/index.cgi/Tech/Rake/Tutorial/RakeTutorialRules.red) -- from 2005, so may lack some modernities
+
+ * **Rake Examples**
+
+   - [resque's redis.rake](https://github.com/defunkt/resque/blob/master/lib/tasks/redis.rake) and [resque/tasks](https://github.com/defunkt/resque/blob/master/lib/resque/tasks.rb)
+   - [rails' Rails Ties](https://github.com/rails/rails/tree/master/railties/lib/rails/tasks)
+
+ * **Thor**
+
+   - [Thor Wiki](https://github.com/wycats/thor/wiki)
+   -
+
+ * **Chef**
+
+   - [Chef Wiki](http://wiki.opscode.com/display/chef/Home)
+   - specifically, [Chef Resources](http://wiki.opscode.com/display/chef/Resources)
+
+ * **Other**
+
+   - [**Gradle**](http://gradle.org/) -- a modern take on `ant` + `maven`. The [Gradle overview](http://gradle.org/overview) states its case.
+
+ <a name="refs-dataflow"></a>
+ ### Dataflow
+
+ * **Esper**
+
+   - Must read: [StreamSQL Event Processing with Esper](http://www.igvita.com/2011/05/27/streamsql-event-processing-with-esper/)
+   - [Esper docs](http://esper.codehaus.org/esper-4.5.0/doc/reference/en/html_single/index.html#epl_clauses)
+   - [Esper EPL Reference](http://esper.codehaus.org/esper-4.5.0/doc/reference/en/html_single/index.html#epl_clauses)
+
+ * **Storm**
+
+   - [A Storm is coming: more details and plans for release](http://engineering.twitter.com/2011/08/storm-is-coming-more-details-and-plans.html)
+   - [Storm: distributed and fault-tolerant realtime computation](http://www.slideshare.net/nathanmarz/storm-distributed-and-faulttolerant-realtime-computation) -- slideshare presentation
+   - [Storm: the Hadoop of Realtime Processing](http://tech.backtype.com/preview-of-storm-the-hadoop-of-realtime-proce)
+
+ * **Kafka**: LinkedIn's high-throughput messaging queue
+
+   - [Kafka's Design: Why we built this](http://incubator.apache.org/kafka/design.html)
+
+ * **ZeroMQ**: tcp sockets like you think they should work
+
+   - [ZeroMQ: A Modern & Fast Networking Stack](http://www.igvita.com/2010/09/03/zeromq-modern-fast-networking-stack/)
+   - [ZeroMQ Guide](http://zguide.zeromq.org/page:all)
+   - [ZeroMQ: An Introduction](http://nichol.as/zeromq-an-introduction)
+   - [Routing with Ruby & ZeroMQ Devices](http://www.igvita.com/2010/11/17/routing-with-ruby-zeromq-devices/)
+   - [Ruby bindings for ZeroMQ](http://zeromq.github.com/rbzmq/) and the [Ruby-FFI bindings](http://www.zeromq.org/bindings:ruby-ffi)
+   - [Learn ruby ZeroMQ](https://github.com/andrewvc/learn-ruby-zeromq) by @andrewvc
+
+ * **Other**
+
+   - [Infopipes: An abstraction for multimedia streaming](http://web.cecs.pdx.edu/~black/publications/Mms062%203rd%20try.pdf) Black et al 2002
+   - [Yahoo Pipes](http://pipes.yahoo.com/pipes/)
+   - [Yahoo Pipes wikipedia page](http://en.wikipedia.org/wiki/Yahoo_Pipes)
+   - [Streambase](http://www.streambase.com/products/streambasecep/faqs/) -- Why is it so goddamn hard to find out anything real about a project once it gets an enterprise version? Seriously, the consistent fundamental brokenness of enterprise product is astonishing. It's like they take inspiration from shitty major-label band websites but layer a whiteout of [web jargon bullshit](http://www.dack.com/web/bullshit.html) in place of inessential flash animation. Anyway I think Streambase is kinda similar but who the hell can tell.
+   - [Scribe](http://www.cloudera.com/blog/2008/11/02/configuring-and-using-scribe-for-hadoop-log-collection/)
+   - [Splunk Case Study](http://www.igvita.com/2008/10/22/distributed-logging-syslog-ng-splunk/)
+
+ <a name="refs-messaging-queue"></a>
+ ### Messaging Queue
+
+ - [DripDrop](https://github.com/andrewvc/dripdrop) - a message passing library with a unified API abstracting HTTP, ZeroMQ and websockets.
+
+
+ <a name="refs-data-processing"></a>
+ ### Data Processing
+
+ * **Hadoop**
+
+   - [Hadoop]()
+
+
+ * **Spark/Mesos**
+
+   - [Mesos](http://www.mesosproject.org/)