ul-wukong 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,1308 @@
1
+ # Wukong
2
+
3
+ **NOTE: This is a divergent fork of https://github.com/infochimps-labs/wukong. The gem has been renamed to ul-wukong.**
4
+
5
+ Wukong is a toolkit for rapid, agile development of data applications
6
+ at any scale.
7
+
8
+ The core concept in Wukong is a **Processor**. Wukong processors are
9
+ simple Ruby classes that do one thing and do it well. This codebase
10
+ implements processors and other core Wukong classes and provides a way
11
+ to run and combine processors on the command-line.
12
+
13
+ Wukong's larger theme is *powerful black boxes, beautiful glue*. The
14
+ Wukong ecosystem consists of other tools which run Wukong processors
15
+ in various topologies across a variety of different backends. Code
16
+ written in Wukong can be easily ported between environments and
17
+ frameworks: local command-line scripts on your laptop instantly turn
18
+ into powerful jobs running in Hadoop.
19
+
20
+ Here is a list of various other projects which you may also want to
21
+ peruse when trying to understand the full Wukong experience:
22
+
23
+ * <a href="http://github.com/infochimps-labs/wukong-hadoop">wukong-hadoop</a>: Run Wukong processors as mappers and reducers within the Hadoop framework. Model Hadoop jobs locally before you run them.
24
+ * <a href="http://github.com/infochimps-labs/wukong-storm">wukong-storm</a>: Run Wukong processors within the Storm framework. Model flows locally before you run them.
25
+ * <a href="http://github.com/infochimps-labs/wukong-load">wukong-load</a>: Load the output data from your local Wukong jobs and flows into a variety of different data stores.
26
+ * <a href="http://github.com/infochimps-labs/wonderdog">wonderdog</a>: Connect Wukong processors running within Hadoop to Elasticsearch as either a source or sink for data.
27
+ * <a href="http://github.com/infochimps-labs/wukong-deploy">wukong-deploy</a>: Orchestrate Wukong and other wu-tools together to support an application running on the Infochimps Platform.
28
+
29
+ For a more holistic perspective also see the Infochimps Platform
30
+ Community Edition (**FIXME: link to this**) which combines all the
31
+ Wukong tools together into a jetpack which fits comfortably over the
32
+ shoulders of developers.
33
+
34
+ <a name="processors"></a>
35
+ ## Writing Simple Processors
36
+
37
+ The fundamental unit of computation in Wukong is the processor. A
38
+ processor is a Ruby class which
39
+
40
+ * subclasses `Wukong::Processor` (use the `Wukong.processor` method as sugar for this)
41
+ * defines a `process` method which takes an input record, does something, and calls `yield` on the output
42
+
43
+ Here's a processor that reverses each of its input records:
44
+
45
+ ```ruby
46
+ # in string_reverser.rb
47
+ Wukong.processor(:string_reverser) do
48
+ def process string
49
+ yield string.reverse
50
+ end
51
+ end
52
+ ```
53
+
54
+ You can run this processor on the command line using text files as
55
+ input using the `wu-local` tool that comes with Wukong:
56
+
57
+ ```
58
+ $ cat novel.txt
59
+ It was the best of times, it was the worst of times.
60
+ ...
61
+
62
+ $ cat novel.txt | wu-local string_reverser.rb
63
+ .semit fo tsrow eht saw ti ,semit fo tseb eht saw tI
64
+ ```
65
+
66
+ The `wu-local` program consumes one line at a time from STDIN and
67
+ calls your processor's `process` method with that line as a Ruby
68
+ String object. Each object you `yield` within your process method
69
+ will be printed back out on STDOUT.
70
+
71
+ ### Multiple Processors, Multiple (Or No) Yields
72
+
73
+ Processors are intended to be combined so they can be stored in the
74
+ same file like these two, related processors:
75
+
76
+ ```ruby
77
+ # in processors.rb
78
+
79
+ Wukong.processor(:splitter) do
80
+ def process line
81
+ line.split.each { |token| yield token }
82
+ end
83
+ end
84
+
85
+ Wukong.processor(:normalizer) do
86
+ def process token
87
+ stripped = token.downcase.gsub(/\W/,'')
88
+ yield stripped if stripped.size > 0
89
+ end
90
+ end
91
+ ```
92
+
93
+ Notice how the `splitter` yields multiple tokens for each of its input
94
+ lines and that the `normalizer` may sometimes never yield at all,
95
+ depending on its input. Processors are under no obligation by the
96
+ framework to yield or return anything so they can easily act as
97
+ filters or even sinks in data flows.
98
+
99
+ There are two processors in this file and neither shares a name with
100
+ the basename of the file ("processors") so `wu-local` can't
101
+ automatically choose a processor to run. We can specify one
102
+ explicitly with the `--run` option:
103
+
104
+ ```
105
+ $ cat novel.txt | wu-local processors.rb --run=splitter
106
+ It
107
+ was
108
+ the
109
+ best
110
+ of
111
+ times,
112
+ ...
113
+ ```
114
+
115
+ We can combine the two processors together
116
+
117
+ ```
118
+ $ cat novel.txt | wu-local processors.rb --run=splitter | wu-local processors.rb --run=normalizer
119
+ it
120
+ was
121
+ the
122
+ best
123
+ of
124
+ times
125
+ ...
126
+ ```
127
+
128
+ but there's an easier way of doing this with <a href="#flows">dataflows</a>.
129
+
130
+ ### Adding Configurable Options
131
+
132
+ Processors can have options that can be set in Ruby code, from the
133
+ command-line, a configuration file, or a variety of other places
134
+ thanks to [Configliere](http://github.com/infochimps-labs/configliere).
135
+
136
+ This processor calculates percentiles from observations assuming a
137
+ normal distribution given a particular mean and standard deviation.
138
+ It uses two *fields*, the mean or average of a distribution (`mean`)
139
+ and its standard deviation (`std_dev`). From this information, it
140
+ will measure the percentile of all input values.
141
+
142
+ ```ruby
143
+ # in percentile.rb
144
+ Wukong.processor(:percentile) do
145
+
146
+ SQRT_1_HALF = Math.sqrt(0.5)
147
+
148
+ field :mean, Float, :default => 0.0
149
+ field :std_dev, Float, :default => 1.0
150
+
151
+ def process value
152
+ observation = value.to_f
153
+ z_score = (mean - observation) / std_dev
154
+ percentile = 50 * Math.erfc(z_score * SQRT_1_HALF)
155
+ yield [observation, percentile].join("\t")
156
+ end
157
+ end
158
+ ```
159
+
160
+ These fields have default values but you can override them on the
161
+ command line. If you scored a 95 on an exam where the mean score was
162
+ 80 points and the standard deviation of the scores was 10 points, for
163
+ example, then you'd be in the 93rd percentile:
164
+
165
+ ```
166
+ $ echo 95 | wu-local /tmp/percentile.rb --mean=80 --std_dev=10
167
+ 95.0 93.3192798731142
168
+ ```
169
+
170
+ If the exam were more difficult, with a mean of 75 points and a
171
+ standard deviation of 8 points, you'd be in the 99th percentile!
172
+
173
+ ```
174
+ $ echo 95 | wu-local /tmp/percentile.rb --mean=75 --std_dev=8
175
+ 95.0 99.37903346742239
176
+ ```
177
+
178
+ ### The Lifecycle of a Processor
179
+
180
+ Processors have a lifecycle that they execute when they are run within
181
+ the context of a Wukong runner like `wu-local` or `wu-hadoop`. Each
182
+ lifecycle phase corresponds to a method of the processor that is
183
+ called:
184
+
185
+ * `setup` called *after* the Processor is initialized but *before* the first record is processed. You cannot yield from this method.
186
+ * `process` called once for each input record, may yield once, many, or no times.
187
+ * `finalize` called after the *last* record has been processed but while the processor still has an opportunity to yield records.
188
+ * `stop` called to signal to the processor that all work should stop, open connections should be closed, &c. You cannot yield from this method.
189
+
190
+ The above examples have already focused on the `process` method.
191
+
192
+ The `setup` and `stop` methods are often used together to handle
193
+ external connections
194
+
195
+ ```ruby
196
+ # in geolocator.rb
197
+ Wukong.processor(:geolocator) do
198
+ field :host, String, :default => 'localhost'
199
+ attr_accessor :connection
200
+
201
+ def setup
202
+ self.connection = Database::Connection.new(host)
203
+ end
204
+ def process record
205
+ record.added_value = connection.find("...some query...")
206
+ end
207
+ def stop
208
+ self.connection.close
209
+ end
210
+ end
211
+ ```
212
+
213
+ The `finalize` method is most useful when writing a "reduce"-type
214
+ operation that involves storing or aggregating information till some
215
+ criterion is met. It will always be called after the last record has
216
+ been given (to `process`) but you can call it whenever you want to
217
+ within your own code.
218
+
219
+ Here's an example of using the `finalize` method to implement a simple
220
+ counter that counts all the input records:
221
+
222
+ ```ruby
223
+ # in counter.rb
224
+ Wukong.processor(:counter) do
225
+ attr_accessor :count
226
+ def setup
227
+ self.count = 0
228
+ end
229
+ def process thing
230
+ self.count += 1
231
+ end
232
+ def finalize
233
+ yield count
234
+ end
235
+ end
236
+ ```
237
+
238
+ It hinges on the fact that the last input record will be passed to
239
+ `process` *first* and only then will `finalize` be called. This
240
+ allows the last input record to be counted/processed/aggregated and
241
+ then the entire aggregate to be dealt with in finalize.
242
+
243
+ Because of this emphasis on building and processing aggregates, the
244
+ `finalize` method is often useful within processors meant to run as
245
+ reducers in a Hadoop environment.
246
+
247
+ Note: `finalize` is not guaranteed to be called in every possible
248
+ environment as it depends on the chosen runner. In a local or Hadoop
249
+ environment, the notion of "last record" makes sense and so the
250
+ corresponding runners will call `finalize`. In an environment like
251
+ Storm, where the concept of last record is not (supposed to be)
252
+ meaningful, the corresponding runner doesn't ever call it.
253
+
254
+ ### Serialization
255
+
256
+ `wu-local` (and many similar tools) deal with inputs and outputs as
257
+ strings.
258
+
259
+ Processors want to process objects as close to their domain as is
260
+ possible. A processor which decorates address book entries with
261
+ Twitter handles doesn't want to think of its inputs as Strings but
262
+ Hashes or, better yet, Persons.
263
+
264
+ Wukong makes it easy to wrap a processor with other processors
265
+ dedicated to handling the common tasks of parsing records into or out
266
+ of formats like JSON and turning them into Ruby model instances.
267
+
268
+ #### De-serializing data formats like JSON or TSV
269
+
270
+ Wukong can parse and emit common data formats like JSON and delimited
271
+ formats like TSV or CSV so that you don't pollute or tie down your own
272
+ processors with protocol logic.
273
+
274
+ Here's an example of a processor that wants to deal with Hashes as
275
+ input.
276
+
277
+ ```ruby
278
+ # in extractor.rb
279
+ Wukong.processor(:extractor) do
280
+ def process hsh
281
+ yield hsh["first_name"]
282
+ end
283
+ end
284
+ ```
285
+
286
+ Given JSON data,
287
+
288
+ ```
289
+ $ cat input.json
290
+ {"first_name": "John", "last_name": "Smith"}
291
+ {"first_name": "Sally", "last_name": "Johnson"}
292
+ ...
293
+ ```
294
+
295
+ you can feed it directly to a processor
296
+
297
+ ```
298
+ $ cat input.json | wu-local --from=json extractor.rb
299
+ John
300
+ Sally
301
+ ...
302
+ ```
303
+
304
+ Other processors really like Arrays:
305
+
306
+ ```ruby
307
+ # in summer.rb
308
+ Wukong.processor(:summer) do
309
+ def process values
310
+ yield values.map(&:to_f).inject(&:+)
311
+ end
312
+ end
313
+ ```
314
+
315
+ so you can feed them TSV data
316
+ ```
317
+ $ cat data.tsv
318
+ 1 2 3
319
+ 4 5 6
320
+ 7 8 9
321
+ ...
322
+ $ cat data.tsv | wu-local --from=tsv summer.rb
323
+ 6
324
+ 15
325
+ 24
326
+ ...
327
+ ```
328
+
329
+ but you can just as easily use the same code with CSV data
330
+
331
+ ```
332
+ $ cat data.csv | wu-local --from=csv summer.rb
333
+ ```
334
+
335
+ or a more general delimited format.
336
+
337
+ ```
338
+ $ cat data.tsv | wu-local --from=delimited --delimiter='--' summer.rb
339
+ ```
340
+
341
+ #### Recordizing data structures into domain models
342
+
343
+ Here's a contact validator that relies on a Person model to decide
344
+ whether a contact entry should be yielded:
345
+
346
+ ```ruby
347
+ # in contact_validator.rb
348
+ require 'person'
349
+
350
+ Wukong.processor(:contact_validator) do
351
+ def process person
352
+ yield person if person.valid?
353
+ end
354
+ end
355
+ ```
356
+
357
+ Relying on the (elsewhere-defined) Person model to define `valid?`
358
+ means the processor can stay skinny and readable. Wukong can, in
359
+ combination with the deserializing features above, turn input text
360
+ into instances of Person:
361
+
362
+ ```
363
+ $ cat input.json | wu-local --consumes=Person --from=json contact_validator.rb
364
+ #<Person:0x000000020e6120>
365
+ #<Person:0x000000020e6120>
366
+ #<Person:0x000000020e6120>
367
+ ```
368
+
369
+ `wu-local` can also serialize records from the `contact_validator`
370
+ processor:
371
+
372
+ ```
373
+ $ cat input.json | wu-local --consumes=Person --from=json contact_validator.rb --to=json
374
+ {"first_name": "John", "last_name": "Smith", "valid": "true"}
375
+ {"first_name": "Sally", "last_name": "Johnson", "valid": "true"}
376
+ ...
377
+ ```
378
+
379
+ Serialization formats work just like deserialization formats, with
380
+ JSON as well as delimited formats available.
381
+
382
+ Parsing records into model instances and serializing them out again
383
+ puts constraints on the model class providing these instances. Here's
384
+ what the `Person` class needs to look like:
385
+
386
+
387
+ ```ruby
388
+ # in person.rb
389
+ class Person
390
+
391
+ # Create a new Person from the given attributes. Supports usage of
392
+ # the `--consumes` flag on the command-line
393
+ #
394
+ # @param [Hash] attrs
395
+ # @return [Person]
396
+ def self.receive attrs
397
+ new(attrs)
398
+ end
399
+
400
+ # Turn this Person into a basic data structure. Supports the usage
401
+ # of the `--to` flag on the command-line.
402
+ #
403
+ # @return [Hash]
404
+ def to_wire
405
+ to_hash
406
+ end
407
+ end
408
+ ```
409
+
410
+ To support the `--consumes=Person` syntax, the `receive` class method
411
+ must take a Hash produced from the operation of the `--from` argument
412
+ and return a `Person` instance.
413
+
414
+ To support the `--to=json` syntax, the `Person` class must implement
415
+ the `to_wire` instance method.
416
+
417
+ ### Logging and Notifications
418
+
419
+ Wukong comes with a logger that all processors have access to via
420
+ their `log` attribute. This logger has the following priorities:
421
+
422
+ * debug (can be set as a log level)
423
+ * info (can be set as a log level)
424
+ * warn (can be set as a log level)
425
+ * error
426
+ * fatal
427
+
428
+ and here's a processor which uses them all
429
+
430
+ ```ruby
431
+ # in logs.rb
432
+ Wukong.processor(:logs) do
433
+ def process line
434
+ log.debug line
435
+ log.info line
436
+ log.warn line
437
+ log.error line
438
+ log.fatal line
439
+ end
440
+ end
441
+ ```
442
+
443
+ The default log level is DEBUG.
444
+
445
+ ```
446
+ $ echo something | wu-local logs.rb
447
+ DEBUG 2013-01-11 23:40:56 [Logs ] -- something
448
+ INFO 2013-01-11 23:40:56 [Logs ] -- something
449
+ WARN 2013-01-11 23:40:56 [Logs ] -- something
450
+ ERROR 2013-01-11 23:40:56 [Logs ] -- something
451
+ FATAL 2013-01-11 23:40:56 [Logs ] -- something
452
+ ```
453
+
454
+ though you can set it to something else globally
455
+
456
+ ```
457
+ $ echo something | wu-local logs.rb --log.level=warn
458
+ WARN 2013-01-11 23:40:56 [Logs ] -- something
459
+ ERROR 2013-01-11 23:40:56 [Logs ] -- something
460
+ FATAL 2013-01-11 23:40:56 [Logs ] -- something
461
+ ```
462
+
463
+ or on a per-class basis.
464
+
465
+ ### Creating Documentation
466
+
467
+ `wu-local` includes a help message:
468
+
469
+ ```
470
+ $ wu-local --help
471
+ usage: wu-local [ --param=val | --param | -p val | -p ] PROCESSOR|FLOW
472
+
473
+ wu-local is a tool for running Wukong processors and flows locally on
474
+ the command-line. Use wu-local by passing it a processor and feeding
475
+ ...
476
+
477
+
478
+ Params:
479
+ -r, --run=String Name of the processor or dataflow to use. Defaults to basename of the given path.
480
+ ```
481
+
482
+ You can generate custom help messages for your own processors. Here's
483
+ the percentile processor from before but made more usable with good
484
+ documentation:
485
+
486
+ ```ruby
487
+ # in percentile.rb
488
+ Wukong.processor(:percentile) do
489
+
490
+ description <<-EOF.gsub(/^ {2}/,'')
491
+ This processor calculates percentiles from input scores based on a
492
+ given mean score and a given standard deviation for the scores.
493
+
494
+ The mean and standard deviation are given at run time and processed
495
+ scores will be compared against the given mean and standard
496
+ deviation.
497
+
498
+ The input is expected to consist of float values, one per line.
499
+
500
+ Example:
501
+
502
+ $ cat input.dat
503
+ 88
504
+ 89
505
+ 77
506
+ ...
507
+
508
+ $ cat input.dat | wu-local percentile.rb --mean=85 --std_dev=7
509
+ 88.0 66.58824291023753
510
+ 89.0 71.61454169013237
511
+ 77.0 12.654895447355777
512
+ EOF
513
+
514
+ SQRT_1_HALF = Math.sqrt(0.5)
515
+
516
+ field :mean, Float, :default => 0.0, :doc => "The mean of the assumed distribution"
517
+ field :std_dev, Float, :default => 1.0, :doc => "The standard deviation of the assumed distribution"
518
+
519
+ def process value
520
+ observation = value.to_f
521
+ z_score = (mean - observation) / std_dev
522
+ percentile = 50 * Math.erfc(z_score * SQRT_1_HALF)
523
+ yield [observation, percentile].join("\t")
524
+ end
525
+ end
526
+ ```
527
+
528
+ If you call `wu-local` with the file to this processor as an argument
529
+ in addition to the original `--help` argument, you'll get custom
530
+ documentation.
531
+
532
+ ```
533
+ $ wu-local percentile.rb --help
534
+ usage: wu-local [ --param=val | --param | -p val | -p ] PROCESSOR|FLOW
535
+
536
+ This processor calculates percentiles from input scores based on a
537
+ given mean score and a given standard deviation for the scores.
538
+ ...
539
+
540
+
541
+ Params:
542
+ --mean=Float The mean of the assumed distribution [Default: 0.0]
543
+ -r, --run=String Name of the processor or dataflow to use. Defaults to basename of the given path.
544
+ --std_dev=Float The standard deviation of the assumed distribution [Default: 1.0]
545
+
546
+ ```
547
+
548
+ <a name="flows"></a>
549
+ ## Combining Processors into Dataflows
550
+
551
+ Wukong provides a DSL for combining processors together into
552
+ dataflows. This DSL is designed to make it easy to replicate the
553
+ tried and true UNIX philosophy of building simple tools which do one
554
+ thing well and then combining them together to create more complicated
555
+ flows.
556
+
557
+ For example, having written the `tokenizer` processor, we can use it
558
+ in a dataflow along with the built-in `regexp` processor to replicate
559
+ what we did in the last example:
560
+
561
+ ```ruby
562
+ # in find_t_words.rb
563
+ require_relative('processors')
564
+ Wukong.dataflow(:find_t_words) do
565
+ tokenizer | regexp(match: /^t/)
566
+ end
567
+ ```
568
+
569
+ The `|` operator connects the output of one processor (what it
570
+ `yield`s) with the input of another (its `process` method). In this
571
+ example, every record emitted by `tokenizer` will be subsequently
572
+ processed by `regexp`.
573
+
574
+ You can run this dataflow directly (mimicking what we did above with
575
+ single processors chained together on the command-line):
576
+
577
+ ```
578
+ $ cat novel.txt | wu-local find_t_words.rb
579
+ the
580
+ times
581
+ the
582
+ times
583
+ ...
584
+ ```
585
+
586
+ ### More complicated dataflow topologies
587
+
588
+ The Wukong dataflow DSL allows for more complicated topologies than
589
+ just chaining processors together in a linear pipeline.
590
+
591
+ The `|` operator, used in the above examples to connect two processors
592
+ together into a chain, can also be used to connect a single processor
593
+ to *multiple* processors, creating a branch-point in the dataflow.
594
+ Each branch of the flow will receive the same records.
595
+
596
+ This can be used to perform multiple actions with the same record, as
597
+ in the following example:
598
+
599
+ ```ruby
600
+ # in book_reviews.rb
601
+ Wukong.dataflow(:complicated) do
602
+ from_json | recordize(model: BookReview) |
603
+ [
604
+ map(&:author) | do_author_stuff | ... | to_json,
605
+ map(&:book) | do_book_stuff | ... | to_json,
606
+ ]
607
+ end
608
+ ```
609
+
610
+ Each `BookReview` record yielded by the `recordize` processor will be
611
+ passed to both subsequent branches of the flow, with each branch doing
612
+ a different kind of processing. Output records from both branches
613
+ (which are here turned `to_json` first) will be interspersed in the
614
+ final output when run.
615
+
616
+ A processor like `select`, which filters its inputs, can be used to
617
+ split a flow into records of two types:
618
+
619
+ ```ruby
620
+ # in complicated.rb
621
+ Wukong.dataflow(:complicated) do
622
+ from_json | parser |
623
+ [
624
+ select(&:valid?) | further_processing | ... | to_json,
625
+ select(&:invalid?) | track_errors | null
626
+ ]
627
+ end
628
+ ```
629
+
630
+ Here, only records which respond true to the method `valid?` will pass
631
+ through the first flow (applying `further_processing` and so on) while
632
+ only records which respond true to `invalid?` will pass through the
633
+ second flow (with `track_errors`). The `null` processor at the end of
634
+ this second branch ensures that only records from the first branch
635
+ will be emitted in the final output.
636
+
637
+ Flows can be split over and over again, allowing for rich semantics
638
+ when processing an input source:
639
+
640
+ ```ruby
641
+ # in many_splits.rb
642
+ Wukong.dataflow(:many_splits) do
643
+ from_json | parser | recordize(model: BookReview) |
644
+ [
645
+ map(&:author) | ... | to_json,
646
+ map(&:publisher) |
647
+ [
648
+ select(&:domestic?) | ... | to_json,
649
+ select(&:international?) |
650
+ [
651
+ select(&:north_american?) | ... |
652
+ [
653
+ select(&:american?) | ... | to_json,
654
+ select(&:canadian?) | ... | to_json,
655
+ select(&:mexican?) | ... | to_json,
656
+ ],
657
+ select(&:asian?) | ... | to_json,
658
+ select(&:european?) | ... | to_json,
659
+ ],
660
+ ],
661
+ map(&:title) | ... | to_json
662
+ ]
663
+ end
664
+ ```
665
+
666
+ <a name="serialization"></a>
667
+ ## Serialization
668
+
669
+ The process method for a Processor must accept a String argument and
670
+ yield a String argument (or something that will `to_s` appropriately).
671
+
672
+ **Coming Soon:** The ability to define `consumes` and `emits` to
673
+ automatically handle serialization and deserialization.
674
+
675
+ <a name="widgets"></a>
676
+ ## Widgets
677
+
678
+ Wukong has a number of built-in widgets that are useful for
679
+ scaffolding your dataflows or using as starting off points for your
680
+ own processors.
681
+
682
+ For any of these widgets you can get customized help, say
683
+
684
+ ```
685
+ $ wu-local group --help
686
+ ```
687
+
688
+ ### Serializers
689
+
690
+ Serializers are widgets which don't change the semantic meaning of a
691
+ record, merely its representation. Here's a list:
692
+
693
+ * `to_json`, `from_json` for turning records into JSON or parsing JSON into records
694
+ * `to_tsv`, `from_tsv` for turning Array records into TSV or parsing TSV into Array records
695
+ * `pretty` for pretty printing JSON inputs
696
+
697
+ When you're writing processors that are capable of running in
698
+ isolation you'll want to ensure that you deserialize and serialize
699
+ records on the way in and out, using the serialization/deserialization
700
+ options `--to` and `--from` on the command-line, as <a
701
+ href="#serialization">defined above</a>.
702
+
703
+ For processors which will only run inside a data flow, you can
704
+ optimize by not doing any (de)serialization except at the very
705
+ beginning and at the end
706
+
707
+ ```ruby
708
+ Wukong.dataflow(:complicated) do
709
+ from_json | proc_1 | proc_2 | proc_3 ... proc_n | to_json
710
+ end
711
+ ```
712
+
713
+ in this approach, no serialization will be done between processors,
714
+ only at the beginning and end.
715
+
716
+ (This is actually the implementation behind the serialization options
717
+ themselves -- they dynamically prepend/append the appropriate
718
+ deserializers/serializers.)
719
+
720
+ ### General Purpose
721
+
722
+ There are several general purpose processors which implement common
723
+ patterns on input and output data. These are most useful within the
724
+ context of a dataflow definition.
725
+
726
+ * `null` does what you think it doesn't
727
+ * `map` performs some block on each input record
728
+ * `flatten` flatten the input array
729
+ * `filter`, `select`, `reject` only let certain records through based on a block
730
+ * `regexp`, `not_regexp` only pass records matching (or not matching) a regular expression
731
+ * `limit` only let some number of records pass
732
+ * `logger` send events to the local log stream
733
+ * `extract` extract some part of each input event
734
+
735
+ Some of these widgets can be used directly, perhaps with some
736
+ arguments
737
+
738
+ ```ruby
739
+ Wukong.processor(:log_everything) do
740
+ proc_1 | proc_2 | ... | logger
741
+ end
742
+
743
+ Wukong.processor(:log_everything_important) do
744
+ proc_1 | proc_2 | ... | regexp(match: /important/i) | logger
745
+ end
746
+ ```
747
+
748
+ Other widgets require a block to define their action:
749
+
750
+ ```ruby
751
+ Wukong.processor(:log_everything_important) do
752
+ parser | select { |record| record.priority =~ /important/i } | logger
753
+ end
754
+ ```
755
+
756
+ ### Reducers
757
+
758
+ There is a selection of widgets that do aggregative operations like
759
+ counting, sorting, and summing.
760
+
761
+ * `count` emits a final count of all input records
762
+ * `sort` can sort input streams
763
+ * `group` will group records by some extracted part and give a count of each group's size
764
+ * `moments` will emit more complicated statistics (mean, std. dev.) on the group given some other value to measure
765
+
766
+ Here's an example of sorting data right on the command line
767
+
768
+ ```
769
+ $ head tokens.txt | wu-local sort
770
+ abhor
771
+ abide
772
+ abide
773
+ able
774
+ able
775
+ able
776
+ about
777
+ ...
778
+ ```
779
+
780
+ Try adding group:
781
+
782
+ ```
783
+ $ head tokens.txt | wu-local sort | wu-local group
784
+ {:group=>"abhor", :count=>1}
785
+ {:group=>"abide", :count=>2}
786
+ {:group=>"able", :count=>3}
787
+ {:group=>"about", :count=>3}
788
+ {:group=>"above", :count=>1}
789
+ ...
790
+ ```
791
+
792
+ You can also use these within a more complicated dataflow:
793
+
794
+ ```ruby
795
+ Wukong.dataflow(:word_count) do
796
+ tokenize | remove_stopwords | sort | group
797
+ end
798
+ ```
799
+
800
+ ## Commands
801
+
802
+ Wukong comes with a few commands built-in.
803
+
804
+ ### wu-local
805
+
806
+ You've seen one already, `wu-local`, in many of the examples above.
807
+ `wu-local` is used to model dataflows locally, using `STDIN` and
808
+ `STDOUT` for input and output.
809
+
810
+ `wu-local` is a "core" Wukong command in the sense that more
811
+ complicated commands like `wu-hadoop` and `wu-storm`, implemented by
812
+ Wukong plugins, ultimately invoke some `wu-local` process.
813
+
814
+ ### wu-source
815
+
816
+ Wukong also comes with another basic command `wu-source`. This
817
+ command works very similarly to `wu-local` except that it doesn't read
818
+ any input from `STDIN`. Instead it generates its *own* input records
819
+ in an easy to configure, periodic way. It thus acts as a *source* of
820
+ data for other processes in a UNIX pipeline.
821
+
822
+ Here's an example using the `identity` processor which will have the
823
+ effect of printing to `STDOUT` the exact input received:
824
+
825
+ ```
826
+ $ wu-source identity
827
+ 1
828
+ 2
829
+ 3
830
+ ...
831
+ ```
832
+
833
+ From this example it's clear that the records produced by `wu-source`
834
+ are consecutive integers starting at 1 and that they are produced at a
835
+ rate of one record per second.
836
+
837
+ `wu-source` can thus be used to turn any processor (or dataflow) into
838
+ a source of data:
839
+
840
+ ```ruby
841
+ # in random_numbers.rb
842
+ Wukong.processor(:random_numbers) do
843
+ def process index
844
+ yield rand() * index.to_i
845
+ end
846
+ end
847
+ ```
848
+
849
+ Run `random_numbers` like this:
850
+
851
+ ```
852
+ $ wu-source random_numbers.rb
853
+ 0.7671364694830113
854
+ 0.5958089791553307
855
+ 1.8284806932633886
856
+ 3.707189931235327
857
+ 4.106618048255548
858
+ ...
859
+ ```
860
+
861
+ Which produces random numbers with an ever greater ceiling.
862
+
863
+ You can also completely ignore the input record from `wu-source` in
864
+ your processor:
865
+
866
+ ```ruby
867
+ # in generator.rb
868
+ Wukong.processor(:generator) do
869
+ def process _
870
+ yield new_record
871
+ end
872
+ def new_record
873
+ MyRecord.new(...)
874
+ end
875
+ end
876
+ ```
877
+
878
+ which can produce `MyRecord` instances as it's driven by `wu-source`.
879
+
880
+ It's easy to generate several thousand events per second using
881
+ `wu-source` this way:
882
+
883
+ ```
884
+ $ wu-source generator.rb --per_sec=2000
885
+ ```
886
+
887
+ or use the `--period` option (which is the inverse of `--per_sec`) to spit
888
+ out records at a regular interval (every 5 minutes in this example):
889
+
890
+ ```
891
+ $ wu-source generator.rb --period=300
892
+ ```
893
+
894
+ `wu-source` can naturally combine with other dataflows or programs you
895
+ might write:
896
+
897
+ ```
898
+ $ wu-source generator.rb --per_sec=200 | wu-local my_flow
899
+ ```
900
+ ### wu
901
+
902
+ The `wu` command is a convenience command useful when using any of the
903
+ other `wu-` commands in the context of a Ruby project with a
904
+ [`Gemfile`](http://bundler.io/v1.3/gemfile.html).
905
+
906
+ Instead of typing
907
+
908
+ ```
909
+ $ bundle exec wu-local my_flow --option=value ...
910
+ ```
911
+
912
+ which would run `wu-local` using the exact version of `wukong` (and
913
+ any other dependencies) as declared in your project's `Gemfile` and
914
+ `Gemfile.lock`, the `wu` command lets you type
915
+
916
+ ```
917
+ $ wu local my_flow --option=value ...
918
+ ```
919
+
920
+ essentially adding the `bundle exec` prefix and munging `wu local` to
921
+ `wu-local` for you. This can be very helpful when doing lots of work
922
+ with Wukong.
923
+
924
+ **Note:** If `bundle exec wu-whatever` works in your project but `wu
925
+ whatever` fails it is probably because Bundler is resolving `wu-`
926
+ commands to some installation that is not on your `$PATH` (often the
927
+ case if you ran `bundle install --standalone`). Ensure that the
928
+ `wukong` gem is installed on your system and that its binaries are on
929
+ your `$PATH` to use the `wu` command.
930
+
931
+ ## Testing
932
+
933
+ Wukong comes with several helpers to make writing specs using
934
+ [RSpec](http://rspec.info/) easier.
935
+
936
+ The only method that you need to test in a Processor is the `process`
937
+ method. The rest of the processor's methods and functionality are
938
+ provided by Wukong and are already tested.
939
+
940
+ You may want to test this process method in two ways:
941
+
942
+ * unit tests of the class itself in various contexts
943
+ * integration tests of running the class with the `wu-local` (or other) command-line runner
944
+
945
+ ### Unit Tests
946
+
947
+ Let's start with a simple processor
948
+
949
+ ```ruby
950
+ # in tokenizer.rb
951
+ Wukong.processor(:tokenizer) do
952
+ def process text
953
+ text.downcase.gsub(/[^\s\w]/,'').split.each do |token|
954
+ yield token
955
+ end
956
+ end
957
+ end
958
+ ```
959
+
960
+ You could test this processor directly:
961
+
962
+ ```ruby
963
+ # in spec/tokenizer_spec.rb
964
+ require 'spec_helper'
965
+ describe :tokenizer do
966
+ subject { Wukong::Processor::Tokenizer.new }
967
+ before { subject.setup }
968
+ after { subject.finalize ; subject.stop }
969
+ it "correctly counts tokens" do
970
+ expect { |b| subject.process("Hi there, Wukong!", &b) }.to yield_successive_args('hi', 'there', 'wukong')
971
+ end
972
+ end
973
+ ```
974
+
975
+ but having to handle the yield from the block yourself can lead to
976
+ verbose and unreadable tests. Wukong defines some helpers for this
977
+ case. Require and include them first in your `spec_helper.rb`:
978
+
979
+ ```ruby
980
+ # spec/spec_helper.rb
981
+ require 'wukong'
982
+ require 'wukong/spec_helpers'
983
+ RSpec.configure do |config|
984
+ config.include(Wukong::SpecHelpers)
985
+ end
986
+ ```
987
+
988
+ and then use them in your test
989
+
990
+ ```ruby
991
+ # in spec/tokenizer_spec.rb
992
+ require 'spec_helper'
993
+ describe :tokenizer do
994
+ it_behaves_like 'a processor', :named => :tokenizer
995
+ it "emits the correct number of tokens" do
996
+ processor.given("Hi there.\nMy name is Wukong!").should emit(6).records
997
+ end
998
+ it "eliminates all punctuation" do
999
+ processor(:tokenizer).given("Never!").should emit('Never')
1000
+ end
1001
+ it "will not emit tokens in a stop list" do
1002
+ processor(:tokenizer, :stop_list => ['apples', 'bananas']).given("I like apples and bananas").should emit('I', 'like', 'and')
1003
+ end
1004
+ end
1005
+ ```
1006
+
1007
+ Let's look at each kind of helper:
1008
+
1009
+ * The `a processor` shared example (invoked with RSpec's
1010
+ `it_behaves_like` helper) adds some tests that ensure that the
1011
+ processor conforms to the API of a Wukong::Processor.
1012
+
1013
+ * The `processor` method is actually an alias for the more aptly named
1014
+ (but less convenient) `unit_test_runner`. This method accepts a
1015
+ processor name and options (just like `wu-local` and other
1016
+ command-line tools) and returns a Wukong::UnitTestRunner instance.
1017
+ This runner handles the
1018
+
1019
+
1020
+ a (registered) processor name and options and creates a new
1021
+ processor. If no name is given, the argument of the enclosing
1022
+ `describe` or `context` block is used. The object returned by
1023
+ `processor` is the Wukong::Processor you're testing so you can
1024
+ directly introspect on it or declare expectations about its
1025
+ behavior.
1026
+
1027
+ * The `given` method (and other helpers like `given_json`,
1028
+ `given_tsv`, &c.) is a method on the runner. It's a way of lazily
1029
+ feeding records to a processor, without having to go through the
1030
+ `process` method directly and having to handle the block or the
1031
+ processor's lifecycle as in the prior example.
1032
+
1033
+ * The `output` and `emit` matchers will `process` all previously
1034
+ `given` records when they are called. This lets you separate
1035
+ instantiation, input, expectations, and output. Here's a more
1036
+ complicated example.
1037
+
1038
+ The same helpers can be used to test dataflows as well as
1039
+ processors.
1040
+
1041
+ ####
1042
+
1043
+ #### Functions vs. Objects
1044
+
1045
+ The above test helpers are designed to aid in testing processors
1046
+ functionally because:
1047
+
1048
+ * they accept the
1049
+
1050
+ ### Integration Tests
1051
+
1052
+ If you are implementing a new Wukong command (akin to `wu-local`) then
1053
+ you may also want to run integration tests. Wukong comes with helpers
1054
+ for these, too.
1055
+
1056
+ You should almost always be able to test your processors without
1057
+ integration tests. Your unit tests and the Wukong framework itself
1058
+ should ensure that your processors work correctly no matter what
1059
+ environment they are deployed in.
1060
+
1061
+ ```ruby
1062
+ # spec/integration/tokenizer_spec.rb
1063
+ context "running the tokenizer with wu-local" do
1064
+ subject { command("wu-local tokenizer") < "hi there" }
1065
+ it { should exit_with(0) }
1066
+ it { should have_stdout("hi", "there") }
1067
+ end
1068
+
1069
+ context "interpreting its arguments" do
1070
+ context "with a valid --match argument" do
1071
+ subject { command("wu-local tokenizer --match='^hi'") < "hi there" }
1072
+ it { should exit_with(0) }
1073
+ it { should have_stdout("hi") }
1074
+ it { should_not have_stdout("there") }
1075
+ end
1076
+ context "with a malformed --match argument" do
1077
+ # invalid b/c the regexp is broken...
1078
+ subject { command("wu-local tokenizer --match='^(h'") < "hi there" }
1079
+ it { should exit_with(:non_zero) }
1080
+ it { should have_stderr(/invalid/) }
1081
+ end
1082
+ end
1083
+ ```
1084
+
1085
+ Let's go through the helpers:
1086
+
1087
+ * The `command` helper creates a wrapper around a command-line that will be launched. The command's environment and working directory will be taken from the current values of `ENV` and `Dir.pwd`, unless one of the following applies:
1088
+
1089
+ * The `in` or `using` arguments are chained with `command` to specify the working directory and environment:
1090
+
1091
+ ```ruby
1092
+ command("some-command with --args").in("/my/working/directory").using("THIS" => "ENV_HASH", "WILL_BE" => "MERGED_OVER_EXISTING_ENV")
1093
+ ```
1094
+
1095
+ * The scope in which the `command` helper is called defines methods `integration_cwd` and `integration_env`. This can be done through including a module in your `spec_helper.rb`:
1096
+
1097
+ ```ruby
1098
+ # in spec/support/integration_helper.rb
1099
+ module IntegrationHelper
1100
+ def integration_cwd
1101
+ "/my/working/directory"
1102
+ end
1103
+ def integration_env
1104
+ { "THIS" => "ENV_HASH", "WILL_BE" => "MERGED_OVER_EXISTING_ENV" }
1105
+ end
1106
+ end
1107
+
1108
+ # in spec/spec_helper.rb
1109
+ require_relative("support/integration_helper")
1110
+ RSpec.configure do |config|
1111
+ config.include(IntegrationHelper)
1112
+ end
1113
+ ```
1114
+
1115
+ * The `command` helper can accept input with the `<` method. Input can be either a String or an Array of strings. It will be passed to the command over STDIN.
1116
+
1117
+ * The `have_stdout` and `have_stderr` matchers let you test the STDOUT or STDERR of the command for particular strings or regular expressions.
1118
+
1119
+ * The `exit_with` matcher lets you test the exit code of the command. You can pass the symbol `:non_zero` to set the expectation of _any_ non-zero exit code.
1120
+
1121
+ ## Plugins
1122
+
1123
+ Wukong has a built-in plugin framework to make it easy to adapt Wukong
1124
+ processors to new backends or add other functionality. The
1125
+ `Wukong::Local` module and the `wu-local` program it supports is
1126
+ itself a Wukong plugin.
1127
+
1128
+ The following shows how you might build a simplified version of
1129
+ `Wukong::Local` as a new plugin. We'll call this plugin `Cat` as it
1130
+ will implement a program `wu-cat` that is similar in function to
1131
+ `wu-local` (just simplified).
1132
+
1133
+ The first thing to do is include the `Wukong::Plugin` module in your
1134
+ code:
1135
+
1136
+
1137
+ ```Ruby
1138
+ # in lib/cat.rb
1139
+ #
1140
+ # This Wukong plugin works like wu-local but replicates some silly
1141
+ # features of cat like numbered lines.
1142
+ module Cat
1143
+
1144
+ # This registers Cat as a Wukong plugin.
1145
+ include Wukong::Plugin
1146
+
1147
+ # Defines any settings specific to Cat. Cat doesn't need to, but
1148
+ # you can define global settings here if you want. You can also
1149
+ # check the `program` name to decide whether to apply your settings.
1150
+ # This helps you not pollute other commands with your stuff.
1151
+ def self.configure settings, program
1152
+ case program
1153
+ when 'wu-cat'
1154
+ settings.define(:input, :description => "The input file to use")
1155
+ settings.define(:number, :description => "Prepend each input record with a consecutive number", :type => :boolean)
1156
+ else
1157
+ # configure other programs if you need to
1158
+ end
1159
+ end
1160
+
1161
+ # Lets Cat boot up with settings that have already been resolved
1162
+ # from the command-line or other sources like config files or remote
1163
+ # servers added by other plugins.
1164
+ #
1165
+ # The `root` directory in which the program is executing is also
1166
+ # provided.
1167
+ def self.boot settings, root
1168
+ puts "Cat booting up using resolved settings within directory #{root}"
1169
+ end
1170
+ end
1171
+ ```
1172
+
1173
+ If your plugin doesn't interact directly with the command-line
1174
+ (through a wu-tool like `wu-local` or `wu-hadoop`) and doesn't
1175
+ directly interface with passing records to processors then you can
1176
+ just require the rest of your plugin's code at this point and be done.
1177
+
1178
+ ### Write a Runner to interact with the command-line
1179
+
1180
+ If you need to implement a new command line tool then you should write
1181
+ a Runner. A Runner is used to implement Wukong programs like
1182
+ `wu-local` or `wu-hadoop`. Here's what the actual program file would
1183
+ look like for our example plugin's `wu-cat` program.
1184
+
1185
+ ```ruby
1186
+ #!/usr/bin/env ruby
1187
+ # in bin/wu-cat
1188
+ require 'cat'
1189
+ Cat::Runner.run
1190
+ ```
1191
+
1192
+ The Cat::Runner class is implemented separately.
1193
+
1194
+ ```ruby
1195
+ # in lib/cat/runner.rb
1196
+ require_relative('driver')
1197
+ module Cat
1198
+
1199
+ # Implements the `wu-cat` command.
1200
+ class Runner < Wukong::Runner
1201
+
1202
+ usage "PROCESSOR|FLOW"
1203
+
1204
+ description <<-EOF
1205
+
1206
+ wu-cat lets you run a Wukong processor or dataflow on the
1207
+ command-line. Try it like this.
1208
+
1209
+ $ wu-cat --input=data.txt
1210
+ hello
1211
+ my
1212
+ friend
1213
+
1214
+ Connect the output to a processor in upcaser.rb
1215
+
1216
+ $ wu-cat --input=data.txt upcaser.rb
1217
+ HELLO
1218
+ MY
1219
+ FRIEND
1220
+
1221
+ You can also add line numbers to the output.
1222
+
1223
+ $ wu-cat --number --input=data.txt upcaser.rb
1224
+ 1 HELLO
1225
+ 2 MY
1226
+ 3 FRIEND
1227
+ EOF
1228
+
1229
+ # The name of the processor we're going to run. The #args method
1230
+ # is provided by the Runner class.
1231
+ def processor_name
1232
+ args.first
1233
+ end
1234
+
1235
+ # Validate that we were given the name of a registered processor
1236
+ # to run. Be careful to return true here or validation will fail.
1237
+ def validate
1238
+ raise Wukong::Error.new("Must provide a processor as the first argument") unless processor_name
1239
+ true
1240
+ end
1241
+
1242
+ # Delegates to a driver class to run the processor.
1243
+ def run
1244
+ Driver.new(processor_name, settings).start
1245
+ end
1246
+
1247
+ end
1248
+ end
1249
+ ```
1250
+
1251
+ ### Write a Driver to interact with processors
1252
+
1253
+ The `Cat::Runner#run` method delegates to the `Cat::Driver` class to
1254
+ handle instantiating and interacting with processors.
1255
+
1256
+ ```ruby
1257
+ # in lib/cat/driver.rb
1258
+ module Cat
1259
+
1260
+ # A class for driving a processor from `wu-cat`.
1261
+ class Driver
1262
+
1263
+ # Lets us count the records.
1264
+ attr_accessor :number
1265
+
1266
+ # Gives methods to construct and interact with dataflows.
1267
+ include Wukong::DriverMethods
1268
+
1269
+ # Create a new Driver for a dataflow with the given `label` using
1270
+ # the given `settings`.
1271
+ #
1272
+ # @param [String] label the name of the dataflow
1273
+ # @param [Configliere::Param] settings the settings to use when creating the dataflow
1274
+ def initialize label, settings
1275
+ self.settings = settings
1276
+ self.dataflow = construct_dataflow(label, settings)
1277
+ self.number = 1
1278
+ end
1279
+
1280
+ # The file handle of the input file.
1281
+ #
1282
+ # @return [File]
1283
+ def input_file
1284
+ @input_file ||= File.new(settings[:input])
1285
+ end
1286
+
1287
+ # Starts feeding records to the processor
1288
+ def start
1289
+ while line = input_file.readline rescue nil
1290
+ driver.send_through_dataflow(line)
1291
+ end
1292
+ end
1293
+
1294
+ # Process each record that comes back from the dataflow.
1295
+ #
1296
+ # @param [Object] record the yielded record
1297
+ def process record
1298
+ if settings[:number]
1299
+ puts [number, record].map(&:to_s).join("\t")
1300
+ else
1301
+ puts record.to_s
1302
+ end
1303
+ self.number += 1
1304
+ end
1305
+
1306
+ end
1307
+ end
1308
+ ```