ul-wukong 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +60 -0
- data/.gitmodules +6 -0
- data/.rspec +2 -0
- data/.travis.yml +19 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +17 -0
- data/Guardfile +12 -0
- data/LICENSE.md +95 -0
- data/NOTES-travis.md +31 -0
- data/README-old.md +422 -0
- data/README.md +1308 -0
- data/Rakefile +28 -0
- data/TODO.md +99 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +6 -0
- data/bin/md5sort +20 -0
- data/bin/setcat +11 -0
- data/bin/tabchar +5 -0
- data/bin/uniq-ord +59 -0
- data/bin/uniqc +3 -0
- data/bin/wu +34 -0
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-date +13 -0
- data/bin/wu-datetime +13 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +186 -0
- data/bin/wu-local +4 -0
- data/bin/wu-plus +9 -0
- data/bin/wu-source +5 -0
- data/bin/wu-sum +31 -0
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/basic/tiny_count.rb +8 -0
- data/examples/basic/word_count/accumulator.rb +26 -0
- data/examples/basic/word_count/tokenizer.rb +13 -0
- data/examples/basic/word_count/word_count.rb +6 -0
- data/examples/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/deploy_pack/Gemfile +6 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/dsl/dataflow/simple.rb +12 -0
- data/examples/dsl/dataflow/telegram.rb +45 -0
- data/examples/dsl/workflow/cherry_pie.dot +97 -0
- data/examples/dsl/workflow/cherry_pie.md +104 -0
- data/examples/dsl/workflow/cherry_pie.png +0 -0
- data/examples/dsl/workflow/cherry_pie.rb +101 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/examples_helper.rb +9 -0
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/loadable.rb +2 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/examples/munging/airline_flights/airplane.rb +0 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +107 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/rake_helper.rb +97 -0
- data/examples/ruby_project/Gemfile +6 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/splitter.rb +94 -0
- data/examples/string_reverser.rb +7 -0
- data/examples/text/pig_latin/pig_latinizer.rb +35 -0
- data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +36 -0
- data/lib/hanuman/graph.rb +97 -0
- data/lib/hanuman/graphvizzer.rb +206 -0
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +128 -0
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/lib/wu/graph/union_find.rb +62 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/lib/wu/munging.rb +71 -0
- data/lib/wu/social/models/twitter.rb +31 -0
- data/lib/wu/wikipedia/models.rb +20 -0
- data/lib/wukong.rb +54 -0
- data/lib/wukong/dataflow.rb +43 -0
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +214 -0
- data/lib/wukong/driver/event_machine_driver.rb +15 -0
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +42 -0
- data/lib/wukong/local/runner.rb +96 -0
- data/lib/wukong/local/stdio_driver.rb +104 -0
- data/lib/wukong/logger.rb +102 -0
- data/lib/wukong/model/faker.rb +136 -0
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +110 -0
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +169 -0
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers.rb +74 -0
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
- data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
- data/lib/wukong/version.rb +3 -0
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/extract.rb +122 -0
- data/lib/wukong/widget/filters.rb +452 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +10 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +368 -0
- data/lib/wukong/widget/reducers/count.rb +73 -0
- data/lib/wukong/widget/reducers/group.rb +128 -0
- data/lib/wukong/widget/reducers/group_concat.rb +98 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +180 -0
- data/lib/wukong/widget/reducers/uniq.rb +91 -0
- data/lib/wukong/widget/serializers.rb +317 -0
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +7 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
- data/spec/examples/dataflow/parsing_spec.rb +14 -0
- data/spec/examples/dataflow/simple_spec.rb +34 -0
- data/spec/examples/dataflow/telegram_spec.rb +43 -0
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +18 -0
- data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
- data/spec/hanuman/graph_spec.rb +119 -0
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +81 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +43 -0
- data/spec/support/example_test_helpers.rb +95 -0
- data/spec/support/hanuman_test_helpers.rb +92 -0
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +94 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/dataflow_spec.rb +87 -0
- data/spec/wukong/driver_spec.rb +154 -0
- data/spec/wukong/local/runner_spec.rb +29 -0
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/model/faker_spec.rb +132 -0
- data/spec/wukong/processor_spec.rb +21 -0
- data/spec/wukong/runner_spec.rb +132 -0
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/filters_spec.rb +79 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +21 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
- data/spec/wukong/widget/serializers_spec.rb +114 -0
- data/spec/wukong/widget/sink_spec.rb +19 -0
- data/spec/wukong/widget/source_spec.rb +65 -0
- data/spec/wukong/wu-local_spec.rb +109 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +35 -0
- metadata +465 -0
data/README.md
ADDED
@@ -0,0 +1,1308 @@
|
|
1
|
+
# Wukong
|
2
|
+
|
3
|
+
**NOTE: This is a divergent fork of https://github.com/infochimps-labs/wukong. The gem has been renamed to ul-wukong.**
|
4
|
+
|
5
|
+
Wukong is a toolkit for rapid, agile development of data applications
|
6
|
+
at any scale.
|
7
|
+
|
8
|
+
The core concept in Wukong is a **Processor**. Wukong processors are
|
9
|
+
simple Ruby classes that do one thing and do it well. This codebase
|
10
|
+
implements processors and other core Wukong classes and provides a way
|
11
|
+
to run and combine processors on the command-line.
|
12
|
+
|
13
|
+
Wukong's larger theme is *powerful black boxes, beautiful glue*. The
|
14
|
+
Wukong ecosystem consists of other tools which run Wukong processors
|
15
|
+
in various topologies across a variety of different backends. Code
|
16
|
+
written in Wukong can be easily ported between environments and
|
17
|
+
frameworks: local command-line scripts on your laptop instantly turn
|
18
|
+
into powerful jobs running in Hadoop.
|
19
|
+
|
20
|
+
Here is a list of various other projects which you may also want to
|
21
|
+
peruse when trying to understand the full Wukong experience:
|
22
|
+
|
23
|
+
* <a href="http://github.com/infochimps-labs/wukong-hadoop">wukong-hadoop</a>: Run Wukong processors as mappers and reducers within the Hadoop framework. Model Hadoop jobs locally before you run them.
|
24
|
+
* <a href="http://github.com/infochimps-labs/wukong-storm">wukong-storm</a>: Run Wukong processors within the Storm framework. Model flows locally before you run them.
|
25
|
+
* <a href="http://github.com/infochimps-labs/wukong-load">wukong-load</a>: Load the output data from your local Wukong jobs and flows into a variety of different data stores.
|
26
|
+
* <a href="http://github.com/infochimps-labs/wonderdog">wonderdog</a>: Connect Wukong processors running within Hadoop to Elasticsearch as either a source or sink for data.
|
27
|
+
* <a href="http://github.com/infochimps-labs/wukong-deploy">wukong-deploy</a>: Orchestrate Wukong and other wu-tools together to support an application running on the Infochimps Platform.
|
28
|
+
|
29
|
+
For a more holistic perspective also see the Infochimps Platform
|
30
|
+
Community Edition (**FIXME: link to this**) which combines all the
|
31
|
+
Wukong tools together into a jetpack which fits comfortably over the
|
32
|
+
shoulders of developers.
|
33
|
+
|
34
|
+
<a name="processors"></a>
|
35
|
+
## Writing Simple Processors
|
36
|
+
|
37
|
+
The fundamental unit of computation in Wukong is the processor. A
|
38
|
+
processor is a Ruby class which
|
39
|
+
|
40
|
+
* subclasses `Wukong::Processor` (use the `Wukong.processor` method as sugar for this)
|
41
|
+
* defines a `process` method which takes an input record, does something, and calls `yield` on the output
|
42
|
+
|
43
|
+
Here's a processor that reverses each of its input records:
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
# in string_reverser.rb
|
47
|
+
Wukong.processor(:string_reverser) do
|
48
|
+
def process string
|
49
|
+
yield string.reverse
|
50
|
+
end
|
51
|
+
end
|
52
|
+
```
|
53
|
+
|
54
|
+
You can run this processor on the command line using text files as
|
55
|
+
input using the `wu-local` tool that comes with Wukong:
|
56
|
+
|
57
|
+
```
|
58
|
+
$ cat novel.txt
|
59
|
+
It was the best of times, it was the worst of times.
|
60
|
+
...
|
61
|
+
|
62
|
+
$ cat novel.txt | wu-local string_reverser.rb
|
63
|
+
.semit fo tsrow eht saw ti ,semit fo tseb eht saw tI
|
64
|
+
```
|
65
|
+
|
66
|
+
The `wu-local` program consumes one line at a time from STDIN and
|
67
|
+
calls your processor's `process` method with that line as a Ruby
|
68
|
+
String object. Each object you `yield` within your process method
|
69
|
+
will be printed back out on STDOUT.
|
70
|
+
|
71
|
+
### Multiple Processors, Multiple (Or No) Yields
|
72
|
+
|
73
|
+
Processors are intended to be combined so they can be stored in the
|
74
|
+
same file like these two, related processors:
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
# in processors.rb
|
78
|
+
|
79
|
+
Wukong.processor(:splitter) do
|
80
|
+
def process line
|
81
|
+
line.split.each { |token| yield token }
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
Wukong.processor(:normalizer) do
|
86
|
+
def process token
|
87
|
+
stripped = token.downcase.gsub(/\W/,'')
|
88
|
+
yield stripped if stripped.size > 0
|
89
|
+
end
|
90
|
+
end
|
91
|
+
```
|
92
|
+
|
93
|
+
Notice how the `splitter` yields multiple tokens for each of its input
|
94
|
+
lines and that the `normalizer` may sometimes never yield at all,
|
95
|
+
depending on its input. Processors are under no obligations by the
|
96
|
+
framework to yield or return anything so they can easily act as
|
97
|
+
filters or even sinks in data flows.
|
98
|
+
|
99
|
+
There are two processors in this file and neither shares a name with
|
100
|
+
the basename of the file ("processors") so `wu-local` can't
|
101
|
+
automatically choose a processor to run. We can specify one
|
102
|
+
explicitly with the `--run` option:
|
103
|
+
|
104
|
+
```
|
105
|
+
$ cat novel.txt | wu-local processors.rb --run=splitter
|
106
|
+
It
|
107
|
+
was
|
108
|
+
the
|
109
|
+
best
|
110
|
+
of
|
111
|
+
times,
|
112
|
+
...
|
113
|
+
```
|
114
|
+
|
115
|
+
We can combine the two processors together
|
116
|
+
|
117
|
+
```
|
118
|
+
$ cat novel.txt | wu-local processors.rb --run=splitter | wu-local processors.rb --run=normalizer
|
119
|
+
it
|
120
|
+
was
|
121
|
+
the
|
122
|
+
best
|
123
|
+
of
|
124
|
+
times
|
125
|
+
...
|
126
|
+
```
|
127
|
+
|
128
|
+
but there's an easier way of doing this with <a href="#flows">dataflows</a>.
|
129
|
+
|
130
|
+
### Adding Configurable Options
|
131
|
+
|
132
|
+
Processors can have options that can be set in Ruby code, from the
|
133
|
+
command-line, a configuration file, or a variety of other places
|
134
|
+
thanks to [Configliere](http://github.com/infochimps-labs/configliere).
|
135
|
+
|
136
|
+
This processor calculates percentiles from observations assuming a
|
137
|
+
normal distribution given a particular mean and standard deviation.
|
138
|
+
It uses two *fields*, the mean or average of a distribution (`mean`)
|
139
|
+
and its standard deviation (`std_dev`). From this information, it
|
140
|
+
will measure the percentile of all input values.
|
141
|
+
|
142
|
+
```ruby
|
143
|
+
# in percentile.rb
|
144
|
+
Wukong.processor(:percentile) do
|
145
|
+
|
146
|
+
SQRT_1_HALF = Math.sqrt(0.5)
|
147
|
+
|
148
|
+
field :mean, Float, :default => 0.0
|
149
|
+
field :std_dev, Float, :default => 1.0
|
150
|
+
|
151
|
+
def process value
|
152
|
+
observation = value.to_f
|
153
|
+
z_score = (mean - observation) / std_dev
|
154
|
+
percentile = 50 * Math.erfc(z_score * SQRT_1_HALF)
|
155
|
+
yield [observation, percentile].join("\t")
|
156
|
+
end
|
157
|
+
end
|
158
|
+
```
|
159
|
+
|
160
|
+
These fields have default values but you can override them on the
|
161
|
+
command line. If you scored a 95 on an exam where the mean score was
|
162
|
+
80 points and the standard deviation of the scores was 10 points, for
|
163
|
+
example, then you'd be in the 93rd percentile:
|
164
|
+
|
165
|
+
```
|
166
|
+
$ echo 95 | wu-local /tmp/percentile.rb --mean=80 --std_dev=10
|
167
|
+
95.0 93.3192798731142
|
168
|
+
```
|
169
|
+
|
170
|
+
If the exam were more difficult, with a mean of 75 points and a
|
171
|
+
standard deviation of 8 points, you'd be in the 99th percentile!
|
172
|
+
|
173
|
+
```
|
174
|
+
$ echo 95 | wu-local /tmp/percentile.rb --mean=75 --std_dev=8
|
175
|
+
95.0 99.37903346742239
|
176
|
+
```
|
177
|
+
|
178
|
+
### The Lifecycle of a Processor
|
179
|
+
|
180
|
+
Processors have a lifecycle that they execute when they are run within
|
181
|
+
the context of a Wukong runner like `wu-local` or `wu-hadoop`. Each
|
182
|
+
lifecycle phase corresponds to a method of the processor that is
|
183
|
+
called:
|
184
|
+
|
185
|
+
* `setup` called *after* the Processor is initialized but *before* the first record is processed. You cannot yield from this method.
|
186
|
+
* `process` called once for each input record, may yield once, many, or no times.
|
187
|
+
* `finalize` called after the *last* record has been processed but while the processor still has an opportunity to yield records.
|
188
|
+
* `stop` called to signal to the processor that all work should stop, open connections should be closed, &c. You cannot yield from this method.
|
189
|
+
|
190
|
+
The above examples have already focused on the `process` method.
|
191
|
+
|
192
|
+
The `setup` and `stop` methods are often used together to handle
|
193
|
+
external connections
|
194
|
+
|
195
|
+
```ruby
|
196
|
+
# in geolocator.rb
|
197
|
+
Wukong.processor(:geolocator) do
|
198
|
+
field :host, String, :default => 'localhost'
|
199
|
+
attr_accessor :connection
|
200
|
+
|
201
|
+
def setup
|
202
|
+
self.connection = Database::Connection.new(host)
|
203
|
+
end
|
204
|
+
def process record
|
205
|
+
record.added_value = connection.find("...some query...")
|
206
|
+
end
|
207
|
+
def stop
|
208
|
+
self.connection.close
|
209
|
+
end
|
210
|
+
end
|
211
|
+
```
|
212
|
+
|
213
|
+
The `finalize` method is most useful when writing a "reduce"-type
|
214
|
+
operation that involves storing or aggregating information till some
|
215
|
+
criterion is met. It will always be called after the last record has
|
216
|
+
been given (to `process`) but you can call it whenever you want to
|
217
|
+
within your own code.
|
218
|
+
|
219
|
+
Here's an example of using the `finalize` method to implement a simple
|
220
|
+
counter that counts all the input records:
|
221
|
+
|
222
|
+
```ruby
|
223
|
+
# in counter.rb
|
224
|
+
Wukong.processor(:counter) do
|
225
|
+
attr_accessor :count
|
226
|
+
def setup
|
227
|
+
self.count = 0
|
228
|
+
end
|
229
|
+
def process thing
|
230
|
+
self.count += 1
|
231
|
+
end
|
232
|
+
def finalize
|
233
|
+
yield count
|
234
|
+
end
|
235
|
+
end
|
236
|
+
```
|
237
|
+
|
238
|
+
It hinges on the fact that the last input record will be passed to
|
239
|
+
`process` *first* and only then will `finalize` be called. This
|
240
|
+
allows the last input record to be counted/processed/aggregated and
|
241
|
+
then the entire aggregate to be dealt with in finalize.
|
242
|
+
|
243
|
+
Because of this emphasis on building and processing aggregates, the
|
244
|
+
`finalize` method is often useful within processors meant to run as
|
245
|
+
reducers in a Hadoop environment.
|
246
|
+
|
247
|
+
Note: `finalize` is not guaranteed to be called in every possible
|
248
|
+
environment as it depends on the chosen runner. In a local or Hadoop
|
249
|
+
environment, the notion of "last record" makes sense and so the
|
250
|
+
corresponding runners will call `finalize`. In an environment like
|
251
|
+
Storm, where the concept of last record is not (supposed to be)
|
252
|
+
meaningful, the corresponding runner doesn't ever call it.
|
253
|
+
|
254
|
+
### Serialization
|
255
|
+
|
256
|
+
`wu-local` (and many similar tools) deal with inputs and outputs as
|
257
|
+
strings.
|
258
|
+
|
259
|
+
Processors want to process objects as close to their domain as is
|
260
|
+
possible. A processor which decorates address book entries with
|
261
|
+
Twitter handles doesn't want to think of its inputs as Strings but
|
262
|
+
Hashes or, better yet, Persons.
|
263
|
+
|
264
|
+
Wukong makes it easy to wrap a processor with other processors
|
265
|
+
dedicated to handling the common tasks of parsing records into or out
|
266
|
+
of formats like JSON and turning them into Ruby model instances.
|
267
|
+
|
268
|
+
#### De-serializing data formats like JSON or TSV
|
269
|
+
|
270
|
+
Wukong can parse and emit common data formats like JSON and delimited
|
271
|
+
formats like TSV or CSV so that you don't pollute or tie down your own
|
272
|
+
processors with protocol logic.
|
273
|
+
|
274
|
+
Here's an example of a processor that wants to deal with Hashes as
|
275
|
+
input.
|
276
|
+
|
277
|
+
```ruby
|
278
|
+
# in extractor.rb
|
279
|
+
Wukong.processor(:extractor) do
|
280
|
+
def process hsh
|
281
|
+
yield hsh["first_name"]
|
282
|
+
end
|
283
|
+
end
|
284
|
+
```
|
285
|
+
|
286
|
+
Given JSON data,
|
287
|
+
|
288
|
+
```
|
289
|
+
$ cat input.json
|
290
|
+
{"first_name": "John", "last_name": "Smith"}
|
291
|
+
{"first_name": "Sally", "last_name": "Johnson"}
|
292
|
+
...
|
293
|
+
```
|
294
|
+
|
295
|
+
you can feed it directly to a processor
|
296
|
+
|
297
|
+
```
|
298
|
+
$ cat input.json | wu-local --from=json extractor.rb
|
299
|
+
John
|
300
|
+
Sally
|
301
|
+
...
|
302
|
+
```
|
303
|
+
|
304
|
+
Other processors really like Arrays:
|
305
|
+
|
306
|
+
```ruby
|
307
|
+
# in summer.rb
|
308
|
+
Wukong.processor(:summer) do
|
309
|
+
def process values
|
310
|
+
yield values.map(&:to_f).inject(&:+)
|
311
|
+
end
|
312
|
+
end
|
313
|
+
```
|
314
|
+
|
315
|
+
so you can feed them TSV data
|
316
|
+
```
|
317
|
+
$ cat data.tsv
|
318
|
+
1 2 3
|
319
|
+
4 5 6
|
320
|
+
7 8 9
|
321
|
+
...
|
322
|
+
$ cat data.tsv | wu-local --from=tsv summer.rb
|
323
|
+
6
|
324
|
+
15
|
325
|
+
24
|
326
|
+
...
|
327
|
+
```
|
328
|
+
|
329
|
+
but you can just as easily use the same code with CSV data
|
330
|
+
|
331
|
+
```
|
332
|
+
$ cat data.csv | wu-local --from=csv summer.rb
|
333
|
+
```
|
334
|
+
|
335
|
+
or a more general delimited format.
|
336
|
+
|
337
|
+
```
|
338
|
+
$ cat data.tsv | wu-local --from=delimited --delimiter='--' summer.rb
|
339
|
+
```
|
340
|
+
|
341
|
+
#### Recordizing data structures into domain models
|
342
|
+
|
343
|
+
Here's a contact validator that relies on a Person model to decide
|
344
|
+
whether a contact entry should be yielded:
|
345
|
+
|
346
|
+
```ruby
|
347
|
+
# in contact_validator.rb
|
348
|
+
require 'person'
|
349
|
+
|
350
|
+
Wukong.processor(:contact_validator) do
|
351
|
+
def process person
|
352
|
+
yield person if person.valid?
|
353
|
+
end
|
354
|
+
end
|
355
|
+
```
|
356
|
+
|
357
|
+
Relying on the (elsewhere-defined) Person model to define `valid?`
|
358
|
+
means the processor can stay skinny and readable. Wukong can, in
|
359
|
+
combination with the deserializing features above, turn input text
|
360
|
+
into instances of Person:
|
361
|
+
|
362
|
+
```
|
363
|
+
$ cat input.json | wu-local --consumes=Person --from=json contact_validator.rb
|
364
|
+
#<Person:0x000000020e6120>
|
365
|
+
#<Person:0x000000020e6120>
|
366
|
+
#<Person:0x000000020e6120>
|
367
|
+
```
|
368
|
+
|
369
|
+
`wu-local` can also serialize records from the `contact_validator`
|
370
|
+
processor:
|
371
|
+
|
372
|
+
```
|
373
|
+
$ cat input.json | wu-local --consumes=Person --from=json contact_validator.rb --to=json
|
374
|
+
{"first_name": "John", "last_name": "Smith", "valid": "true"}
|
375
|
+
{"first_name": "Sally", "last_name": "Johnson", "valid": "true"}
|
376
|
+
...
|
377
|
+
```
|
378
|
+
|
379
|
+
Serialization formats work just like deserialization formats, with
|
380
|
+
JSON as well as delimited formats available.
|
381
|
+
|
382
|
+
Parsing records into model instances and serializing them out again
|
383
|
+
puts constraints on the model class providing these instances. Here's
|
384
|
+
what the `Person` class needs to look like:
|
385
|
+
|
386
|
+
|
387
|
+
```ruby
|
388
|
+
# in person.rb
|
389
|
+
class Person
|
390
|
+
|
391
|
+
# Create a new Person from the given attributes. Supports usage of
|
392
|
+
# the `--consumes` flag on the command-line
|
393
|
+
#
|
394
|
+
# @param [Hash] attrs
|
395
|
+
# @return [Person]
|
396
|
+
def self.receive attrs
|
397
|
+
new(attrs)
|
398
|
+
end
|
399
|
+
|
400
|
+
# Turn this Person into a basic data structure. Supports the usage
|
401
|
+
# of the `--to` flag on the command-line.
|
402
|
+
#
|
403
|
+
# @return [Hash]
|
404
|
+
def to_wire
|
405
|
+
to_hash
|
406
|
+
end
|
407
|
+
end
|
408
|
+
```
|
409
|
+
|
410
|
+
To support the `--consumes=Person` syntax, the `receive` class method
|
411
|
+
must take a Hash produced from the operation of the `--from` argument
|
412
|
+
and return a `Person` instance.
|
413
|
+
|
414
|
+
To support the `--to=json` syntax, the `Person` class must implement
|
415
|
+
the `to_wire` instance method.
|
416
|
+
|
417
|
+
### Logging and Notifications
|
418
|
+
|
419
|
+
Wukong comes with a logger that all processors have access to via
|
420
|
+
their `log` attribute. This logger has the following priorities:
|
421
|
+
|
422
|
+
* debug (can be set as a log level)
|
423
|
+
* info (can be set as a log level)
|
424
|
+
* warn (can be set as a log level)
|
425
|
+
* error
|
426
|
+
* fatal
|
427
|
+
|
428
|
+
and here's a processor which uses them all
|
429
|
+
|
430
|
+
```ruby
|
431
|
+
# in logs.rb
|
432
|
+
Wukong.processor(:logs) do
|
433
|
+
def process line
|
434
|
+
log.debug line
|
435
|
+
log.info line
|
436
|
+
log.warn line
|
437
|
+
log.error line
|
438
|
+
log.fatal line
|
439
|
+
end
|
440
|
+
end
|
441
|
+
```
|
442
|
+
|
443
|
+
The default log level is DEBUG.
|
444
|
+
|
445
|
+
```
|
446
|
+
$ echo something | wu-local logs.rb
|
447
|
+
DEBUG 2013-01-11 23:40:56 [Logs ] -- something
|
448
|
+
INFO 2013-01-11 23:40:56 [Logs ] -- something
|
449
|
+
WARN 2013-01-11 23:40:56 [Logs ] -- something
|
450
|
+
ERROR 2013-01-11 23:40:56 [Logs ] -- something
|
451
|
+
FATAL 2013-01-11 23:40:56 [Logs ] -- something
|
452
|
+
```
|
453
|
+
|
454
|
+
though you can set it to something else globally
|
455
|
+
|
456
|
+
```
|
457
|
+
$ echo something | wu-local logs.rb --log.level=warn
|
458
|
+
WARN 2013-01-11 23:40:56 [Logs ] -- something
|
459
|
+
ERROR 2013-01-11 23:40:56 [Logs ] -- something
|
460
|
+
FATAL 2013-01-11 23:40:56 [Logs ] -- something
|
461
|
+
```
|
462
|
+
|
463
|
+
or on a per-class basis.
|
464
|
+
|
465
|
+
### Creating Documentation
|
466
|
+
|
467
|
+
`wu-local` includes a help message:
|
468
|
+
|
469
|
+
```
|
470
|
+
$ wu-local --help
|
471
|
+
usage: wu-local [ --param=val | --param | -p val | -p ] PROCESSOR|FLOW
|
472
|
+
|
473
|
+
wu-local is a tool for running Wukong processors and flows locally on
|
474
|
+
the command-line. Use wu-local by passing it a processor and feeding
|
475
|
+
...
|
476
|
+
|
477
|
+
|
478
|
+
Params:
|
479
|
+
-r, --run=String Name of the processor or dataflow to use. Defaults to basename of the given path.
|
480
|
+
```
|
481
|
+
|
482
|
+
You can generate custom help messages for your own processors. Here's
|
483
|
+
the percentile processor from before but made more usable with good
|
484
|
+
documentation:
|
485
|
+
|
486
|
+
```ruby
|
487
|
+
# in percentile.rb
|
488
|
+
Wukong.processor(:percentile) do
|
489
|
+
|
490
|
+
description <<-EOF.gsub(/^ {2}/,'')
|
491
|
+
This processor calculates percentiles from input scores based on a
|
492
|
+
given mean score and a given standard deviation for the scores.
|
493
|
+
|
494
|
+
The mean and standard deviation are given at run time and processed
|
495
|
+
scores will be compared against the given mean and standard
|
496
|
+
deviation.
|
497
|
+
|
498
|
+
The input is expected to consist of float values, one per line.
|
499
|
+
|
500
|
+
Example:
|
501
|
+
|
502
|
+
$ cat input.dat
|
503
|
+
88
|
504
|
+
89
|
505
|
+
77
|
506
|
+
...
|
507
|
+
|
508
|
+
$ cat input.dat | wu-local percentile.rb --mean=85 --std_dev=7
|
509
|
+
88.0 66.58824291023753
|
510
|
+
89.0 71.61454169013237
|
511
|
+
77.0 12.654895447355777
|
512
|
+
EOF
|
513
|
+
|
514
|
+
SQRT_1_HALF = Math.sqrt(0.5)
|
515
|
+
|
516
|
+
field :mean, Float, :default => 0.0, :doc => "The mean of the assumed distribution"
|
517
|
+
field :std_dev, Float, :default => 1.0, :doc => "The standard deviation of the assumed distribution"
|
518
|
+
|
519
|
+
def process value
|
520
|
+
observation = value.to_f
|
521
|
+
z_score = (mean - observation) / std_dev
|
522
|
+
percentile = 50 * Math.erfc(z_score * SQRT_1_HALF)
|
523
|
+
yield [observation, percentile].join("\t")
|
524
|
+
end
|
525
|
+
end
|
526
|
+
```
|
527
|
+
|
528
|
+
If you call `wu-local` with the file to this processor as an argument
|
529
|
+
in addition to the original `--help` argument, you'll get custom
|
530
|
+
documentation.
|
531
|
+
|
532
|
+
```
|
533
|
+
$ wu-local percentile.rb --help
|
534
|
+
usage: wu-local [ --param=val | --param | -p val | -p ] PROCESSOR|FLOW
|
535
|
+
|
536
|
+
This processor calculates percentiles from input scores based on a
|
537
|
+
given mean score and a given standard deviation for the scores.
|
538
|
+
...
|
539
|
+
|
540
|
+
|
541
|
+
Params:
|
542
|
+
--mean=Float The mean of the assumed distribution [Default: 0.0]
|
543
|
+
-r, --run=String Name of the processor or dataflow to use. Defaults to basename of the given path.
|
544
|
+
--std_dev=Float The standard deviation of the assumed distribution [Default: 1.0]
|
545
|
+
|
546
|
+
```
|
547
|
+
|
548
|
+
<a name="flows"></a>
|
549
|
+
## Combining Processors into Dataflows
|
550
|
+
|
551
|
+
Wukong provides a DSL for combining processors together into
|
552
|
+
dataflows. This DSL is designed to make it easy to replicate the
|
553
|
+
tried and true UNIX philosophy of building simple tools which do one
|
554
|
+
thing well and then combining them together to create more complicated
|
555
|
+
flows.
|
556
|
+
|
557
|
+
For example, having written the `tokenizer` processor, we can use it
|
558
|
+
in a dataflow along with the built-in `regexp` processor to replicate
|
559
|
+
what we did in the last example:
|
560
|
+
|
561
|
+
```ruby
|
562
|
+
# in find_t_words.rb
|
563
|
+
require_relative('processors')
|
564
|
+
Wukong.dataflow(:find_t_words) do
|
565
|
+
tokenizer | regexp(match: /^t/)
|
566
|
+
end
|
567
|
+
```
|
568
|
+
|
569
|
+
The `|` operator connects the output of one processor (what it
|
570
|
+
`yield`s) with the input of another (its `process` method). In this
|
571
|
+
example, every record emitted by `tokenizer` will be subsequently
|
572
|
+
processed by `regexp`.
|
573
|
+
|
574
|
+
You can run this dataflow directly (mimicking what we did above with
|
575
|
+
single processors chained together on the command-line):
|
576
|
+
|
577
|
+
```
|
578
|
+
$ cat novel.txt | wu-local find_t_words.rb
|
579
|
+
the
|
580
|
+
times
|
581
|
+
the
|
582
|
+
times
|
583
|
+
...
|
584
|
+
```
|
585
|
+
|
586
|
+
### More complicated dataflow topologies
|
587
|
+
|
588
|
+
The Wukong dataflow DSL allows for more complicated topologies than
|
589
|
+
just chaining processors together in a linear pipeline.
|
590
|
+
|
591
|
+
The `|` operator, used in the above examples to connect two processors
|
592
|
+
together into a chain, can also be used to connect a single processor
|
593
|
+
to *multiple* processors, creating a branch-point in the dataflow.
|
594
|
+
Each branch of the flow will receive the same records.
|
595
|
+
|
596
|
+
This can be used to perform multiple actions with the same record, as
|
597
|
+
in the following example:
|
598
|
+
|
599
|
+
```ruby
|
600
|
+
# in book_reviews.rb
|
601
|
+
Wukong.dataflow(:complicated) do
|
602
|
+
from_json | recordize(model: BookReview) |
|
603
|
+
[
|
604
|
+
map(&:author) | do_author_stuff | ... | to_json,
|
605
|
+
map(&:book) | do_book_stuff | ... | to_json,
|
606
|
+
]
|
607
|
+
end
|
608
|
+
```
|
609
|
+
|
610
|
+
Each `BookReview` record yielded by the `recordize` processor will be
|
611
|
+
passed to both subsequent branches of the flow, with each branch doing
|
612
|
+
a different kind of processing. Output records from both branches
|
613
|
+
(which are here turned `to_json` first) will be interspersed in the
|
614
|
+
final output when run.
|
615
|
+
|
616
|
+
A processor like `select`, which filters its inputs, can be used to
|
617
|
+
split a flow into records of two types:
|
618
|
+
|
619
|
+
```ruby
|
620
|
+
# in complicated.rb
|
621
|
+
Wukong.dataflow(:complicated) do
|
622
|
+
from_json | parser |
|
623
|
+
[
|
624
|
+
select(&:valid?) | further_processing | ... | to_json,
|
625
|
+
select(&:invalid?) | track_errors | null
|
626
|
+
]
|
627
|
+
end
|
628
|
+
```
|
629
|
+
|
630
|
+
Here, only records which respond true to the method `valid?` will pass
|
631
|
+
through the first flow (applying `further_processing` and so on) while
|
632
|
+
only records which respond true to `invalid?` will pass through the
|
633
|
+
second flow (with `track_errors`). The `null` processor at the end of
|
634
|
+
this second branch ensures that only records from the first branch
|
635
|
+
will be emitted in the final output.
|
636
|
+
|
637
|
+
Flows can be split over and over again, allowing for rich semantics
|
638
|
+
when processing an input source:
|
639
|
+
|
640
|
+
```ruby
|
641
|
+
# in many_splits.rb
|
642
|
+
Wukong.dataflow(:many_splits) do
|
643
|
+
from_json | parser | recordize(model: BookReview) |
|
644
|
+
[
|
645
|
+
map(&:author) | ... | to_json,
|
646
|
+
map(&:publisher) |
|
647
|
+
[
|
648
|
+
select(&:domestic?) | ... | to_json,
|
649
|
+
select(&:international?) |
|
650
|
+
[
|
651
|
+
select(&:north_american?) | ... |
|
652
|
+
[
|
653
|
+
select(&:american?) | ... | to_json,
|
654
|
+
select(&:canadian?) | ... | to_json,
|
655
|
+
select(&:mexican?) | ... | to_json,
|
656
|
+
],
|
657
|
+
select(&:asian?) | ... | to_json,
|
658
|
+
select(&:european?) | ... | to_json,
|
659
|
+
],
|
660
|
+
],
|
661
|
+
map(&:title) | ... | to_json
|
662
|
+
]
|
663
|
+
end
|
664
|
+
```
|
665
|
+
|
666
|
+
<a name="serialization"></a>
|
667
|
+
## Serialization
|
668
|
+
|
669
|
+
The process method for a Processor must accept a String argument and
|
670
|
+
yield a String argument (or something that will `to_s` appropriately).
|
671
|
+
|
672
|
+
**Coming Soon:** The ability to define `consumes` and `emits` to
|
673
|
+
automatically handle serialization and deserialization.
|
674
|
+
|
675
|
+
<a name="widgets"></a>
|
676
|
+
## Widgets
|
677
|
+
|
678
|
+
Wukong has a number of built-in widgets that are useful for
|
679
|
+
scaffolding your dataflows or using as starting off points for your
|
680
|
+
own processors.
|
681
|
+
|
682
|
+
For any of these widgets you can get customized help, say
|
683
|
+
|
684
|
+
```
|
685
|
+
$ wu-local group --help
|
686
|
+
```
|
687
|
+
|
688
|
+
### Serializers
|
689
|
+
|
690
|
+
Serializers are widgets which don't change the semantic meaning of a
|
691
|
+
record, merely its representation. Here's a list:
|
692
|
+
|
693
|
+
* `to_json`, `from_json` for turning records into JSON or parsing JSON into records
|
694
|
+
* `to_tsv`, `from_tsv` for turning Array records into TSV or parsing TSV into Array records
|
695
|
+
* `pretty` for pretty printing JSON inputs
|
696
|
+
|
697
|
+
When you're writing processors that are capable of running in
|
698
|
+
isolation you'll want to ensure that you deserialize and serialize
|
699
|
+
records on the way in and out, using the serialization/deserialization
|
700
|
+
options `--to` and `--from` on the command-line, as <a
|
701
|
+
href="#serialization">defined above</a>.
|
702
|
+
|
703
|
+
For processors which will only run inside a data flow, you can
|
704
|
+
optimize by not doing any (de)serialization except at the very
|
705
|
+
beginning and at the end:
|
706
|
+
|
707
|
+
```ruby
|
708
|
+
Wukong.dataflow(:complicated) do
|
709
|
+
from_json | proc_1 | proc_2 | proc_3 ... proc_n | to_json
|
710
|
+
end
|
711
|
+
```
|
712
|
+
|
713
|
+
In this approach, no serialization will be done between processors,
|
714
|
+
only at the beginning and end.
|
715
|
+
|
716
|
+
(This is actually the implementation behind the serialization options
|
717
|
+
themselves -- they dynamically prepend/append the appropriate
|
718
|
+
deserializers/serializers.)
|
719
|
+
|
720
|
+
### General Purpose
|
721
|
+
|
722
|
+
There are several general purpose processors which implement common
|
723
|
+
patterns on input and output data. These are most useful within the
|
724
|
+
context of a dataflow definition.
|
725
|
+
|
726
|
+
* `null` does what you think it doesn't
|
727
|
+
* `map` performs some block on each input record
|
728
|
+
* `flatten` flatten the input array
|
729
|
+
* `filter`, `select`, `reject` only let certain records through based on a block
|
730
|
+
* `regexp`, `not_regexp` only pass records matching (or not matching) a regular expression
|
731
|
+
* `limit` only let some number of records pass
|
732
|
+
* `logger` send events to the local log stream
|
733
|
+
* `extract` extract some part of each input event
|
734
|
+
|
735
|
+
Some of these widgets can be used directly, perhaps with some
|
736
|
+
arguments
|
737
|
+
|
738
|
+
```ruby
|
739
|
+
Wukong.processor(:log_everything) do
|
740
|
+
proc_1 | proc_2 | ... | logger
|
741
|
+
end
|
742
|
+
|
743
|
+
Wukong.processor(:log_everything_important) do
|
744
|
+
proc_1 | proc_2 | ... | regexp(match: /important/i) | logger
|
745
|
+
end
|
746
|
+
```
|
747
|
+
|
748
|
+
Other widgets require a block to define their action:
|
749
|
+
|
750
|
+
```ruby
|
751
|
+
Wukong.processor(:log_everything_important) do
|
752
|
+
parser | select { |record| record.priority =~ /important/i } | logger
|
753
|
+
end
|
754
|
+
```
|
755
|
+
|
756
|
+
### Reducers
|
757
|
+
|
758
|
+
There is a selection of widgets that do aggregative operations like
|
759
|
+
counting, sorting, and summing.
|
760
|
+
|
761
|
+
* `count` emits a final count of all input records
|
762
|
+
* `sort` can sort input streams
|
763
|
+
* `group` will group records by some extracted part and give a count of each group's size
|
764
|
+
* `moments` will emit more complicated statistics (mean, std. dev.) on the group given some other value to measure
|
765
|
+
|
766
|
+
Here's an example of sorting data right on the command line
|
767
|
+
|
768
|
+
```
|
769
|
+
$ head tokens.txt | wu-local sort
|
770
|
+
abhor
|
771
|
+
abide
|
772
|
+
abide
|
773
|
+
able
|
774
|
+
able
|
775
|
+
able
|
776
|
+
about
|
777
|
+
...
|
778
|
+
```
|
779
|
+
|
780
|
+
Try adding group:
|
781
|
+
|
782
|
+
```
|
783
|
+
$ head tokens.txt | wu-local sort | wu-local group
|
784
|
+
{:group=>"abhor", :count=>1}
|
785
|
+
{:group=>"abide", :count=>2}
|
786
|
+
{:group=>"able", :count=>3}
|
787
|
+
{:group=>"about", :count=>3}
|
788
|
+
{:group=>"above", :count=>1}
|
789
|
+
...
|
790
|
+
```
|
791
|
+
|
792
|
+
You can also use these within a more complicated dataflow:
|
793
|
+
|
794
|
+
```ruby
|
795
|
+
Wukong.dataflow(:word_count) do
|
796
|
+
tokenize | remove_stopwords | sort | group
|
797
|
+
end
|
798
|
+
```
|
799
|
+
|
800
|
+
## Commands
|
801
|
+
|
802
|
+
Wukong comes with a few commands built-in.
|
803
|
+
|
804
|
+
### wu-local
|
805
|
+
|
806
|
+
You've seen one already, `wu-local`, in many of the examples above.
|
807
|
+
`wu-local` is used to model dataflows locally, using `STDIN` and
|
808
|
+
`STDOUT` for input and output.
|
809
|
+
|
810
|
+
`wu-local` is a "core" Wukong command in the sense that more
|
811
|
+
complicated commands like `wu-hadoop` and `wu-storm`, implemented by
|
812
|
+
Wukong plugins, ultimately invoke some `wu-local` process.
|
813
|
+
|
814
|
+
### wu-source
|
815
|
+
|
816
|
+
Wukong also comes with another basic command `wu-source`. This
|
817
|
+
command works very similarly to `wu-local` except that it doesn't read
|
818
|
+
any input from `STDIN`. Instead it generates its *own* input records
|
819
|
+
in an easy to configure, periodic way. It thus acts as a *source* of
|
820
|
+
data for other processes in a UNIX pipeline.
|
821
|
+
|
822
|
+
Here's an example using the `identity` processor which will have the
|
823
|
+
effect of printing to `STDOUT` the exact input received:
|
824
|
+
|
825
|
+
```
|
826
|
+
$ wu-source identity
|
827
|
+
1
|
828
|
+
2
|
829
|
+
3
|
830
|
+
...
|
831
|
+
```
|
832
|
+
|
833
|
+
From this example it's clear that the records produced by `wu-source`
|
834
|
+
are consecutive integers starting at 1 and that they are produced at a
|
835
|
+
rate of one record per second.
|
836
|
+
|
837
|
+
`wu-source` can thus be used to turn any processor (or dataflow) into
|
838
|
+
a source of data:
|
839
|
+
|
840
|
+
```ruby
|
841
|
+
# in random_numbers.rb
|
842
|
+
Wukong.processor(:random_numbers) do
|
843
|
+
def process index
|
844
|
+
yield rand() * index.to_i
|
845
|
+
end
|
846
|
+
end
|
847
|
+
```
|
848
|
+
|
849
|
+
Run `random_numbers` like this:
|
850
|
+
|
851
|
+
```
|
852
|
+
$ wu-source random_numbers.rb
|
853
|
+
0.7671364694830113
|
854
|
+
0.5958089791553307
|
855
|
+
1.8284806932633886
|
856
|
+
3.707189931235327
|
857
|
+
4.106618048255548
|
858
|
+
...
|
859
|
+
```
|
860
|
+
|
861
|
+
Which produces random numbers with an ever greater ceiling.
|
862
|
+
|
863
|
+
You can also completely ignore the input record from `wu-source` in
|
864
|
+
your processor:
|
865
|
+
|
866
|
+
```ruby
|
867
|
+
# in generator.rb
|
868
|
+
Wukong.processor(:generator) do
|
869
|
+
def process _
|
870
|
+
yield new_record
|
871
|
+
end
|
872
|
+
def new_record
|
873
|
+
MyRecord.new(...)
|
874
|
+
end
|
875
|
+
end
|
876
|
+
```
|
877
|
+
|
878
|
+
which can produce `MyRecord` instances as it's driven by `wu-source`.
|
879
|
+
|
880
|
+
It's easy to generate several thousand events per second using
|
881
|
+
`wu-source` this way:
|
882
|
+
|
883
|
+
```
|
884
|
+
$ wu-source generator.rb --per_sec=2000
|
885
|
+
```
|
886
|
+
|
887
|
+
or use the `--period` option (which is the inverse of `--per_sec`) to spit
|
888
|
+
out records at a regular interval (every 5 minutes in this example):
|
889
|
+
|
890
|
+
```
|
891
|
+
$ wu-source generator.rb --period=300
|
892
|
+
```
|
893
|
+
|
894
|
+
`wu-source` can naturally combine with other dataflows or programs you
|
895
|
+
might write:
|
896
|
+
|
897
|
+
```
|
898
|
+
$ wu-source generator.rb --per_sec=200 | wu-local my_flow
|
899
|
+
```
|
900
|
+
### wu
|
901
|
+
|
902
|
+
The `wu` command is a convenience command useful when using any of the
|
903
|
+
other `wu-` commands in the context of a Ruby project with a
|
904
|
+
[`Gemfile`](http://bundler.io/v1.3/gemfile.html).
|
905
|
+
|
906
|
+
Instead of typing
|
907
|
+
|
908
|
+
```
|
909
|
+
$ bundle exec wu-local my_flow --option=value ...
|
910
|
+
```
|
911
|
+
|
912
|
+
which would run `wu-local` using the exact version of `wukong` (and
|
913
|
+
any other dependencies) as declared in your project's `Gemfile` and
|
914
|
+
`Gemfile.lock`, the `wu` command lets you type
|
915
|
+
|
916
|
+
```
|
917
|
+
$ wu local my_flow --option=value ...
|
918
|
+
```
|
919
|
+
|
920
|
+
essentially adding the `bundle exec` prefix and munging `wu local` to
|
921
|
+
`wu-local` for you. This can be very helpful when doing lots of work
|
922
|
+
with Wukong.
|
923
|
+
|
924
|
+
**Note:** If `bundle exec wu-whatever` works in your project but `wu
|
925
|
+
whatever` fails it is probably because Bundler is resolving `wu-`
|
926
|
+
commands to some installation that is not on your `$PATH` (often the
|
927
|
+
case if you ran `bundle install --standalone`). Ensure that the
|
928
|
+
`wukong` gem is installed on your system and that its binaries are on
|
929
|
+
your `$PATH` to use the `wu` command.
|
930
|
+
|
931
|
+
## Testing
|
932
|
+
|
933
|
+
Wukong comes with several helpers to make writing specs using
|
934
|
+
[RSpec](http://rspec.info/) easier.
|
935
|
+
|
936
|
+
The only method that you need to test in a Processor is the `process`
|
937
|
+
method. The rest of the processor's methods and functionality are
|
938
|
+
provided by Wukong and are already tested.
|
939
|
+
|
940
|
+
You may want to test this process method in two ways:
|
941
|
+
|
942
|
+
* unit tests of the class itself in various contexts
|
943
|
+
* integration tests of running the class with the `wu-local` (or other) command-line runner
|
944
|
+
|
945
|
+
### Unit Tests
|
946
|
+
|
947
|
+
Let's start with a simple processor
|
948
|
+
|
949
|
+
```ruby
|
950
|
+
# in tokenizer.rb
|
951
|
+
Wukong.processor(:tokenizer) do
|
952
|
+
def process text
|
953
|
+
text.downcase.gsub(/[^\s\w]/,'').split.each do |token|
|
954
|
+
yield token
|
955
|
+
end
|
956
|
+
end
|
957
|
+
end
|
958
|
+
```
|
959
|
+
|
960
|
+
You could test this processor directly:
|
961
|
+
|
962
|
+
```ruby
|
963
|
+
# in spec/tokenizer_spec.rb
|
964
|
+
require 'spec_helper'
|
965
|
+
describe :tokenizer do
|
966
|
+
subject { Wukong::Processor::Tokenizer.new }
|
967
|
+
before { subject.setup }
|
968
|
+
after { subject.finalize ; subject.stop }
|
969
|
+
it "correctly counts tokens" do
|
970
|
+
expect { |b| subject.process("Hi there, Wukong!", &b) }.to yield_successive_args('hi', 'there', 'wukong')
|
971
|
+
end
|
972
|
+
end
|
973
|
+
```
|
974
|
+
|
975
|
+
but having to handle the yield from the block yourself can lead to
|
976
|
+
verbose and unreadable tests. Wukong defines some helpers for this
|
977
|
+
case. Require and include them first in your `spec_helper.rb`:
|
978
|
+
|
979
|
+
```ruby
|
980
|
+
# spec/spec_helper.rb
|
981
|
+
require 'wukong'
|
982
|
+
require 'wukong/spec_helpers'
|
983
|
+
RSpec.configure do |config|
|
984
|
+
config.include(Wukong::SpecHelpers)
|
985
|
+
end
|
986
|
+
```
|
987
|
+
|
988
|
+
and then use them in your test
|
989
|
+
|
990
|
+
```ruby
|
991
|
+
# in spec/tokenizer_spec.rb
|
992
|
+
require 'spec_helper'
|
993
|
+
describe :tokenizer do
|
994
|
+
it_behaves_like 'a processor', :named => :tokenizer
|
995
|
+
it "emits the correct number of tokens" do
|
996
|
+
processor.given("Hi there.\nMy name is Wukong!").should emit(6).records
|
997
|
+
end
|
998
|
+
it "eliminates all punctuation" do
|
999
|
+
processor(:tokenizer).given("Never!").should emit('Never')
|
1000
|
+
end
|
1001
|
+
it "will not emit tokens in a stop list" do
|
1002
|
+
processor(:tokenizer, :stop_list => ['apples', 'bananas']).given("I like apples and bananas").should emit('I', 'like', 'and')
|
1003
|
+
end
|
1004
|
+
end
|
1005
|
+
```
|
1006
|
+
|
1007
|
+
Let's look at each kind of helper:
|
1008
|
+
|
1009
|
+
* The `a processor` shared example (invoked with RSpec's
|
1010
|
+
`it_behaves_like` helper) adds some tests that ensure that the
|
1011
|
+
processor conforms to the API of a Wukong::Processor.
|
1012
|
+
|
1013
|
+
* The `processor` method is actually an alias for the more aptly named
|
1014
|
+
(but less convenient) `unit_test_runner`. This method accepts a
|
1015
|
+
processor name and options (just like `wu-local` and other
|
1016
|
+
command-line tools) and returns a Wukong::UnitTestRunner instance.
|
1017
|
+
This runner handles the
|
1018
|
+
|
1019
|
+
|
1020
|
+
a (registered) processor name and options and creates a new
|
1021
|
+
processor. If no name is given, the argument of the enclosing
|
1022
|
+
`describe` or `context` block is used. The object returned by
|
1023
|
+
`processor` is the Wukong::Processor you're testing so you can
|
1024
|
+
directly introspect on it or declare expectations about its
|
1025
|
+
behavior.
|
1026
|
+
|
1027
|
+
* The `given` method (and other helpers like `given_json`,
|
1028
|
+
`given_tsv`, &c.) is a method on the runner. It's a way of lazily
|
1029
|
+
feeding records to a processor, without having to go through the
|
1030
|
+
`process` method directly and having to handle the block or the
|
1031
|
+
processor's lifecycle as in the prior example.
|
1032
|
+
|
1033
|
+
* The `output` and `emit` matchers will `process` all previously
|
1034
|
+
`given` records when they are called. This lets you separate
|
1035
|
+
instantiation, input, expectations, and output. Here's a more
|
1036
|
+
complicated example.
|
1037
|
+
|
1038
|
+
The same helpers can be used to test dataflows as well as
|
1039
|
+
processors.
|
1040
|
+
|
1041
|
+
####
|
1042
|
+
|
1043
|
+
#### Functions vs. Objects
|
1044
|
+
|
1045
|
+
The above test helpers are designed to aid in testing processors
|
1046
|
+
functionally because:
|
1047
|
+
|
1048
|
+
* they accept the
|
1049
|
+
|
1050
|
+
### Integration Tests
|
1051
|
+
|
1052
|
+
If you are implementing a new Wukong command (akin to `wu-local`) then
|
1053
|
+
you may also want to run integration tests. Wukong comes with helpers
|
1054
|
+
for these, too.
|
1055
|
+
|
1056
|
+
You should almost always be able to test your processors without
|
1057
|
+
integration tests. Your unit tests and the Wukong framework itself
|
1058
|
+
should ensure that your processors work correctly no matter what
|
1059
|
+
environment they are deployed in.
|
1060
|
+
|
1061
|
+
```ruby
|
1062
|
+
# spec/integration/tokenizer_spec.rb
|
1063
|
+
context "running the tokenizer with wu-local" do
|
1064
|
+
subject { command("wu-local tokenizer") < "hi there" }
|
1065
|
+
it { should exit_with(0) }
|
1066
|
+
it { should have_stdout("hi", "there") }
|
1067
|
+
end
|
1068
|
+
|
1069
|
+
context "interpreting its arguments" do
|
1070
|
+
context "with a valid --match argument" do
|
1071
|
+
subject { command("wu-local tokenizer --match='^hi'") < "hi there" }
|
1072
|
+
it { should exit_with(0) }
|
1073
|
+
it { should have_stdout("hi") }
|
1074
|
+
it { should_not have_stdout("there") }
|
1075
|
+
end
|
1076
|
+
context "with a malformed --match argument" do
|
1077
|
+
# invalid b/c the regexp is broken...
|
1078
|
+
subject { command("wu-local tokenizer --match='^(h'") < "hi there" }
|
1079
|
+
it { should exit_with(:non_zero) }
|
1080
|
+
it { should have_stderr(/invalid/) }
|
1081
|
+
end
|
1082
|
+
end
|
1083
|
+
```
|
1084
|
+
|
1085
|
+
Let's go through the helpers:
|
1086
|
+
|
1087
|
+
* The `command` helper creates a wrapper around a command-line that will be launched. The command's environment and working directory will be taken from the current values of `ENV` and `Dir.pwd`, unless overridden in one of the following ways:
|
1088
|
+
|
1089
|
+
* The `in` or `using` arguments are chained with `command` to specify the working directory and environment:
|
1090
|
+
|
1091
|
+
```ruby
|
1092
|
+
command("some-command with --args").in("/my/working/directory").using("THIS" => "ENV_HASH", "WILL_BE" => "MERGED_OVER_EXISTING_ENV")
|
1093
|
+
```
|
1094
|
+
|
1095
|
+
* The scope in which the `command` helper is called defines methods `integration_cwd` and `integration_env`. This can be done through including a module in your `spec_helper.rb`:
|
1096
|
+
|
1097
|
+
```ruby
|
1098
|
+
# in spec/support/integration_helper.rb
|
1099
|
+
module IntegrationHelper
|
1100
|
+
def integration_cwd
|
1101
|
+
"/my/working/directory"
|
1102
|
+
end
|
1103
|
+
def integration_env
|
1104
|
+
{ "THIS" => "ENV_HASH", "WILL_BE" => "MERGED_OVER_EXISTING_ENV" }
|
1105
|
+
end
|
1106
|
+
end
|
1107
|
+
|
1108
|
+
# in spec/spec_helper.rb
|
1109
|
+
require_relative("support/integration_helper")
|
1110
|
+
RSpec.configure do |config|
|
1111
|
+
config.include(IntegrationHelper)
|
1112
|
+
end
|
1113
|
+
```
|
1114
|
+
|
1115
|
+
* The `command` helper can accept input with the `<` method. Input can be either a String or an Array of strings. It will be passed to the command over STDIN.
|
1116
|
+
|
1117
|
+
* The `have_stdout` and `have_stderr` matchers let you test the STDOUT or STDERR of the command for particular strings or regular expressions.
|
1118
|
+
|
1119
|
+
* The `exit_with` matcher lets you test the exit code of the command. You can pass the symbol `:non_zero` to set the expectation of _any_ non-zero exit code.
|
1120
|
+
|
1121
|
+
## Plugins
|
1122
|
+
|
1123
|
+
Wukong has a built-in plugin framework to make it easy to adapt Wukong
|
1124
|
+
processors to new backends or add other functionality. The
|
1125
|
+
`Wukong::Local` module and the `wu-local` program it supports is
|
1126
|
+
itself a Wukong plugin.
|
1127
|
+
|
1128
|
+
The following shows how you might build a simplified version of
|
1129
|
+
`Wukong::Local` as a new plugin. We'll call this plugin `Cat` as it
|
1130
|
+
will implement a program `wu-cat` that is similar in function to
|
1131
|
+
`wu-local` (just simplified).
|
1132
|
+
|
1133
|
+
The first thing to do is include the `Wukong::Plugin` module in your
|
1134
|
+
code:
|
1135
|
+
|
1136
|
+
|
1137
|
+
```Ruby
|
1138
|
+
# in lib/cat.rb
|
1139
|
+
#
|
1140
|
+
# This Wukong plugin works like wu-local but replicates some silly
|
1141
|
+
# features of cat like numbered lines.
|
1142
|
+
module Cat
|
1143
|
+
|
1144
|
+
# This registers Cat as a Wukong plugin.
|
1145
|
+
include Wukong::Plugin
|
1146
|
+
|
1147
|
+
# Defines any settings specific to Cat. Cat doesn't need to, but
|
1148
|
+
# you can define global settings here if you want. You can also
|
1149
|
+
# check the `program` name to decide whether to apply your settings.
|
1150
|
+
# This helps you not pollute other commands with your stuff.
|
1151
|
+
def self.configure settings, program
|
1152
|
+
case program
|
1153
|
+
when 'wu-cat'
|
1154
|
+
settings.define(:input, :description => "The input file to use")
|
1155
|
+
settings.define(:number, :description => "Prepend each input record with a consecutive number", :type => :boolean)
|
1156
|
+
else
|
1157
|
+
# configure other programs if you need to
|
1158
|
+
end
|
1159
|
+
end
|
1160
|
+
|
1161
|
+
# Lets Cat boot up with settings that have already been resolved
|
1162
|
+
# from the command-line or other sources like config files or remote
|
1163
|
+
# servers added by other plugins.
|
1164
|
+
#
|
1165
|
+
# The `root` directory in which the program is executing is also
|
1166
|
+
# provided.
|
1167
|
+
def self.boot settings, root
|
1168
|
+
puts "Cat booting up using resolved settings within directory #{root}"
|
1169
|
+
end
|
1170
|
+
end
|
1171
|
+
```
|
1172
|
+
|
1173
|
+
If your plugin doesn't interact directly with the command-line
|
1174
|
+
(through a wu-tool like `wu-local` or `wu-hadoop`) and doesn't
|
1175
|
+
directly interface with passing records to processors then you can
|
1176
|
+
just require the rest of your plugin's code at this point and be done.
|
1177
|
+
|
1178
|
+
### Write a Runner to interact with the command-line
|
1179
|
+
|
1180
|
+
If you need to implement a new command line tool then you should write
|
1181
|
+
a Runner. A Runner is used to implement Wukong programs like
|
1182
|
+
`wu-local` or `wu-hadoop`. Here's what the actual program file would
|
1183
|
+
look like for our example plugin's `wu-cat` program.
|
1184
|
+
|
1185
|
+
```ruby
|
1186
|
+
#!/usr/bin/env ruby
|
1187
|
+
# in bin/wu-cat
|
1188
|
+
require 'cat'
|
1189
|
+
Cat::Runner.run
|
1190
|
+
```
|
1191
|
+
|
1192
|
+
The Cat::Runner class is implemented separately.
|
1193
|
+
|
1194
|
+
```ruby
|
1195
|
+
# in lib/cat/runner.rb
|
1196
|
+
require_relative('driver')
|
1197
|
+
module Cat
|
1198
|
+
|
1199
|
+
# Implements the `wu-cat` command.
|
1200
|
+
class Runner < Wukong::Runner
|
1201
|
+
|
1202
|
+
usage "PROCESSOR|FLOW"
|
1203
|
+
|
1204
|
+
description <<-EOF
|
1205
|
+
|
1206
|
+
wu-cat lets you run a Wukong processor or dataflow on the
|
1207
|
+
command-line. Try it like this.
|
1208
|
+
|
1209
|
+
$ wu-cat --input=data.txt
|
1210
|
+
hello
|
1211
|
+
my
|
1212
|
+
friend
|
1213
|
+
|
1214
|
+
Connect the output to a processor in upcaser.rb
|
1215
|
+
|
1216
|
+
$ wu-cat --input=data.txt upcaser.rb
|
1217
|
+
HELLO
|
1218
|
+
MY
|
1219
|
+
FRIEND
|
1220
|
+
|
1221
|
+
You can also include add line numbers to the output.
|
1222
|
+
|
1223
|
+
$ wu-cat --number --input=data.txt upcaser.rb
|
1224
|
+
1 HELLO
|
1225
|
+
2 MY
|
1226
|
+
3 FRIEND
|
1227
|
+
EOF
|
1228
|
+
|
1229
|
+
# The name of the processor we're going to run. The #args method
|
1230
|
+
# is provided by the Runner class.
|
1231
|
+
def processor_name
|
1232
|
+
args.first
|
1233
|
+
end
|
1234
|
+
|
1235
|
+
# Validate that we were given the name of a registered processor
|
1236
|
+
# to run. Be careful to return true here or validation will fail.
|
1237
|
+
def validate
|
1238
|
+
raise Wukong::Error.new("Must provide a processor as the first argument") unless processor_name
|
1239
|
+
true
|
1240
|
+
end
|
1241
|
+
|
1242
|
+
# Delegates to a driver class to run the processor.
|
1243
|
+
def run
|
1244
|
+
Driver.new(processor_name, settings).start
|
1245
|
+
end
|
1246
|
+
|
1247
|
+
end
|
1248
|
+
end
|
1249
|
+
```
|
1250
|
+
|
1251
|
+
### Write a Driver to interact with processors
|
1252
|
+
|
1253
|
+
The `Cat::Runner#run` method delegates to the `Cat::Driver` class to
|
1254
|
+
handle instantiating and interacting with processors.
|
1255
|
+
|
1256
|
+
```ruby
|
1257
|
+
# in lib/cat/driver.rb
|
1258
|
+
module Cat
|
1259
|
+
|
1260
|
+
# A class for driving a processor from `wu-cat`.
|
1261
|
+
class Driver
|
1262
|
+
|
1263
|
+
# Lets us count the records.
|
1264
|
+
attr_accessor :number
|
1265
|
+
|
1266
|
+
# Gives methods to construct and interact with dataflows.
|
1267
|
+
include Wukong::DriverMethods
|
1268
|
+
|
1269
|
+
# Create a new Driver for a dataflow with the given `label` using
|
1270
|
+
# the given `settings`.
|
1271
|
+
#
|
1272
|
+
# @param [String] label the name of the dataflow
|
1273
|
+
# @param [Configliere::Param] settings the settings to use when creating the dataflow
|
1274
|
+
def initialize label, settings
|
1275
|
+
self.settings = settings
|
1276
|
+
self.dataflow = construct_dataflow(label, settings)
|
1277
|
+
self.number = 1
|
1278
|
+
end
|
1279
|
+
|
1280
|
+
# The file handle of the input file.
|
1281
|
+
#
|
1282
|
+
# @return [File]
|
1283
|
+
def input_file
|
1284
|
+
@input_file ||= File.new(settings[:input])
|
1285
|
+
end
|
1286
|
+
|
1287
|
+
# Starts feeding records to the processor
|
1288
|
+
def start
|
1289
|
+
while line = input_file.readline rescue nil
|
1290
|
+
driver.send_through_dataflow(line)
|
1291
|
+
end
|
1292
|
+
end
|
1293
|
+
|
1294
|
+
# Process each record that comes back from the dataflow.
|
1295
|
+
#
|
1296
|
+
# @param [Object] record the yielded record
|
1297
|
+
def process record
|
1298
|
+
if settings[:number]
|
1299
|
+
puts [number, record].map(&:to_s).join("\t")
|
1300
|
+
else
|
1301
|
+
puts record.to_s
|
1302
|
+
end
|
1303
|
+
self.number += 1
|
1304
|
+
end
|
1305
|
+
|
1306
|
+
end
|
1307
|
+
end
|
1308
|
+
```
|