ul-wukong 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +60 -0
- data/.gitmodules +6 -0
- data/.rspec +2 -0
- data/.travis.yml +19 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +17 -0
- data/Guardfile +12 -0
- data/LICENSE.md +95 -0
- data/NOTES-travis.md +31 -0
- data/README-old.md +422 -0
- data/README.md +1308 -0
- data/Rakefile +28 -0
- data/TODO.md +99 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +6 -0
- data/bin/md5sort +20 -0
- data/bin/setcat +11 -0
- data/bin/tabchar +5 -0
- data/bin/uniq-ord +59 -0
- data/bin/uniqc +3 -0
- data/bin/wu +34 -0
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-date +13 -0
- data/bin/wu-datetime +13 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +186 -0
- data/bin/wu-local +4 -0
- data/bin/wu-plus +9 -0
- data/bin/wu-source +5 -0
- data/bin/wu-sum +31 -0
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/basic/tiny_count.rb +8 -0
- data/examples/basic/word_count/accumulator.rb +26 -0
- data/examples/basic/word_count/tokenizer.rb +13 -0
- data/examples/basic/word_count/word_count.rb +6 -0
- data/examples/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/deploy_pack/Gemfile +6 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/dsl/dataflow/simple.rb +12 -0
- data/examples/dsl/dataflow/telegram.rb +45 -0
- data/examples/dsl/workflow/cherry_pie.dot +97 -0
- data/examples/dsl/workflow/cherry_pie.md +104 -0
- data/examples/dsl/workflow/cherry_pie.png +0 -0
- data/examples/dsl/workflow/cherry_pie.rb +101 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/examples_helper.rb +9 -0
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/loadable.rb +2 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/examples/munging/airline_flights/airplane.rb +0 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +107 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/rake_helper.rb +97 -0
- data/examples/ruby_project/Gemfile +6 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/splitter.rb +94 -0
- data/examples/string_reverser.rb +7 -0
- data/examples/text/pig_latin/pig_latinizer.rb +35 -0
- data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +36 -0
- data/lib/hanuman/graph.rb +97 -0
- data/lib/hanuman/graphvizzer.rb +206 -0
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +128 -0
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/lib/wu/graph/union_find.rb +62 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/lib/wu/munging.rb +71 -0
- data/lib/wu/social/models/twitter.rb +31 -0
- data/lib/wu/wikipedia/models.rb +20 -0
- data/lib/wukong.rb +54 -0
- data/lib/wukong/dataflow.rb +43 -0
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +214 -0
- data/lib/wukong/driver/event_machine_driver.rb +15 -0
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +42 -0
- data/lib/wukong/local/runner.rb +96 -0
- data/lib/wukong/local/stdio_driver.rb +104 -0
- data/lib/wukong/logger.rb +102 -0
- data/lib/wukong/model/faker.rb +136 -0
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +110 -0
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +169 -0
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers.rb +74 -0
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
- data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
- data/lib/wukong/version.rb +3 -0
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/extract.rb +122 -0
- data/lib/wukong/widget/filters.rb +452 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +10 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +368 -0
- data/lib/wukong/widget/reducers/count.rb +73 -0
- data/lib/wukong/widget/reducers/group.rb +128 -0
- data/lib/wukong/widget/reducers/group_concat.rb +98 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +180 -0
- data/lib/wukong/widget/reducers/uniq.rb +91 -0
- data/lib/wukong/widget/serializers.rb +317 -0
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +7 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
- data/spec/examples/dataflow/parsing_spec.rb +14 -0
- data/spec/examples/dataflow/simple_spec.rb +34 -0
- data/spec/examples/dataflow/telegram_spec.rb +43 -0
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +18 -0
- data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
- data/spec/hanuman/graph_spec.rb +119 -0
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +81 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +43 -0
- data/spec/support/example_test_helpers.rb +95 -0
- data/spec/support/hanuman_test_helpers.rb +92 -0
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +94 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/dataflow_spec.rb +87 -0
- data/spec/wukong/driver_spec.rb +154 -0
- data/spec/wukong/local/runner_spec.rb +29 -0
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/model/faker_spec.rb +132 -0
- data/spec/wukong/processor_spec.rb +21 -0
- data/spec/wukong/runner_spec.rb +132 -0
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/filters_spec.rb +79 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +21 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
- data/spec/wukong/widget/serializers_spec.rb +114 -0
- data/spec/wukong/widget/sink_spec.rb +19 -0
- data/spec/wukong/widget/source_spec.rb +65 -0
- data/spec/wukong/wu-local_spec.rb +109 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +35 -0
- metadata +465 -0
@@ -0,0 +1,29 @@
|
|
1
|
+
module Wukong
  module DocHelpers

    # YARD handler for the Wukong.processor syntax.
    #
    # Registered for method calls named `processor`; each such call is
    # documented as a class whose superclass is Wukong::Processor.
    class ProcessorHandler < YARD::Handlers::Ruby::ClassHandler

      handles method_call(:processor)

      # The code object used as the superclass of every class created
      # by this handler.  Built once and then reused.
      #
      # :nodoc:
      def base_processor_class
        return @base_processor_class if @base_processor_class
        @base_processor_class = YARD::CodeObjects::ClassObject.new(namespace, "Wukong::Processor")
      end

      # Creates a class named after the camelized first parameter of
      # the statement and parses the statement's trailing block with
      # that class as owner and namespace, at class scope.
      #
      # :nodoc:
      def process
        name_node = statement.parameters.first.jump(:tstring_content, :ident)
        klass     = create_class(Gorillib::Inflector.camelize(name_node.source), base_processor_class)
        body      = statement.last.last

        push_state(owner: klass, scope: :class, namespace: klass) { parse_block(body) }
      end

    end
  end
end
|
29
|
+
|
@@ -0,0 +1,214 @@
|
|
1
|
+
require_relative('driver/wiring')
|
2
|
+
|
3
|
+
module Wukong

  # A Driver is a class including the DriverMethods module which
  # connects a Dataflow or Processor to the external world of inputs
  # and outputs.
  #
  # @example Minimal Driver class
  #
  #   class MinimalDriver
  #     include Wukong::DriverMethods
  #     def initialize(label, settings)
  #       construct_dataflow(label, settings)
  #     end
  #     def process record
  #       puts record
  #     end
  #   end
  #
  # The MinimalDriver#send_through_dataflow method can be called on an
  # instance of MinimalDriver with any input record.
  #
  # This record will be passed through the dataflow, starting from its
  # root, and each record yielded at the leaves of the dataflow will
  # be passed to the driver's #process method.
  #
  # The #process method of an implementing driver should *not* yield,
  # unlike the process method of a Processor class.  Instead, it
  # should treat its argument as an output of the dataflow and do
  # something appropriate to the driver (write to file, database,
  # terminal, &c.).
  #
  # Drivers are also responsible for implementing the lifecycle of
  # processors and dataflows they drive.  A more complete version of
  # the above driver class would:
  #
  #   * call the #setup_dataflow method when ready to trigger the
  #     Processor#setup method on each processor in the dataflow
  #
  #   * call the #finalize_dataflow method when indicating that the
  #     dataflow should consider a batch of records complete
  #
  #   * call the #finalize_and_stop_dataflow method to indicate the
  #     last batch of records and to trigger the Processor#stop method
  #     on each processor in the dataflow
  #
  # Driver instances are started by Runners which should delegate to
  # the `start` method of the driver class itself.
  #
  # @see Wukong::Local::StdioDriver for a complete example of a driver.
  # @see Wukong::Local::Runner for an example of how runners call drivers.
  module DriverMethods

    attr_accessor :label
    attr_accessor :settings
    attr_accessor :dataflow

    # Classes including DriverMethods should override this method with
    # some way of handling the `output_record` that is appropriate for
    # the driver.
    #
    # @param [Object] output_record
    # @raise [NotImplementedError] always, unless overridden
    def process(output_record)
      raise NotImplementedError, "Define the #{self.class}#process method to handle output records from the dataflow"
    end

    # Construct a dataflow from the given `label` and `settings`.
    #
    # This method does **not** cause Processor#setup to be called on
    # any of the processors in the dataflow.  Call the #setup_dataflow
    # method to explicitly have setup occur.  This distinction is
    # useful for drivers which themselves need to do complex
    # initialization before letting processors in the dataflow
    # initialize.
    #
    # @param [Symbol] label the name of the dataflow (or processor) to build
    # @param [Hash] settings
    # @option settings [String] :to Serialize all output via the named serializer (json, tsv)
    # @option settings [String] :from Deserialize all input via the named deserializer (json, tsv)
    # @option settings [String] :as Recordize each input as instances of the given class
    # @raise [Wukong::Error] if `label` or a requested (de)serializer is not registered
    # @return [Wukong::Dataflow] the newly built dataflow
    #
    # @see #setup_dataflow
    def construct_dataflow(label, settings={})
      self.label    = label
      self.settings = settings
      # Each prepend becomes the new root, so :recordize is added
      # first and therefore runs *after* the deserializer.
      prepend(:recordize)                 if settings[:as]
      prepend(:"from_#{settings[:from]}") if settings[:from]
      append(:"to_#{settings[:to]}")      if settings[:to]
      build_dataflow
    end

    # Set up this driver.  Called before setting up any of the
    # dataflow stages.  A no-op unless overridden.
    def setup
    end

    # Calls #setup on this driver, then walks the dataflow and calls
    # Processor#setup on each of the processors.
    def setup_dataflow
      setup
      dataflow.each_stage do |stage|
        stage.setup
      end
    end

    # Send the given `record` through the dataflow, starting at its
    # root.
    #
    # @param [Object] record
    def send_through_dataflow(record)
      wiring.start_with(dataflow.root).call(record)
    end

    # Perform finalization code for this driver.  Runs after #setup
    # and before #stop.  A no-op unless overridden.
    def finalize
    end

    # Indicate a full batch of records has already been sent through
    # and any batch-oriented or accumulative operations should trigger
    # (e.g. - counting).
    #
    # Calls #finalize on this driver, then walks the dataflow calling
    # Processor#finalize on each processor.  Records a processor
    # yields while finalizing continue through the stages ahead of it.
    #
    # On the *last* batch, the #finalize_and_stop_dataflow method
    # should be called instead.
    #
    # @see #finalize_and_stop_dataflow
    def finalize_dataflow
      finalize
      dataflow.each_stage do |stage|
        stage.finalize(&wiring.advance(stage))
      end
    end

    # Works similar to #finalize_dataflow but calls Processor#stop on
    # each processor right after calling Processor#finalize on it, and
    # finally calls #stop on this driver.
    def finalize_and_stop_dataflow
      finalize
      dataflow.each_stage do |stage|
        stage.finalize(&wiring.advance(stage))
        stage.stop
      end
      stop
    end

    # Perform shutdown code for this driver.  Called after #finalize
    # and after all stages have been finalized and stopped.  A no-op
    # unless overridden.
    def stop
    end

    protected

    # The builder for this driver's `label`, either for a Processor or
    # a Dataflow.  Looked up in the Wukong registry once and memoized.
    #
    # @raise [Wukong::Error] if nothing is registered under `label`
    # @return [Wukong::ProcessorBuilder, Wukong::DataflowBuilder]
    def builder
      return @builder if @builder
      raise Wukong::Error, "could not find definition for <#{label}>" unless Wukong.registry.registered?(label.to_sym)
      @builder = Wukong.registry.retrieve(label.to_sym)
    end

    # Return the builder for this driver's dataflow.
    #
    # Even if a Processor was originally named by this driver's
    # `label`, a DataflowBuilder will be returned here.  The
    # DataflowBuilder is itself built from just the ProcessorBuilder
    # alone.
    #
    # @return [Wukong::DataflowBuilder]
    # @see #builder
    def dataflow_builder
      @dataflow_builder ||=
        if builder.is_a?(DataflowBuilder)
          builder
        else
          Wukong::DataflowBuilder.receive(for_class: Class.new(Wukong::Dataflow), stages: { label.to_sym => builder })
        end
    end

    # Build the dataflow using the #dataflow_builder and the supplied
    # `settings`, and store it as this driver's `dataflow`.
    #
    # @return [Wukong::Dataflow]
    def build_dataflow
      self.dataflow = dataflow_builder.build(settings)
    end

    # Add the processor with the given `new_label` in front of this
    # driver's dataflow, making it into the new root of the dataflow.
    #
    # @param [Symbol] new_label
    # @raise [Wukong::Error] if `new_label` is not registered
    def prepend(new_label)
      raise Wukong::Error, "could not find processor <#{new_label}> to prepend" unless Wukong.registry.registered?(new_label)
      dataflow_builder.prepend(Wukong.registry.retrieve(new_label))
    end

    # Add the processor with the given `new_label` at the end of each
    # of this driver's dataflow's leaves.
    #
    # @param [Symbol] new_label
    # @raise [Wukong::Error] if `new_label` is not registered
    def append(new_label)
      raise Wukong::Error, "could not find processor <#{new_label}> to append" unless Wukong.registry.registered?(new_label)
      dataflow_builder.append(Wukong.registry.retrieve(new_label))
    end

    # Returns the underlying Wiring object that will coordinate
    # transfer of records from the driver to the dataflow and back to
    # the driver.
    #
    # The Wiring is memoized, but only for as long as it refers to the
    # driver's current `dataflow`: if the dataflow has been assigned or
    # rebuilt since the Wiring was created, a fresh one is made so that
    # records never get routed through a stale (or nil) dataflow.
    #
    # @return [Wiring]
    def wiring
      @wiring = Wiring.new(self, dataflow) unless @wiring && @wiring.dataflow.equal?(dataflow)
      @wiring
    end

  end

end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Wukong
|
2
|
+
module EventMachineDriver
|
3
|
+
include DriverMethods
|
4
|
+
|
5
|
+
def self.included klass
  # Give every including class a class-level .add_signal_traps which
  # installs handlers for INT and TERM.  Each handler logs through the
  # class's own `log` and then stops the EventMachine reactor.
  klass.define_singleton_method(:add_signal_traps) do
    Signal.trap('INT') do
      log.info 'Received SIGINT. Stopping.'
      EM.stop
    end
    Signal.trap('TERM') do
      log.info 'Received SIGTERM. Stopping.'
      EM.stop
    end
  end
end
|
13
|
+
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Wukong

  # Walks a dataflow on behalf of a driver by handing each stage a
  # block that forwards whatever the stage yields to the stages ahead
  # of it, and finally to the driver itself.
  class Wiring

    # The driver instance whose #process method receives the records
    # that come out of the leaves of the dataflow.
    attr_accessor :driver

    # The dataflow being wired.
    attr_accessor :dataflow

    # Construct a new Wiring for the given `driver` and `dataflow`.
    #
    # @param [#process] driver
    # @param [Wukong::Dataflow] dataflow
    def initialize(driver, dataflow)
      @driver, @dataflow = driver, dataflow
    end

    # Return a proc which, if called with a record, will process that
    # record through each of the given `stages` as well as through the
    # rest of the dataflow ahead of them.
    #
    # @param [Array<Wukong::Stage>] stages
    # @return [Proc]
    def start_with(*stages)
      # Fix the first argument (the stages); the returned proc still
      # expects the record.
      to_proc.curry[stages]
    end

    # Return a proc (the output of #start_with) which will process
    # records through the stages that are ahead of the given stage.
    #
    # @param [Wukong::Stage] stage
    # @return [Proc]
    #
    # @see #start_with
    def advance(stage)
      # Past the driver (the last "stage") there is nothing left to
      # run, so hand back a proc over no stages at all.
      return start_with if stage.nil? || stage == driver

      downstream = dataflow.descendents(stage)
      # A stage without descendents is a leaf: its output goes to the
      # driver.  Otherwise keep walking down the tree.
      downstream.empty? ? start_with(driver) : start_with(*downstream)
    end

    # The (memoized) two-argument proc behind #start_with: given a
    # list of stages and a record, it calls #process on every non-nil
    # stage with a block that advances past that stage.
    #
    # :nodoc:
    def to_proc
      @wiring ||= proc do |stages, record|
        stages.each do |stage|
          next unless stage
          stage.process(record, &advance(stage))
        end
      end
    end
  end
end
|
data/lib/wukong/local.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
module Wukong
|
2
|
+
|
3
|
+
# Provides methods for supporting the running of Wukong processors
|
4
|
+
# and dataflows entirely locally, without any frameworks like Hadoop
|
5
|
+
# or Storm.
|
6
|
+
#
|
7
|
+
# This module is actually a plugin for Wukong.
|
8
|
+
module Local
|
9
|
+
include Plugin
|
10
|
+
|
11
|
+
# Configures the given +settings+ object with all settings
|
12
|
+
# specific to Wukong::Local for the given +program+ name.
|
13
|
+
#
|
14
|
+
# @param [Configliere::Param] settings the settings to configure
|
15
|
+
# @param [String] program the name of the currently executing program
|
16
|
+
def self.configure settings, program
  # Only the two programs below get extra settings; for any other
  # program name nothing is defined and nil is returned.
  case program
  when 'wu-local'
    settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of first argument", flag: 'r'

    settings.define :from, description: "Parse input from given data format (json, tsv, &c.) before processing"
    # --to acts on what the dataflow emits, so the help text talks
    # about output (it used to say "input", mirroring --from).
    settings.define :to, description: "Convert output to given data format (json, tsv, &c.) before emitting"
    settings.define :as, description: "Call Class.receive on each input (will run after --from)", type: Class
  when 'wu-source'
    settings.define :per_sec, description: "Number of events produced per second", type: Float
    settings.define :period, description: "Number of seconds between events (overrides --per_sec)", type: Float
    settings.define :batch_size, description: "Trigger a finalize across the dataflow each time this many records are processed", type: Integer
  end
end
|
30
|
+
|
31
|
+
# Boots Wukong::Local using the given +settings+ at the given
|
32
|
+
# +root+.
|
33
|
+
#
|
34
|
+
# @param [Configliere::Param] settings the settings to use to boot
|
35
|
+
# @param [String] root the root directory to boot in
|
36
|
+
def self.boot(settings, root)
  # No boot-time work is performed here; both arguments are unused.
  nil
end
|
38
|
+
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
require_relative('local/runner')
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require_relative 'stdio_driver'
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
module Local
|
5
|
+
|
6
|
+
# Implements the Runner for wu-local.
|
7
|
+
class LocalRunner < Wukong::Runner
|
8
|
+
|
9
|
+
include Wukong::Logging
|
10
|
+
|
11
|
+
usage "PROCESSOR|FLOW"
|
12
|
+
|
13
|
+
description <<-EOF.gsub(/^ {8}/, '')
|
14
|
+
wu-local is a tool for running Wukong processors and flows locally on
|
15
|
+
the command-line. Use wu-local by passing it a processor and feeding
|
16
|
+
in some data:
|
17
|
+
|
18
|
+
$ echo 'UNIX is Clever and Fun...' | wu-local tokenizer.rb
|
19
|
+
UNIX
|
20
|
+
is
|
21
|
+
Clever
|
22
|
+
and
|
23
|
+
Fun
|
24
|
+
|
25
|
+
If your processors have named fields you can pass them in as
|
26
|
+
arguments:
|
27
|
+
|
28
|
+
$ echo 'UNIX is clever and fun...' | wu-local tokenizer.rb --min_length=4
|
29
|
+
UNIX
|
30
|
+
Clever
|
31
|
+
|
32
|
+
You can chain processors and calls to wu-local together:
|
33
|
+
|
34
|
+
$ echo 'UNIX is clever and fun...' | wu-local tokenizer.rb --min_length=4 | wu-local downcaser.rb
|
35
|
+
unix
|
36
|
+
clever
|
37
|
+
|
38
|
+
Which is a good way to develop a combined data flow which you can
|
39
|
+
again test locally:
|
40
|
+
|
41
|
+
$ echo 'UNIX is clever and fun...' | wu-local tokenize_and_downcase_big_words.rb
|
42
|
+
unix
|
43
|
+
clever
|
44
|
+
EOF
|
45
|
+
|
46
|
+
# Returns the name of the dataflow we're going to run.
|
47
|
+
#
|
48
|
+
# @return [String]
|
49
|
+
def dataflow
  first_arg = args.first
  stem      = File.basename(first_arg.to_s, '.rb')

  # An explicit --run setting always wins.
  chosen = settings[:run]
  return chosen if chosen

  # A first argument naming an existing file stands for the
  # processor or dataflow called like that file (minus ".rb").
  return stem if first_arg && File.exist?(first_arg)

  # Otherwise the argument (possibly nil) is taken as the name itself.
  first_arg
end
|
59
|
+
alias_method :processor, :dataflow
|
60
|
+
|
61
|
+
# Validates the chosen processor.
|
62
|
+
#
|
63
|
+
# @raise [Wukong::Error] if it finds a problem
|
64
|
+
# @return [true]
|
65
|
+
def validate
  name = dataflow
  if name.nil? || name.empty?
    raise Error, "Must provide a processor or dataflow to run, via either the --run option or as the first argument"
  end
  # A name was given, but it also has to be known to the registry.
  raise Error, "No such processor or dataflow <#{name}>" unless registered?(name)
  true
end
|
70
|
+
|
71
|
+
# Adds a customized help message built from the Processor
|
72
|
+
# itself.
|
73
|
+
def setup
  super()
  # Only a registered processor or dataflow has a class that can
  # contribute its own settings.
  return unless registered?(dataflow)
  dataflow_class_for(dataflow).configure(settings)
end
|
77
|
+
|
78
|
+
# Starts up the driver with the right dataflow and settings.
|
79
|
+
#
|
80
|
+
# Starts the EventMachine reactor before starting the driver.
|
81
|
+
def run
  # The driver is started from inside the EventMachine reactor block.
  EM.run { driver.start(dataflow, settings) }
end
|
86
|
+
|
87
|
+
# The driver class used to run the dataflow.
|
88
|
+
#
|
89
|
+
# @return [Class, #start]
|
90
|
+
def driver
  # Always the same class; #run calls `start` on whatever is returned here.
  StdioDriver
end
|
93
|
+
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|