ul-wukong 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,29 @@
1
+ module Wukong
2
+ module DocHelpers
3
+
4
+ # Handles the Wukong.processor syntax.
5
+ class ProcessorHandler < YARD::Handlers::Ruby::ClassHandler
6
+
7
+ handles method_call(:processor)
8
+
9
+ # :nodoc:
10
+ def base_processor_class
11
+ @base_processor_class ||= YARD::CodeObjects::ClassObject.new(namespace, "Wukong::Processor")
12
+ end
13
+
14
+ # :nodoc:
15
+ def process
16
+ processor_name = statement.parameters.first.jump(:tstring_content, :ident).source
17
+ class_name = Gorillib::Inflector.camelize(processor_name)
18
+ processor_class = create_class(class_name, base_processor_class)
19
+ processor_body = statement.last.last
20
+
21
+ push_state(:owner => processor_class, :scope => :class, :namespace => processor_class) do
22
+ parse_block(processor_body)
23
+ end
24
+ end
25
+
26
+ end
27
+ end
28
+ end
29
+
@@ -0,0 +1,214 @@
1
+ require_relative('driver/wiring')
2
+
3
+ module Wukong
4
+
5
+ # A Driver is a class including the DriverMethods module which
6
+ # connects a Dataflow or Processor to the external world of inputs
7
+ # and outputs.
8
+ #
9
+ # @example Minimal Driver class
10
+ #
11
+ # class MinimalDriver
12
+ # include Wukong::DriverMethods
13
+ # def initialize(label, settings)
14
+ # construct_dataflow(label, settings)
15
+ # end
16
+ # def process record
17
+ # puts record
18
+ # end
19
+ # end
20
+ #
21
+ # The MinimalDriver#send_through_dataflow method can be called on an
22
+ # instance of MinimalDriver with any input record.
23
+ #
24
+ # This record will be passed through the dataflow, starting from its
25
+ # root, and each record yielded at the leaves of the dataflow will
26
+ # be passed to the driver's #process method.
27
+ #
28
+ # The #process method of an implementing driver should *not* yield,
29
+ # unlike the process method of a Processor class. Instead, it
30
+ # should treat its argument as an output of the dataflow and do
31
+ # something appropriate to the driver (write to file, database,
32
+ # terminal, &c.).
33
+ #
34
+ # Drivers are also responsible for implementing the lifecycle of
35
+ # processors and dataflows they drive. A more complete version of
36
+ # the above driver class would:
37
+ #
38
+ # * call the #setup_dataflow method when ready to trigger the
39
+ # Processor#setup method on each processor in the dataflow
40
+ #
41
+ # * call the #finalize_dataflow method when indicating that the
42
+ # dataflow should consider a batch of records complete
43
+ #
44
+ # * call the #finalize_and_stop_dataflow method to indicate the
45
+ # last batch of records and to trigger the Processor#stop method
46
+ # on each processor in the dataflow
47
+ #
48
+ # Driver instances are started by Runners which should delegate to
49
+ # the `start` method of the driver class itself.
50
+ #
51
+ # @see Wukong::Local::StdioDriver for a complete example of a driver.
52
+ # @see Wukong::Local::LocalRunner for an example of how runners call drivers.
53
+ module DriverMethods
54
+
55
+ attr_accessor :label
56
+ attr_accessor :settings
57
+ attr_accessor :dataflow
58
+
59
+ # Classes including DriverMethods should override this method with
60
+ # some way of handling the `output_record` that is appropriate for
61
+ # the driver.
62
+ #
63
+ # @param [Object] output_record
64
+ def process output_record
65
+ raise NotImplementedError.new("Define the #{self.class}#process method to handle output records from the dataflow")
66
+ end
67
+
68
+ # Construct a dataflow from the given `label` and `settings`.
69
+ #
70
+ # This method does **not** cause Processor#setup to be called on
71
+ # any of the processors in the dataflow. Call the #setup_dataflow
72
+ # method to explicitly have setup occur. This distinction is
73
+ # useful for drivers which themselves need to do complex
74
+ # initialization before letting processors in the dataflow
75
+ # initialize.
76
+ #
77
+ # @param [Symbol] label the name of the dataflow (or processor) to build
78
+ # @param [Hash] settings
79
+ # @param settings [String] :to Serialize all output via the named serializer (json, tsv)
80
+ # @param settings [String] :from Deserialize all input via the named deserializer (json, tsv)
81
+ # @param settings [String] :as Recordize each input as instances of the given class
82
+ #
83
+ # @see #setup_dataflow
84
+ def construct_dataflow(label, settings={})
85
+ self.label = label
86
+ self.settings = settings
87
+ prepend(:recordize) if settings[:as]
88
+ prepend("from_#{settings[:from]}".to_sym) if settings[:from]
89
+ append("to_#{settings[:to]}".to_sym) if settings[:to]
90
+ build_dataflow
91
+ end
92
+
93
+ # Set up this driver. Called before setting up any of the
94
+ # dataflow stages.
95
+ def setup
96
+ end
97
+
98
+ # Walks the dataflow and calls Processor#setup on each of the
99
+ # processors.
100
+ def setup_dataflow
101
+ setup
102
+ dataflow.each_stage do |stage|
103
+ stage.setup
104
+ end
105
+ end
106
+
107
+ # Send the given `record` through the dataflow.
108
+ #
109
+ # @param [Object] record
110
+ def send_through_dataflow(record)
111
+ wiring.start_with(dataflow.root).call(record)
112
+ end
113
+
114
+ # Perform finalization code for this driver. Runs after #setup
115
+ # and before #stop.
116
+ def finalize
117
+ end
118
+
119
+ # Indicate a full batch of records has already been sent through
120
+ # and any batch-oriented or accumulative operations should trigger
121
+ # (e.g. - counting).
122
+ #
123
+ # Walks the dataflow calling Processor#finalize on each processor.
124
+ #
125
+ # On the *last* batch, the #finalize_and_stop_dataflow method
126
+ # should be called instead.
127
+ #
128
+ # @see #finalize_and_stop_dataflow
129
+ def finalize_dataflow
130
+ finalize
131
+ dataflow.each_stage do |stage|
132
+ stage.finalize(&wiring.advance(stage))
133
+ end
134
+ end
135
+
136
+ # Works similarly to #finalize_dataflow but calls Processor#stop
137
+ # after calling Processor#finalize on each processor.
138
+ def finalize_and_stop_dataflow
139
+ finalize
140
+ dataflow.each_stage do |stage|
141
+ stage.finalize(&wiring.advance(stage))
142
+ stage.stop
143
+ end
144
+ stop
145
+ end
146
+
147
+ # Perform shutdown code for this driver. Called after #finalize
148
+ # and after all stages have been finalized and stopped.
149
+ def stop
150
+ end
151
+
152
+ protected
153
+
154
+ # The builder for this driver's `label`, either for a Processor or
155
+ # a Dataflow.
156
+ #
157
+ # @return [Wukong::ProcessorBuilder, Wukong::DataflowBuilder]
158
+ def builder
159
+ return @builder if @builder
160
+ raise Wukong::Error.new("could not find definition for <#{label}>") unless Wukong.registry.registered?(label.to_sym)
161
+ @builder = Wukong.registry.retrieve(label.to_sym)
162
+ end
163
+
164
+ # Return the builder for this driver's dataflow.
165
+ #
166
+ # Even if a Processor was originally named by this driver's
167
+ # `label`, a DataflowBuilder will be returned here. The
168
+ # DataflowBuilder is itself built from just the ProcessorBuilder
169
+ # alone.
170
+ #
171
+ # @return [Wukong::DataflowBuilder]
172
+ # @see #builder
173
+ def dataflow_builder
174
+ @dataflow_builder ||= (builder.is_a?(DataflowBuilder) ? builder : Wukong::DataflowBuilder.receive(for_class: Class.new(Wukong::Dataflow), stages: {label.to_sym => builder}))
175
+ end
176
+
177
+ # Build the dataflow using the #dataflow_builder and the supplied
178
+ # `settings`.
179
+ #
180
+ # @return [Wukong::Dataflow]
181
+ def build_dataflow
182
+ self.dataflow = dataflow_builder.build(settings)
183
+ end
184
+
185
+ # Add the processor with the given `new_label` in front of this
186
+ # driver's dataflow, making it into the new root of the dataflow.
187
+ #
188
+ # @param [Symbol] new_label
189
+ def prepend new_label
190
+ raise Wukong::Error.new("could not find processor <#{new_label}> to prepend") unless Wukong.registry.registered?(new_label)
191
+ dataflow_builder.prepend(Wukong.registry.retrieve(new_label))
192
+ end
193
+
194
+ # Add the processor with the given `new_label` at the end of each
195
+ # of this driver's dataflow's leaves.
196
+ #
197
+ # @param [Symbol] new_label
198
+ def append new_label
199
+ raise Wukong::Error.new("could not find processor <#{new_label}> to append") unless Wukong.registry.registered?(new_label)
200
+ dataflow_builder.append(Wukong.registry.retrieve(new_label))
201
+ end
202
+
203
+ # Returns the underlying Wiring object that will coordinate
204
+ # transfer of records from the driver to the dataflow and back to
205
+ # the driver.
206
+ #
207
+ # @return [Wiring]
208
+ def wiring
209
+ @wiring ||= Wiring.new(self, dataflow)
210
+ end
211
+
212
+ end
213
+
214
+ end
@@ -0,0 +1,15 @@
1
+ module Wukong
2
+ module EventMachineDriver
3
+ include DriverMethods
4
+
5
+ def self.included klass
6
+ klass.class_eval do
7
+ def self.add_signal_traps
8
+ Signal.trap('INT') { log.info 'Received SIGINT. Stopping.' ; EM.stop }
9
+ Signal.trap('TERM') { log.info 'Received SIGTERM. Stopping.' ; EM.stop }
10
+ end
11
+ end
12
+ end
13
+
14
+ end
15
+ end
@@ -0,0 +1,68 @@
1
+ module Wukong
2
+
3
+ # Provides a very Ruby-minded way of walking a dataflow connected to
4
+ # a driver.
5
+ class Wiring
6
+
7
+ # The driver instance that likely calls the #start_with method and
8
+ # provides a #process method to be called by this wiring.
9
+ attr_accessor :driver
10
+
11
+ # The dataflow being wired.
12
+ attr_accessor :dataflow
13
+
14
+ # Construct a new Wiring for the given `driver` and `dataflow`.
15
+ #
16
+ # @param [#process] driver
17
+ # @param [Wukong::Dataflow] dataflow
18
+ def initialize(driver, dataflow)
19
+ @driver = driver
20
+ @dataflow = dataflow
21
+ end
22
+
23
+ # Return a proc which, if called with a record, will process that
24
+ # record through each of the given `stages` as well as through the
25
+ # rest of the dataflow ahead of them.
26
+ #
27
+ # @param [Array<Wukong::Stage>] stages
28
+ # @return [Proc]
29
+ def start_with(*stages)
30
+ to_proc.curry.call(stages)
31
+ end
32
+
33
+ # Return a proc (the output of #start_with) which will process
34
+ # records through the stages that are ahead of the given stage.
35
+ #
36
+ # @param [Wukong::Stage] stage
37
+ # @return [Proc]
38
+ #
39
+ # @see #start_with
40
+ def advance(stage)
41
+ # This is where the tree of procs will terminate, but only after
42
+ # having passed all output records through the driver -- the
43
+ # last "stage".
44
+ return start_with() if stage.nil? || stage == driver
45
+
46
+ # Otherwise we're still in the middle of the tree...
47
+ descendents = dataflow.descendents(stage)
48
+ if descendents.empty?
49
+ # No descendents means we've reached a leaf of the tree, so
50
+ # we'll run records through the driver to generate output.
51
+ start_with(driver)
52
+ else
53
+ # Otherwise continue down the tree of procs...
54
+ start_with(*descendents)
55
+ end
56
+ end
57
+
58
+ # :nodoc:
59
+ def to_proc
60
+ return @wiring if @wiring
61
+ @wiring = Proc.new do |stages, record|
62
+ stages.each do |stage|
63
+ stage.process(record, &advance(stage)) if stage
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,42 @@
1
+ module Wukong
2
+
3
+ # Provides methods for supporting the running of Wukong processors
4
+ # and dataflows entirely locally, without any frameworks like Hadoop
5
+ # or Storm.
6
+ #
7
+ # This module is actually a plugin for Wukong.
8
+ module Local
9
+ include Plugin
10
+
11
+ # Configures the given +settings+ object with all settings
12
+ # specific to Wukong::Local for the given +program+ name.
13
+ #
14
+ # @param [Configliere::Param] settings the settings to configure
15
+ # @param [String] program the name of the currently executing program
16
+ def self.configure settings, program
17
+ case program
18
+ when 'wu-local'
19
+ settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of first argument", flag: 'r'
20
+
21
+ settings.define :from, description: "Parse input from given data format (json, tsv, &c.) before processing"
22
+ settings.define :to, description: "Convert input to given data format (json, tsv, &c.) before emitting"
23
+ settings.define :as, description: "Call Class.receive on each input (will run after --from)", type: Class
24
+ when 'wu-source'
25
+ settings.define :per_sec, description: "Number of events produced per second", type: Float
26
+ settings.define :period, description: "Number of seconds between events (overrides --per_sec)", type: Float
27
+ settings.define :batch_size, description: "Trigger a finalize across the dataflow each time this many records are processed", type: Integer
28
+ end
29
+ end
30
+
31
+ # Boots Wukong::Local using the given +settings+ at the given
32
+ # +root+.
33
+ #
34
+ # @param [Configliere::Param] settings the settings to use to boot
35
+ # @param [String] root the root directory to boot in
36
+ def self.boot(settings, root)
37
+ end
38
+
39
+ end
40
+ end
41
+
42
+ require_relative('local/runner')
@@ -0,0 +1,96 @@
1
+ require_relative 'stdio_driver'
2
+
3
+ module Wukong
4
+ module Local
5
+
6
+ # Implements the Runner for wu-local.
7
+ class LocalRunner < Wukong::Runner
8
+
9
+ include Wukong::Logging
10
+
11
+ usage "PROCESSOR|FLOW"
12
+
13
+ description <<-EOF.gsub(/^ {8}/, '')
14
+ wu-local is a tool for running Wukong processors and flows locally on
15
+ the command-line. Use wu-local by passing it a processor and feeding
16
+ in some data:
17
+
18
+ $ echo 'UNIX is Clever and Fun...' | wu-local tokenizer.rb
19
+ UNIX
20
+ is
21
+ Clever
22
+ and
23
+ Fun
24
+
25
+ If your processors have named fields you can pass them in as
26
+ arguments:
27
+
28
+ $ echo 'UNIX is clever and fun...' | wu-local tokenizer.rb --min_length=4
29
+ UNIX
30
+ Clever
31
+
32
+ You can chain processors and calls to wu-local together:
33
+
34
+ $ echo 'UNIX is clever and fun...' | wu-local tokenizer.rb --min_length=4 | wu-local downcaser.rb
35
+ unix
36
+ clever
37
+
38
+ Which is a good way to develop a combined data flow which you can
39
+ again test locally:
40
+
41
+ $ echo 'UNIX is clever and fun...' | wu-local tokenize_and_downcase_big_words.rb
42
+ unix
43
+ clever
44
+ EOF
45
+
46
+ # Returns the name of the dataflow we're going to run.
47
+ #
48
+ # @return [String]
49
+ def dataflow
50
+ arg = args.first
51
+ basename = File.basename(arg.to_s, '.rb')
52
+
53
+ case
54
+ when settings[:run] then settings[:run]
55
+ when arg && File.exist?(arg) then basename
56
+ else arg
57
+ end
58
+ end
59
+ alias_method :processor, :dataflow
60
+
61
+ # Validates the chosen processor.
62
+ #
63
+ # @raise [Wukong::Error] if it finds a problem
64
+ # @return [true]
65
+ def validate
66
+ raise Error.new("Must provide a processor or dataflow to run, via either the --run option or as the first argument") if dataflow.nil? || dataflow.empty?
67
+ raise Error.new("No such processor or dataflow <#{dataflow}>") unless registered?(dataflow)
68
+ true
69
+ end
70
+
71
+ # Adds a customized help message built from the Processor
72
+ # itself.
73
+ def setup
74
+ super()
75
+ dataflow_class_for(dataflow).configure(settings) if registered?(dataflow)
76
+ end
77
+
78
+ # Starts up the driver with the right dataflow and settings.
79
+ #
80
+ # Starts the EventMachine reactor before starting the driver.
81
+ def run
82
+ EM.run do
83
+ driver.start(dataflow, settings)
84
+ end
85
+ end
86
+
87
+ # The driver class (StdioDriver) that #run starts.
88
+ #
89
+ # @return [Class, #start]
90
+ def driver
91
+ StdioDriver
92
+ end
93
+
94
+ end
95
+ end
96
+ end