ul-wukong 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,56 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # A widget that will log all incoming records.
5
+ #
6
+ # @example Logging records from the command line
7
+ #
8
+ # $ cat input
9
+ # 1
10
+ # 2
11
+ # 3
12
+ # $ cat input | wu-local logger
13
+ # 2012-11-28 18:20:46 [INFO] Logger: 1
14
+ # 2012-11-28 18:20:46 [INFO] Logger: 2
15
+ # 2012-11-28 18:20:46 [INFO] Logger: 3
16
+ #
17
+ # @example Logging records within a dataflow
18
+ #
19
+ # Wukong.dataflow(:uses_logger) do
20
+ # ... | logger
21
+ # end
22
+ class Logger < Processor
23
+ field :level, Symbol, :default => :info, :doc => "Log level priority"
24
+
25
+ description <<EOF
26
+ This processor passes all input records unmodified, making a log
27
+ statement on each one.
28
+
29
+ $ cat input
30
+ 1
31
+ 2
32
+ 3
33
+ $ cat input | wu-local logger
34
+ INFO 2013-01-04 17:10:59 [Logger ] -- 1
35
+ INFO 2013-01-04 17:10:59 [Logger ] -- 2
36
+ INFO 2013-01-04 17:10:59 [Logger ] -- 3
37
+
38
+ You can set the priority level of the log messages with the --level
39
+ flag.
40
+
41
+ $ cat input | wu-local logger --level=debug
42
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 1
43
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 2
44
+ DEBUG 2013-01-04 17:10:59 [Logger ] -- 3
45
+ EOF
46
+
47
# Process a given `record` by logging it at the configured `level`.
#
# The level is checked against the known severities before dispatch
# so that a stray `--level` value can never be used to invoke an
# arbitrary method (e.g. `system` or `instance_eval`) on the logger
# with the record as its argument.
#
# @param [Object] record
# @raise [ArgumentError] if `level` is not a known log severity
def process(record)
  severity = level.to_sym
  unless [:debug, :info, :warn, :error, :fatal, :unknown].include?(severity)
    raise ArgumentError, "Unknown log level: #{level.inspect}"
  end
  # public_send (not send) so private Kernel methods stay unreachable.
  log.public_send(severity, record)
end
53
+ register
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,82 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # Yield the result of this processor's action for each input
5
+ # record.
6
+ #
7
+ # @example Apply a function (like a parser) to each record
8
+ #
9
+ # Wukong.dataflow(:parser) do
10
+ # ... | map { |string| MyParser.parse(string) } | ...
11
+ # end
12
+ #
13
+ # @example Succintly map between objects
14
+ #
15
+ # Wukong.dataflow(:converter) do
16
+ # ... | my_book_parser | map(&:author) | author_processor | ...
17
+ # end
18
+ #
19
+ # Can also be called with the :compact option which will check if
20
+ # the result of the action is non falsy before yielding.
21
+ #
22
+ # @example Mapping but only if it exists
23
+ #
24
+ # Wukong.dataflow(:converter_and_trimmer) do
25
+ # ... | my_book_parser | map(compact: true, &:author) | processor_that_needs_an_author | ...
26
+ # end
27
+ class Map < Processor
28
+
29
+ field :compact, :boolean, default: false
30
+
31
# Runs #perform_action on `input_record` and yields whatever it
# returns.
#
# When the `compact` field is set, a falsy result (nil or false) is
# dropped instead of being yielded.
#
# @param [Object] input_record
# @yield [output_record] skipped for falsy results when compact
# @yieldparam [Object] output_record the result of #perform_action
#
# @see Flatten
def process(input_record)
  output_record = perform_action(input_record)
  return if compact && !output_record
  yield output_record
end
50
+ register
51
+ end
52
+
53
+ # If an input record defines the #each method then yield each of
54
+ # its records. Otherwise yield the input record.
55
+ #
56
+ # @example Turning one record into many
57
+ #
58
+ # Wukong.dataflow(:authors_to_books) do
59
+ # ... | author_parser | map(&:books) | flatten | book_processor | ...
60
+ # end
61
+ #
62
+ # @see Map
63
+ class Flatten < Processor
64
+
65
# Yields every element of `input_record` when it responds to #each;
# any other record is yielded as-is.
#
# @param [Object] input_record
# @yield [output_record]
# @yieldparam [Object] output_record
def process(input_record)
  return yield(input_record) unless input_record.respond_to?(:each)
  input_record.each { |element| yield(element) }
end
78
+ register
79
+ end
80
+
81
+ end
82
+ end
@@ -0,0 +1,10 @@
1
+ require_relative("reducers/accumulator")
2
+ require_relative("reducers/improver")
3
+ require_relative("reducers/sort")
4
+ require_relative("reducers/count")
5
+ require_relative("reducers/group")
6
+ require_relative("reducers/group_concat")
7
+ require_relative("reducers/moments")
8
+ require_relative("reducers/bin")
9
+ require_relative("reducers/uniq")
10
+ require_relative("reducers/join_xml")
@@ -0,0 +1,73 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # A base widget for building more complex accumulative widgets.
5
+ class Accumulator < Processor
6
+
7
+ # The current key used to define the current group being
8
+ # accumulated.
9
+ attr_accessor :key
10
+
11
+ # The current group of records.
12
+ attr_accessor :group
13
+
14
# Prepares the accumulator: installs a sentinel key (a value that is
# unlikely to appear in real data) and then calls `#start` with no
# record.
def setup
  self.key = :__first_group__
  start(nil)
end
21
+
22
# Processes the `record`.
#
# The record's key (see `#get_key`) is compared with the current
# key. When they differ a new group begins: the previous group is
# closed with `finalize` (skipped while the key is still the initial
# sentinel), the new key is stored and `start` is called with the
# record. In every case the record is then handed to `accumulate`.
#
# @param [Object] record
# @yield [finalized_record] each record yielded by `finalize`
# @yieldparam [Object] finalized_record
# @see #accumulate
# @see #finalize
# @see #get_key
# @see #start
def process(record)
  incoming_key = get_key(record)
  if incoming_key != key
    unless key == :__first_group__
      finalize { |finalized_record| yield finalized_record }
    end
    self.key = incoming_key
    start(record)
  end
  accumulate(record)
end
47
+
48
# Hook called when a new group (a new key) begins. A no-op here;
# override it to reset counters, clear caches, &c.
#
# @param [Object] record
def start(record)
end
54
+
55
# Extracts the grouping key from `record`. The default key is the
# record itself.
#
# @param [Object] record
# @return [Object] the record's key
def get_key(record)
  record
end
63
+
64
# Hook that receives each `record` of the current group.
#
# A no-op by default; override it in your own widget.
#
# @param [Object] record
def accumulate(record)
end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,368 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # A widget for binning input data. Will emit one record per bin, containing the bin's lower edge, upper edge, and count.
5
+ #
6
+ #
7
+ #
8
+ # @example Binning some input data on the command-line
9
+ #
10
+ # $ cat input
11
+ # 0.94628
12
+ # 0.03480
13
+ # 0.74418
14
+ # ...
15
+ # $ cat input | wu-local bin --to=tsv
16
+ #
17
+ # 0.02935 0.12638500000000003 7
18
+ # 0.12638500000000003 0.22342000000000004 11
19
+ # 0.22342000000000004 0.32045500000000005 15
20
+ #
21
+ # @example Control how the bins are defined and displayed
22
+ #
23
+ # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --to=tsv
24
+ # 0.0 0.1 10.0
25
+ # 0.1 0.2 12.0
26
+ # 0.2 0.3 8.0
27
+ # ...
28
+ #
29
+ # @example Include an additional column of normalized (fractional) counts
30
+ #
31
+ # $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize --to=tsv
32
+ # 0.0 0.1 10.0 0.3
33
+ # 0.1 0.2 12.0 0.36
34
+ # 0.2 0.3 8.0 0.24
35
+ # ...
36
+ #
37
+ # @example Make a log-log histogram
38
+ #
39
+ # $ cat input | wu-local bin --log_bins --log_counts --to=tsv
40
+ # 1.000 3.162 1.099
41
+ # 3.162 10.000 1.946
42
+ # 10.000 31.623 3.045
43
+ # 31.623 100.000 4.234
44
+ #
45
+ # This widget works nicely with the Extract widget at the end of a
46
+ # data flow:
47
+ #
48
+ # @example Use the bin at the end of a dataflow
49
+ #
50
+ # Wukong.processor(:bins_at_end) do
51
+ # ... | extract(part: 'age') | bin(num_bins: 10) | to_tsv
52
+ # end
53
+ #
54
+ # @see Accumulator
55
+ # @see Extract
56
+ class Bin < Accumulator
57
+
58
+ description <<EOF
59
+ This processor can be used to create a set of bins defining the
60
+ frequency distribution of the input records (or some part of each
61
+ input record).
62
+
63
+ Here's a simple example:
64
+
65
+ $ cat input.dat
66
+ 1
67
+ 2
68
+ 3
69
+ ...
70
+ 100
71
+
72
+ $ cat input.dat | wu-local bin --to=tsv
73
+ 1.000 10.900 10.000
74
+ 10.900 20.800 10.000
75
+ 20.800 30.700 10.000
76
+ 30.700 40.600 10.000
77
+ ...
78
+ 90.100 100.000 10.000
79
+
80
+ By default, all the input values are included and the number of bins
81
+ used corresponds to the square root of the number of input values.
82
+ You can customize the domain for the distribution, the number of bins,
83
+ or the explicit bin edges themselves, via the --min, --max,
84
+ --num_bins, and --edges flags.
85
+
86
+ You can control the display of numbers with the --format_string and
87
+ --precision options.
88
+
89
+ $ cat input.dat | wu-local bin --num_bins=4 --min=0 --max=100 --precision=0 --to=tsv
90
+ 0.0 25 24
91
+ 25 50 25
92
+ 50 75 25
93
+ 75 100 26
94
+
95
+ You can use the --log_bins, --log_counts, and --base options to use
96
+ logarithmically spaced bins or logarithmic counts within each bin to
97
+ the given base.
98
+
99
+ You can also normalize the distribution using the --normalize option.
100
+
101
+ $ cat input.dat | wu-local bin --num_bins=4 --log_bins --normalize --to=tsv
102
+ 1.000 3.162 3.000 0.030
103
+ 3.162 10.000 7.000 0.070
104
+ 10.000 31.623 21.000 0.210
105
+ 31.623 100.000 69.000 0.690
106
+ EOF
107
+
108
+ field :num_bins, Integer, :doc => "Number of bins to use"
109
+ field :edges, Array, :doc => "Number of edges to use"
110
+ field :min, Float, :doc => "Smallest bin starting point"
111
+ field :max, Float, :doc => "Largest bin ending point"
112
+
113
+ field :format_string, String, :doc => "Format string used when printing numerical values"
114
+ field :precision, Integer, :doc => "Precision used when printing numerical values", :default => 3
115
+
116
+ include DynamicGet
117
+ field :by, Whatever, :doc => "Bin the values extracted by this label"
118
+
119
+ field :log_bins, :boolean, :default => false, :doc => "Use logarithmically spaced bins"
120
+ field :log_counts, :boolean, :default => false, :doc => "Use logarithmic bin counts"
121
+ field :base, Float, :default => Math::E, :doc => "Base for logarithms"
122
+
123
+ field :normalize, :boolean, :default => false, :doc => "Normalize bin counts so they sum to 1.0"
124
+
125
+ # The accumulated values
126
+ attr_accessor :values
127
+
128
+ # The bins (pairs of edges)
129
+ attr_accessor :bins
130
+
131
+ # The value counts within each bin.
132
+ attr_accessor :counts
133
+
134
+ # The total number of accumulated values.
135
+ attr_accessor :total_count
136
+
137
# Resets all storage (values, bins, counts, total count) after
# running the parent's setup.
#
# Bins are built right away when that is already possible: from the
# explicit `edges` if given, otherwise from `min`, `max` and
# `num_bins` when all three are present.
def setup
  super()
  self.values, self.bins, self.counts = [], [], []
  self.total_count = 0
  if !edges.nil?
    set_bins_and_counts_from_edges!
  elsif min && max && num_bins
    set_edges_from_min_max_and_num_bins!
  end
end
151
+
152
# Gives every record the same constant key so that, from the
# Accumulator's perspective, all input forms a single group.
#
# @param [Object] record ignored
# @return [:__first__group__]
def get_key(record)
  :__first__group__
end
160
+
161
# Accumulates a single `record`.
#
# The value is extracted from the record; records without a value
# are skipped. If bins already exist the value is counted straight
# into one of them. Otherwise the value is stored for later binning
# and `min` / `max` are widened to include it.
#
# @param [Object] record
def accumulate(record)
  value = value_from(record)
  return unless value
  self.total_count += 1
  return add_to_some_bin(value) if bins?
  self.min = value if !min || value < min
  self.max = value if !max || value > max
  values << value
end
182
+
183
# Emits each bin with its edges and count. Adds the normalized
# count if requested.
#
# Will bin the stored values first if that has not been done on the
# fly already. Emits nothing when there are neither bins nor stored
# values (i.e. no usable input was seen).
#
# Each row is built as a fresh array, so the stored bins are left
# untouched and calling this method again yields the same rows.
#
# @yield [lower, upper, count, normalized_count]
# @yieldparam [String] lower the lower (left) edge of the bin
# @yieldparam [String] upper the upper (right) edge of the bin
# @yieldparam [String] count the (logarithmic if requested) count of values in the bin
# @yieldparam [String] normalized_count the (logarithmic if requested) normalized count of values in the bin if requested
def finalize
  return if !bins? && (values.nil? || values.empty?)
  bin! unless bins?
  counts.each_with_index do |count, index|
    row = bins[index] + [log_count_if_necessary(count)]
    if normalize && total_count > 0
      row << log_count_if_necessary(count.to_f / total_count.to_f)
    end
    yield row.map { |n| format(n) }
  end
end
204
+
205
# Renders `n` as a compact, readable string.
#
# An explicit `format_string` (which should contain a slot for a
# float) always wins. Otherwise zero is printed as '0.0', numbers
# with |n| > 1000 or |n| < 0.001 use scientific notation, and
# everything else is printed as a plain decimal -- both with the
# configured `precision`.
#
# @param [Float] n
# @return [String]
def format(n)
  return format_string % n if format_string
  return '0.0' if n == 0.0
  magnitude = n.abs
  conversion = (magnitude > 1000 || magnitude < 0.001) ? 'E' : 'f'
  "%#{precision}.#{precision}#{conversion}" % n
end
229
+
230
# Bins the accumulated values: picks a number of bins if none was
# given, builds the edges, then drains the stored values into the
# bins.
#
# @see #bins?
def bin!
  set_num_bins_from_total_count! unless num_bins
  set_edges_from_min_max_and_num_bins!
  pending = values.shift(values.length)
  pending.each { |value| add_to_some_bin(value.to_f) if value }
  nil
end
241
+
242
# Does this widget have a populated list of bins?
#
# @return [true, false, nil] falsy when `bins` is unset or empty
def bins?
  current = bins
  return current unless current
  !current.empty?
end
248
+
249
# Get a numeric value from a given `record`.
#
# The raw value is looked up with `get` using the `by` field.
# Missing values, and values that cannot be converted with #to_f,
# produce nil so the caller can skip the record. The check is
# explicit (rather than a blanket `rescue`) so that unrelated errors
# are not silently swallowed.
#
# @param [Object] record
# @return [Float, nil]
def value_from(record)
  val = get(by, record)
  return nil unless val && val.respond_to?(:to_f)
  val.to_f
end
258
+
259
# Returns `val` unchanged unless `log_counts` is set, in which case
# its logarithm (see #log_if_possible) is returned instead.
#
# @param [Float] val
# @return [Float] the original value or its logarithm if required
def log_count_if_necessary(val)
  return val unless log_counts
  log_if_possible(val)
end
267
+
268
# Returns the logarithm of `val` to the configured `base`.
#
# Values that are not strictly positive (zero or negative) have no
# logarithm and are returned unchanged.
#
# @param [Float] val
# @return [Float]
def log_if_possible(val)
  return val unless val > 0
  Math.log(val, base)
end
277
+
278
+ private
279
+
280
# :nodoc
# Stores `new_min` as a Float in @min, refusing a minimum that is
# not strictly below an already-known `max`.
def receive_min(new_min)
  candidate = new_min.to_f
  if max && candidate >= max
    raise Error.new("The minimum value must be strictly less than the maximum value")
  end
  @min = candidate
end
285
+
286
# :nodoc
# Stores `new_max` as a Float in @max, refusing a maximum that is
# not strictly above an already-known `min`.
def receive_max(new_max)
  candidate = new_max.to_f
  if min && candidate <= min
    raise Error.new("The maximum value must be strictly greater than the minimum value")
  end
  @max = candidate
end
291
+
292
# :nodoc
# Stores `n` as an Integer in @num_bins; anything that does not
# convert to a number greater than zero is rejected.
def receive_num_bins(n)
  requested = n.to_i
  unless requested > 0
    raise Error.new("The number of bins must be a postive-definite integer")
  end
  @num_bins = requested
end
297
+
298
# :nodoc
# Stores the explicit bin edges as a sorted Array of Floats in
# @edges and rebuilds the bins and counts from them.
#
# Accepts a comma-separated String, an Array, or anything else that
# `Array()` can coerce. A nil argument clears the edges instead of
# failing with a NoMethodError.
#
# @param [String, Array, nil] es
# @return [Array<Float>, nil] the stored edges
def receive_edges(es)
  return @edges = nil if es.nil?
  raw = es.is_a?(String) ? es.split(',') : Array(es)
  @edges = raw.map(&:to_f).sort
  set_bins_and_counts_from_edges!
  @edges
end
307
+
308
# :nodoc
# Chooses the number of bins as the integer square root of the
# number of accumulated values, but never fewer than one bin (the
# square root of 0 would otherwise give an unusable bin count).
def set_num_bins_from_total_count!
  self.num_bins = [Math.sqrt(total_count).to_i, 1].max
end
312
+
313
# :nodoc
# Rebuilds @bins as the [lower, upper] pairs of consecutive edges
# and resets @counts to one zero per bin.
def set_bins_and_counts_from_edges!
  @bins = edges.each_cons(2).to_a
  @counts = Array.new(bins.length, 0)
end
322
+
323
# :nodoc
# Builds `num_bins` equally wide bins between `min` and `max` (equal
# width in log space when `log_bins` is set), stores the resulting
# edges and rebuilds the bins and counts.
#
# Interior edges are derived from their index rather than by
# repeated addition, so floating point drift cannot add a spurious
# extra edge. Logarithmic edges are mapped back with the configured
# `base` (the same base used to take the logarithm), and the outer
# edges are always exactly `min` and `max`.
def set_edges_from_min_max_and_num_bins!
  if log_bins
    lower, upper = log_if_possible(min), log_if_possible(max)
  else
    lower, upper = min, max
  end

  width = (upper - lower).to_f / num_bins
  # A zero-width domain (min == max) collapses to a single bin.
  interior = width > 0 ? (1...num_bins).map { |i| lower + i * width } : []
  interior = interior.map { |exponent| base ** exponent } if log_bins

  self.edges = [min] + interior + [max]
  set_bins_and_counts_from_edges!
end
351
+
352
# :nodoc:
# Increments the count of the bin whose half-open interval
# [lower, upper) contains `value`.
#
# Values outside every bin are clamped to the nearest end: anything
# below the lowest edge goes to the first bin, anything else
# (including the maximal element, which sits exactly on the top
# edge) goes to the last bin. Does nothing when there are no bins.
#
# FIXME optimize this O(n) algorithm...
def add_to_some_bin(value)
  return if bins.empty?
  index = bins.index { |lower, upper| value >= lower && value < upper }
  index ||= value < bins.first.first ? 0 : -1
  counts[index] += 1
end
364
+
365
+ register
366
+ end
367
+ end
368
+ end