ul-wukong 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,73 @@
1
+ require_relative("accumulator")
2
+
3
+ module Wukong
4
+ class Processor
5
+
6
+ # A processor which counts the total number of its input records.
7
+ #
8
+ # On its own, this widget is really just a poor man's `wc -l`.
9
+ # It's really intended to serve as a superclass for more complex
10
+ # accumulators.
11
+ #
12
+ # @example Count the total number of input records on the command-line.
13
+ #
14
+ # $ wc -l input
15
+ # 283 input
16
+ # $ cat input | wu-local count
17
+ # 283
18
+ class Count < Accumulator
19
+
20
+ description <<EOF
21
+ This processor counts the number of input records it receives.
22
+
23
+ $ wc -l input
24
+ 283 input
25
+ $ cat input | wu-local count
26
+ 283
27
+
28
+ This processor will not output any records until it receives its final
29
+ input record.
30
+ EOF
31
+
32
+ # The total size of the input records.
33
+ attr_accessor :size
34
+
35
+ # Initializes the count to 0.
36
+ def setup
37
+ super()
38
+ @size = 0
39
+ end
40
+
41
+ # Accumulate a `record` by incrementing the total size.
42
+ #
43
+ # @param [Object] record
44
+ def accumulate record
45
+ self.size += 1
46
+ end
47
+
48
+ # Keeps all records in the same group so that one count is
49
+ # emitted at the end.
50
+ #
51
+ # Overriding this method and returning different keys for
52
+ # different records is the beginning of constructing a "group
53
+ # by" type widget.
54
+ #
55
+ # @param [Object] record
56
+ # @return [:__first_group__]
57
+ # @see Group
58
+ def get_key record
59
+ :__first_group__
60
+ end
61
+
62
+ # Yields the total size.
63
+ #
64
+ # @yield [size]
65
+ # @yieldparam [Integer] size
66
+ def finalize
67
+ yield self.size
68
+ end
69
+
70
+ register
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,128 @@
1
+ require_relative("../utils")
2
+ require_relative("count")
3
+
4
+ module Wukong
5
+ class Processor
6
+
7
+ # Groups sorted input records and emits each group with a count.
8
+ #
9
+ # Allows you to use several ways of extracting the key that
10
+ # defines the group.
11
+ #
12
+ # **Note:** The input records must be previously sorted by the
13
+ # same key used for grouping in order to ensure that groups are
14
+ # not split up.
15
+ #
16
+ # @example Group simple string values on the command-line.
17
+ #
18
+ # $ cat input
19
+ # apple
20
+ # cat
21
+ # banana
22
+ # apple
23
+ # ...
24
+ # $ cat input | wu-local sort | wu-local group --to=tsv
25
+ # apple 4
26
+ # banana 2
27
+ # cat 5
28
+ # ...
29
+ #
30
+ # @example Group using a nested key within a JSON string on the command-line
31
+ #
32
+ # $ cat input
33
+ # {"id": 1, "word": "apple" }
34
+ # {"id": 2, "word": "cat" }
35
+ # {"id": 3, "word": "banana"}
36
+ # ...
37
+ # $ cat input | wu-local sort --on=word | wu-local group --by=word --to=tsv
38
+ # apple 4
39
+ # banana 2
40
+ # cat 5
41
+ # ...
42
+ #
43
+ # A group fits nicely at the end of a dataflow. Since it requires
44
+ # a sort, it is blocking.
45
+ #
46
+ # @example Using a group at the end of a dataflow
47
+ #
48
+ # Wukong.dataflow(:makes_groups) do
49
+ # ... | sort(on: 'field') | group(by: 'field') | to_tsv
50
+ # end
51
+ #
52
+ # @see Sort
53
+ class Group < Count
54
+
55
+ description <<EOF
56
+ This processor groups consecutive input records that share the same
57
+ "group key". There are several ways to extract this group key from a
58
+ record.
59
+
60
+ NOTE: The input records must be previously sorted by the
61
+ same key used for grouping in order to ensure that groups are
62
+ not split up.
63
+
64
+ By default the input records themselves are used as their own group
65
+ keys, allowing to count identical values, a la `uniq -c`:
66
+
67
+ $ cat input
68
+ apple
69
+ cat
70
+ banana
71
+ apple
72
+ ...
73
+
74
+ $ cat input | wu-local sort | wu-local group --to=tsv
75
+ apple 4
76
+ banana 2
77
+ cat 5
78
+ ...
79
+
80
+ You can also group by some part of in input record:
81
+
82
+ $ cat input
83
+ {"id": 1, "word": "apple" }
84
+ {"id": 2, "word": "cat" }
85
+ {"id": 3, "word": "banana"}
86
+ ...
87
+
88
+ $ cat input | wu-local sort --on==word | wu-local group --by=word --to=tsv
89
+ apple 4
90
+ banana 2
91
+ cat 5
92
+ ...
93
+
94
+ This processor will not produce any output for a given group until it
95
+ sees the last record of that group.
96
+ EOF
97
+
98
+ include DynamicGet
99
+ field :by, Whatever, :doc => "Part of the record to group by"
100
+
101
+ # Get the key which defines the group for this `record`.
102
+ #
103
+ # @param [Object] record
104
+ # @return [Object]
105
+ def get_key(record)
106
+ get(self.by, record)
107
+ end
108
+
109
+ # Reset the size counter for a new group.
110
+ #
111
+ # @param [Object] record
112
+ def start record
113
+ self.size = 0
114
+ end
115
+
116
+ # Yields the current group along with its size
117
+ #
118
+ # @yield [key, size]
119
+ # @yieldparam [Object] key the key defining the group
120
+ # @yieldparam [Integer] size the size of the group
121
+ def finalize
122
+ yield [key, size]
123
+ end
124
+
125
+ register
126
+ end
127
+ end
128
+ end
@@ -0,0 +1,98 @@
1
+ require_relative("group")
2
+
3
+ module Wukong
4
+ class Processor
5
+
6
+ # Concatenates the elements of a group, yielding the group key,
7
+ # the count, and its members.
8
+ #
9
+ # @example Concatenating elements of a group on the command-line.
10
+ #
11
+ # $ cat input
12
+ # {"id": 1, "parent_id": 4}
13
+ # {"id": 2, "parent_id": 3}
14
+ # {"id": 3, "parent_id": 3}
15
+ # ...
16
+ # $ cat input | wu-local group_concat --by=parent_id --to=tsv
17
+ # 4 1 {"id": 1, "parent_id": 4}
18
+ # 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
19
+ # ...
20
+ #
21
+ # GroupConcat takes all the same options as Group.
22
+ #
23
+ # @see Group
24
+ class GroupConcat < Group
25
+
26
+ description <<EOF
27
+ This processor concatenates records of a consecutive group of records
28
+ into a single record.
29
+
30
+ $ cat input
31
+ {"id": 1, "parent_id": 4}
32
+ {"id": 2, "parent_id": 3}
33
+ {"id": 3, "parent_id": 3}
34
+ ...
35
+
36
+ $ cat input | wu-local group_concat --by=parent_id --to=tsv
37
+ 4 1 {"id": 1, "parent_id": 4}
38
+ 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
39
+ ...
40
+
41
+ Each output record consists of tab-separated fields in the following
42
+ order:
43
+
44
+ 1) The key defining the group of input records in this output record
45
+ 2) The number of input records in the group
46
+ 3) Each input record in the group
47
+ ...
48
+
49
+ This processor will not produce any output for a given group until it
50
+ sees the last record of that group. See the documentation for the
51
+ 'group' processor for more information.
52
+ EOF
53
+
54
+ # The members of the current group.
55
+ attr_accessor :members
56
+
57
+ # Initializes the empty members array.
58
+ def setup
59
+ super()
60
+ @members = []
61
+ end
62
+
63
+ # Resets the members array to empty for a new group.
64
+ #
65
+ # @param [Object] record
66
+ def start record
67
+ super(record)
68
+ self.members = []
69
+ end
70
+
71
+ # Accumulate each record, adding it to the current members.
72
+ #
73
+ # @param [Object] record
74
+ def accumulate record
75
+ super(record)
76
+ self.members << record
77
+ end
78
+
79
+ # Yields the group, including its key, its size, and each
80
+ # member.
81
+ #
82
+ # @yield [key, size, *members]
83
+ # @yieldparam [Object] key the key defining the group
84
+ # @yieldparam [Integer] size the number of members in the group
85
+ # @yieldparam [Array<Object>] members the members of the group
86
+ def finalize
87
+ group = [key, size]
88
+ group.concat(members)
89
+ yield group
90
+ end
91
+
92
+ register
93
+ end
94
+ end
95
+ end
96
+
97
+
98
+
@@ -0,0 +1,71 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # A base widget for building more complex improver widgets.
5
+ class Improver < Processor
6
+
7
+ # The current group of records.
8
+ attr_accessor :group
9
+
10
+ # Sets up this improver by defining an initial key (with a
11
+ # value that is unlikely to be found in real data) and calling
12
+ # `#zero` with no record.
13
+ def setup
14
+ @key = :__first_group__
15
+ zero
16
+ end
17
+
18
+ def recordize record
19
+ record.split("\t")
20
+ end
21
+
22
+ #
23
+ # All kinds of assumptions here,
24
+ # record is tab-delimited and the
25
+ # first field is a name of a function
26
+ # to call
27
+ #
28
+ def get_function record
29
+ record.first
30
+ end
31
+
32
+ # Processes the `record`.
33
+ def process(record)
34
+ fields = recordize(record)
35
+ func = get_function(fields)
36
+ case func
37
+ when 'zero' then
38
+ yield zero
39
+ when 'accumulate' then
40
+ accumulate(fields[1..-1])
41
+ when 'improve' then
42
+ yield improve(fields[1], self.group)
43
+ self.group = []
44
+ else
45
+ raise NoMethodError, "undefined method #{func} for Improver"
46
+ end
47
+ STDOUT.flush # WHY? Because.
48
+ end
49
+
50
+ # Starts accumulation for a new key. Return what you would
51
+ # with no improvements.
52
+ def zero
53
+ self.group = []
54
+ end
55
+
56
+ # Accumulates another +record+.
57
+ #
58
+ # @param [Object] record
59
+ def accumulate record
60
+ self.group << record
61
+ end
62
+
63
+ # Improve prev with group
64
+ #
65
+ #
66
+ def improve prev, group
67
+ end
68
+
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,37 @@
1
+ module Wukong
2
+ class Processor
3
+
4
+ # Joins XML input data based on a root tag.
5
+ class JoinXML < Processor
6
+
7
+ field :root, String, default: 'xml', doc: "Name of the root XML element"
8
+
9
+ def setup
10
+ @lines = []
11
+ end
12
+
13
+ def process line
14
+ if match = terminator.match(line)
15
+ if match.end(0) == line.size
16
+ @lines << line
17
+ else
18
+ @lines << line[0...match.end(0)]
19
+ end
20
+ yield @lines.join("\n")
21
+ @lines = []
22
+ @lines << line[match.end(0)..-1] unless match.end(0) == line.size
23
+ else
24
+ @lines << line
25
+ end
26
+ end
27
+
28
+ def terminator
29
+ %r{<\s*/\s*#{root}\s*>}i
30
+ end
31
+
32
+ register :join_xml
33
+ end
34
+ end
35
+ end
36
+
37
+
@@ -0,0 +1,72 @@
1
+ require_relative("group")
2
+
3
+ module Wukong
4
+ class Processor
5
+ class Moments < Group
6
+
7
+ field :group_by, Whatever, :doc => "Part of the record to group by"
8
+
9
+ attr_accessor :measurements
10
+
11
+ field :of, Array, :default => [], :doc => "Parts of the record to measure moments of"
12
+ field :no_std_dev, :boolean, :doc => "Don't compute standard deviations"
13
+
14
+ def get_key record
15
+ super(record) unless (self.group_by || self.by)
16
+ get(self.group_by || self.by, record)
17
+ end
18
+
19
+ def receive_of o
20
+ @of = case o
21
+ when String then o.split(',')
22
+ when Array then o
23
+ else []
24
+ end
25
+ end
26
+
27
+ def start record
28
+ super(record)
29
+ @measurements = {}.tap do |m|
30
+ self.of.each do |property|
31
+ m[property] = []
32
+ end
33
+ end
34
+ end
35
+
36
+ def accumulate record
37
+ super(record)
38
+ self.of.each do |property|
39
+ if raw = get(property, record)
40
+ self.measurements[property] << (raw.to_f rescue next)
41
+ end
42
+ end
43
+ end
44
+
45
+ def results
46
+ {}.tap do |r|
47
+ measurements.each_pair do |property, values|
48
+ r[property] = {}
49
+ next if values.empty?
50
+ count = values.size.to_f
51
+ r[property][:count] = count.to_i
52
+
53
+ mean = values.inject(0.0) { |sum, value| sum += value } / count
54
+ r[property][:mean] = mean
55
+ unless no_std_dev
56
+ variance = values.inject(0.0) { |sum, value| diff = (value - mean) ; sum += diff * diff } / count
57
+ std = Math.sqrt(variance)
58
+ r[property][:std_dev] = std
59
+ end
60
+ end
61
+ end
62
+ end
63
+
64
+ def finalize
65
+ yield({:group => key, :count => size}.merge(:results => results))
66
+ end
67
+
68
+ register
69
+ end
70
+ end
71
+ end
72
+