ul-wukong 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +60 -0
- data/.gitmodules +6 -0
- data/.rspec +2 -0
- data/.travis.yml +19 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +17 -0
- data/Guardfile +12 -0
- data/LICENSE.md +95 -0
- data/NOTES-travis.md +31 -0
- data/README-old.md +422 -0
- data/README.md +1308 -0
- data/Rakefile +28 -0
- data/TODO.md +99 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +6 -0
- data/bin/md5sort +20 -0
- data/bin/setcat +11 -0
- data/bin/tabchar +5 -0
- data/bin/uniq-ord +59 -0
- data/bin/uniqc +3 -0
- data/bin/wu +34 -0
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-date +13 -0
- data/bin/wu-datetime +13 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +186 -0
- data/bin/wu-local +4 -0
- data/bin/wu-plus +9 -0
- data/bin/wu-source +5 -0
- data/bin/wu-sum +31 -0
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/basic/tiny_count.rb +8 -0
- data/examples/basic/word_count/accumulator.rb +26 -0
- data/examples/basic/word_count/tokenizer.rb +13 -0
- data/examples/basic/word_count/word_count.rb +6 -0
- data/examples/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/deploy_pack/Gemfile +6 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/dsl/dataflow/simple.rb +12 -0
- data/examples/dsl/dataflow/telegram.rb +45 -0
- data/examples/dsl/workflow/cherry_pie.dot +97 -0
- data/examples/dsl/workflow/cherry_pie.md +104 -0
- data/examples/dsl/workflow/cherry_pie.png +0 -0
- data/examples/dsl/workflow/cherry_pie.rb +101 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/examples_helper.rb +9 -0
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/loadable.rb +2 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/examples/munging/airline_flights/airplane.rb +0 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +107 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/rake_helper.rb +97 -0
- data/examples/ruby_project/Gemfile +6 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/splitter.rb +94 -0
- data/examples/string_reverser.rb +7 -0
- data/examples/text/pig_latin/pig_latinizer.rb +35 -0
- data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +36 -0
- data/lib/hanuman/graph.rb +97 -0
- data/lib/hanuman/graphvizzer.rb +206 -0
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +128 -0
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/lib/wu/graph/union_find.rb +62 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/lib/wu/munging.rb +71 -0
- data/lib/wu/social/models/twitter.rb +31 -0
- data/lib/wu/wikipedia/models.rb +20 -0
- data/lib/wukong.rb +54 -0
- data/lib/wukong/dataflow.rb +43 -0
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +214 -0
- data/lib/wukong/driver/event_machine_driver.rb +15 -0
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +42 -0
- data/lib/wukong/local/runner.rb +96 -0
- data/lib/wukong/local/stdio_driver.rb +104 -0
- data/lib/wukong/logger.rb +102 -0
- data/lib/wukong/model/faker.rb +136 -0
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +110 -0
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +169 -0
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers.rb +74 -0
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
- data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
- data/lib/wukong/version.rb +3 -0
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/extract.rb +122 -0
- data/lib/wukong/widget/filters.rb +452 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +10 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +368 -0
- data/lib/wukong/widget/reducers/count.rb +73 -0
- data/lib/wukong/widget/reducers/group.rb +128 -0
- data/lib/wukong/widget/reducers/group_concat.rb +98 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +180 -0
- data/lib/wukong/widget/reducers/uniq.rb +91 -0
- data/lib/wukong/widget/serializers.rb +317 -0
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +7 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
- data/spec/examples/dataflow/parsing_spec.rb +14 -0
- data/spec/examples/dataflow/simple_spec.rb +34 -0
- data/spec/examples/dataflow/telegram_spec.rb +43 -0
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +18 -0
- data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
- data/spec/hanuman/graph_spec.rb +119 -0
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +81 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +43 -0
- data/spec/support/example_test_helpers.rb +95 -0
- data/spec/support/hanuman_test_helpers.rb +92 -0
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +94 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/dataflow_spec.rb +87 -0
- data/spec/wukong/driver_spec.rb +154 -0
- data/spec/wukong/local/runner_spec.rb +29 -0
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/model/faker_spec.rb +132 -0
- data/spec/wukong/processor_spec.rb +21 -0
- data/spec/wukong/runner_spec.rb +132 -0
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/filters_spec.rb +79 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +21 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
- data/spec/wukong/widget/serializers_spec.rb +114 -0
- data/spec/wukong/widget/sink_spec.rb +19 -0
- data/spec/wukong/widget/source_spec.rb +65 -0
- data/spec/wukong/wu-local_spec.rb +109 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +35 -0
- metadata +465 -0
@@ -0,0 +1,73 @@
|
|
1
|
+
require_relative("accumulator")
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
class Processor
|
5
|
+
|
6
|
+
# A processor which counts the total number of its input records.
|
7
|
+
#
|
8
|
+
# On its own, this widget is really just a poor man's `wc -l`.
|
9
|
+
# It's really intended to serve as a superclass for more complex
|
10
|
+
# accumulators.
|
11
|
+
#
|
12
|
+
# @example Count the total number of input records on the command-line.
|
13
|
+
#
|
14
|
+
# $ wc -l input
|
15
|
+
# 283 input
|
16
|
+
# $ cat input | wu-local count
|
17
|
+
# 283
|
18
|
+
class Count < Accumulator
|
19
|
+
|
20
|
+
description <<EOF
|
21
|
+
This processor counts the number of input records it receives.
|
22
|
+
|
23
|
+
$ wc -l input
|
24
|
+
283 input
|
25
|
+
$ cat input | wu-local count
|
26
|
+
283
|
27
|
+
|
28
|
+
This processor will not output any records until it receives its final
|
29
|
+
input record.
|
30
|
+
EOF
|
31
|
+
|
32
|
+
# The total size of the input records.
|
33
|
+
attr_accessor :size
|
34
|
+
|
35
|
+
# Prepares the counter before any records arrive.
#
# Runs the parent class's setup first, then zeroes the running total.
def setup
  super()
  @size = 0
end
|
40
|
+
|
41
|
+
# Accumulates a `record` by incrementing the total size.
#
# The content of the record is not inspected; only its arrival is
# counted.
#
# @param [Object] record
def accumulate(record)
  self.size = size + 1
end
|
47
|
+
|
48
|
+
# Assigns every record to one and the same group, so that a single
# count is emitted at the end.
#
# Returning a different key for different records is the starting
# point for building a "group by" style widget.
#
# @param [Object] record ignored by this implementation
# @return [Symbol] always `:__first_group__`
# @see Group
def get_key(record)
  :__first_group__
end
|
61
|
+
|
62
|
+
# Hands the accumulated total to the caller's block.
#
# @yield [size]
# @yieldparam [Integer] size
def finalize
  yield size
end
|
69
|
+
|
70
|
+
register
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
require_relative("../utils")
|
2
|
+
require_relative("count")
|
3
|
+
|
4
|
+
module Wukong
|
5
|
+
class Processor
|
6
|
+
|
7
|
+
# Groups sorted input records and emits each group with a count.
|
8
|
+
#
|
9
|
+
# Allows you to use several ways of extracting the key that
|
10
|
+
# defines the group.
|
11
|
+
#
|
12
|
+
# **Note:** The input records must be previously sorted by the
|
13
|
+
# same key used for grouping in order to ensure that groups are
|
14
|
+
# not split up.
|
15
|
+
#
|
16
|
+
# @example Group simple string values on the command-line.
|
17
|
+
#
|
18
|
+
# $ cat input
|
19
|
+
# apple
|
20
|
+
# cat
|
21
|
+
# banana
|
22
|
+
# apple
|
23
|
+
# ...
|
24
|
+
# $ cat input | wu-local sort | wu-local group --to=tsv
|
25
|
+
# apple 4
|
26
|
+
# banana 2
|
27
|
+
# cat 5
|
28
|
+
# ...
|
29
|
+
#
|
30
|
+
# @example Group using a nested key within a JSON string on the command-line
|
31
|
+
#
|
32
|
+
# $ cat input
|
33
|
+
# {"id": 1, "word": "apple" }
|
34
|
+
# {"id": 2, "word": "cat" }
|
35
|
+
# {"id": 3, "word": "banana"}
|
36
|
+
# ...
|
37
|
+
# $ cat input | wu-local sort --on=word | wu-local group --by=word --to=tsv
|
38
|
+
# apple 4
|
39
|
+
# banana 2
|
40
|
+
# cat 5
|
41
|
+
# ...
|
42
|
+
#
|
43
|
+
# A group fits nicely at the end of a dataflow. Since it requires
|
44
|
+
# a sort, it is blocking.
|
45
|
+
#
|
46
|
+
# @example Using a group at the end of a dataflow
|
47
|
+
#
|
48
|
+
# Wukong.dataflow(:makes_groups) do
|
49
|
+
# ... | sort(on: 'field') | group(by: 'field') | to_tsv
|
50
|
+
# end
|
51
|
+
#
|
52
|
+
# @see Sort
|
53
|
+
class Group < Count
|
54
|
+
|
55
|
+
description <<EOF
|
56
|
+
This processor groups consecutive input records that share the same
|
57
|
+
"group key". There are several ways to extract this group key from a
|
58
|
+
record.
|
59
|
+
|
60
|
+
NOTE: The input records must be previously sorted by the
|
61
|
+
same key used for grouping in order to ensure that groups are
|
62
|
+
not split up.
|
63
|
+
|
64
|
+
By default the input records themselves are used as their own group
|
65
|
+
keys, allowing to count identical values, a la `uniq -c`:
|
66
|
+
|
67
|
+
$ cat input
|
68
|
+
apple
|
69
|
+
cat
|
70
|
+
banana
|
71
|
+
apple
|
72
|
+
...
|
73
|
+
|
74
|
+
$ cat input | wu-local sort | wu-local group --to=tsv
|
75
|
+
apple 4
|
76
|
+
banana 2
|
77
|
+
cat 5
|
78
|
+
...
|
79
|
+
|
80
|
+
You can also group by some part of in input record:
|
81
|
+
|
82
|
+
$ cat input
|
83
|
+
{"id": 1, "word": "apple" }
|
84
|
+
{"id": 2, "word": "cat" }
|
85
|
+
{"id": 3, "word": "banana"}
|
86
|
+
...
|
87
|
+
|
88
|
+
$ cat input | wu-local sort --on==word | wu-local group --by=word --to=tsv
|
89
|
+
apple 4
|
90
|
+
banana 2
|
91
|
+
cat 5
|
92
|
+
...
|
93
|
+
|
94
|
+
This processor will not produce any output for a given group until it
|
95
|
+
sees the last record of that group.
|
96
|
+
EOF
|
97
|
+
|
98
|
+
include DynamicGet
|
99
|
+
field :by, Whatever, :doc => "Part of the record to group by"
|
100
|
+
|
101
|
+
# Extracts the grouping key for `record`.
#
# The lookup is delegated to `get`, using the `by` field as the
# selector.
#
# @param [Object] record
# @return [Object]
def get_key(record)
  get(by, record)
end
|
108
|
+
|
109
|
+
# Begins a new group by resetting the size counter to zero.
#
# @param [Object] record the first record of the new group (unused here)
def start(record)
  self.size = 0
end
|
115
|
+
|
116
|
+
# Emits the finished group as a two-element array.
#
# @yield [key, size]
# @yieldparam [Object] key the key defining the group
# @yieldparam [Integer] size the size of the group
def finalize
  yield([key, size])
end
|
124
|
+
|
125
|
+
register
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative("group")
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
class Processor
|
5
|
+
|
6
|
+
# Concatenates the elements of a group, yielding the group key,
|
7
|
+
# the count, and its members.
|
8
|
+
#
|
9
|
+
# @example Concatenating elements of a group on the command-line.
|
10
|
+
#
|
11
|
+
# $ cat input
|
12
|
+
# {"id": 1, "parent_id": 4}
|
13
|
+
# {"id": 2, "parent_id": 3}
|
14
|
+
# {"id": 3, "parent_id": 3}
|
15
|
+
# ...
|
16
|
+
# $ cat input | wu-local group_concat --by=parent_id --to=tsv
|
17
|
+
# 4 1 {"id": 1, "parent_id": 4}
|
18
|
+
# 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
|
19
|
+
# ...
|
20
|
+
#
|
21
|
+
# GroupConcat takes all the same options as Group.
|
22
|
+
#
|
23
|
+
# @see Group
|
24
|
+
class GroupConcat < Group
|
25
|
+
|
26
|
+
description <<EOF
|
27
|
+
This processor concatenates records of a consecutive group of records
|
28
|
+
into a single record.
|
29
|
+
|
30
|
+
$ cat input
|
31
|
+
{"id": 1, "parent_id": 4}
|
32
|
+
{"id": 2, "parent_id": 3}
|
33
|
+
{"id": 3, "parent_id": 3}
|
34
|
+
...
|
35
|
+
|
36
|
+
$ cat input | wu-local group_concat --by=parent_id --to=tsv
|
37
|
+
4 1 {"id": 1, "parent_id": 4}
|
38
|
+
3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
|
39
|
+
...
|
40
|
+
|
41
|
+
Each output record consists of tab-separated fields in the following
|
42
|
+
order:
|
43
|
+
|
44
|
+
1) The key defining the group of input records in this output record
|
45
|
+
2) The number of input records in the group
|
46
|
+
3) Each input record in the group
|
47
|
+
...
|
48
|
+
|
49
|
+
This processor will not produce any output for a given group until it
|
50
|
+
sees the last record of that group. See the documentation for the
|
51
|
+
'group' processor for more information.
|
52
|
+
EOF
|
53
|
+
|
54
|
+
# The members of the current group.
|
55
|
+
attr_accessor :members
|
56
|
+
|
57
|
+
# Prepares the widget: runs the parent setup, then creates an empty
# members array.
def setup
  super()
  @members = []
end
|
62
|
+
|
63
|
+
# Begins a new group: lets the parent class do its own reset, then
# replaces the members array with a fresh empty one.
#
# @param [Object] record the first record of the new group
def start(record)
  super(record)
  self.members = []
end
|
70
|
+
|
71
|
+
# Adds `record` to the current group: the parent class accumulates it
# first, then the record is appended to the members array.
#
# @param [Object] record
def accumulate(record)
  super(record)
  members << record
end
|
78
|
+
|
79
|
+
# Emits the finished group as one flat array: the key, the size, and
# then every member in the order it was accumulated.
#
# @yield [key, size, *members]
# @yieldparam [Object] key the key defining the group
# @yieldparam [Integer] size the number of members in the group
# @yieldparam [Array<Object>] members the members of the group
def finalize
  yield [key, size, *members]
end
|
91
|
+
|
92
|
+
register
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
|
98
|
+
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# A base widget for building more complex improver widgets.
|
5
|
+
class Improver < Processor
|
6
|
+
|
7
|
+
# The current group of records.
|
8
|
+
attr_accessor :group
|
9
|
+
|
10
|
+
# Prepares this improver.
#
# Seeds the key with a placeholder symbol (a value unlikely to occur in
# real data) and then calls `#zero` to start with an empty group.
def setup
  @key = :__first_group__
  zero
end
|
17
|
+
|
18
|
+
# Splits a raw record into its fields on tab characters.
#
# @param [String] record
# @return [Array<String>]
def recordize(record)
  record.split("\t")
end
|
21
|
+
|
22
|
+
# Picks the name of the function to call out of an already split record.
#
# This relies on the record having been tab-delimited, with the function
# name in the first field.
#
# @param [Array] record the fields of the record
# @return [Object] the first field
def get_function(record)
  record.first
end
|
31
|
+
|
32
|
+
# Dispatches one instruction record to the matching improver step.
#
# The record is split by `#recordize` and the step name is chosen by
# `#get_function`:
#
# * 'zero'       -- yields the result of `#zero`
# * 'accumulate' -- passes the remaining fields to `#accumulate`
# * 'improve'    -- yields `#improve` applied to the second field and
#                   the current group, then empties the group
#
# @param [Object] record
# @raise [NoMethodError] when the step name is none of the above
def process(record)
  parts = recordize(record)
  step  = get_function(parts)
  case step
  when 'zero'
    yield zero
  when 'accumulate'
    accumulate(parts[1..-1])
  when 'improve'
    yield improve(parts[1], group)
    self.group = []
  else
    raise NoMethodError, "undefined method #{step} for Improver"
  end
  # Standard output is flushed after every record; the original
  # author's note gives no reason for this.
  STDOUT.flush
end
|
49
|
+
|
50
|
+
# Starts accumulation for a new key by replacing the group with an
# empty array.
#
# The return value is what you would report with no improvements; here
# that is the new empty array.
#
# @return [Array]
def zero
  self.group = []
end
|
55
|
+
|
56
|
+
# Appends another `record` to the current group.
#
# @param [Object] record
# @return [Array] the group, now including `record`
def accumulate(record)
  group << record
end
|
62
|
+
|
63
|
+
# Improves `prev` using the accumulated `group`.
#
# The base implementation is intentionally empty and returns nil.
#
# @param [Object] prev
# @param [Object] group
# @return [nil]
def improve(prev, group); end
|
68
|
+
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# Joins XML input data based on a root tag.
|
5
|
+
class JoinXML < Processor
|
6
|
+
|
7
|
+
field :root, String, default: 'xml', doc: "Name of the root XML element"
|
8
|
+
|
9
|
+
# Starts with an empty buffer of lines waiting to be joined.
def setup
  @lines = []
end
|
12
|
+
|
13
|
+
# Buffers input lines until the closing root tag is seen, then yields
# the buffered lines joined with newlines as one document.
#
# Text after a closing tag on the same line becomes the start of the
# next document. That remainder is scanned again, so a line holding
# several closing tags yields one document per tag instead of merging
# them into a later output.
#
# @param [String] line
# @yield [document]
# @yieldparam [String] document the joined lines, ending at the closing tag
def process(line)
  pattern = terminator
  rest = line
  while (match = pattern.match(rest))
    stop = match.end(0)
    @lines << rest[0...stop]
    yield @lines.join("\n")
    @lines = []
    # Nothing follows the closing tag: the buffer stays empty.
    return if stop == rest.size
    rest = rest[stop..-1]
  end
  @lines << rest
end
|
27
|
+
|
28
|
+
# Builds the pattern that recognises the closing tag of the root element.
#
# Whitespace is tolerated around the slash and the tag name, and the
# match is case-insensitive. The root name is escaped so that characters
# such as `.` are matched literally rather than as regexp syntax.
#
# @return [Regexp]
def terminator
  %r{<\s*/\s*#{Regexp.escape(root.to_s)}\s*>}i
end
|
31
|
+
|
32
|
+
register :join_xml
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require_relative("group")
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
class Processor
|
5
|
+
class Moments < Group
|
6
|
+
|
7
|
+
field :group_by, Whatever, :doc => "Part of the record to group by"
|
8
|
+
|
9
|
+
attr_accessor :measurements
|
10
|
+
|
11
|
+
field :of, Array, :default => [], :doc => "Parts of the record to measure moments of"
|
12
|
+
field :no_std_dev, :boolean, :doc => "Don't compute standard deviations"
|
13
|
+
|
14
|
+
# Extracts the grouping key for `record`.
#
# The `group_by` field takes precedence; when it is not set, the `by`
# field is used instead. The selector is handed to `get` exactly once
# (the earlier version also made a `super` call whose result was
# discarded, repeating the same lookup).
#
# @param [Object] record
# @return [Object]
def get_key(record)
  get(group_by || by, record)
end
|
18
|
+
|
19
|
+
# Normalises the value given for the `of` field and stores it in @of.
#
# * a String is split on commas
# * an Array is kept as it is
# * anything else becomes an empty array
#
# @param [Object] o
# @return [Array]
def receive_of(o)
  @of =
    if o.is_a?(String)
      o.split(',')
    elsif o.is_a?(Array)
      o
    else
      []
    end
end
|
26
|
+
|
27
|
+
# Begins a new group: runs the parent class's start, then creates one
# empty list of measurements for each property named in `of`.
#
# @param [Object] record the first record of the new group
# @return [Hash] the fresh property => [] table
def start(record)
  super(record)
  @measurements = of.each_with_object({}) { |property, table| table[property] = [] }
end
|
35
|
+
|
36
|
+
# Adds `record` to the current group and records one measurement per
# property listed in `of`.
#
# The parent class accumulates the record first. For each property the
# raw value is fetched with `get`; nil or false values are skipped, and
# so are values whose `to_f` raises a StandardError.
#
# @param [Object] record
def accumulate(record)
  super(record)
  of.each do |property|
    raw = get(property, record)
    next unless raw

    # Look up the target list before converting, as the original did.
    bucket = measurements[property]
    number =
      begin
        raw.to_f
      rescue StandardError
        next
      end
    bucket << number
  end
end
|
44
|
+
|
45
|
+
# Summarises the measurements collected for the current group.
#
# Every property gets an entry. A property with no measurements maps to
# an empty hash; otherwise the entry holds :count, :mean and, unless
# `no_std_dev` is set, :std_dev. The variance is divided by the number
# of values.
#
# @return [Hash] property => statistics hash
def results
  measurements.each_with_object({}) do |(property, values), summary|
    stats = summary[property] = {}
    next if values.empty?

    n = values.size.to_f
    stats[:count] = n.to_i

    mean = values.inject(0.0) { |total, value| total + value } / n
    stats[:mean] = mean
    next if no_std_dev

    variance = values.inject(0.0) { |total, value| total + (value - mean) * (value - mean) } / n
    stats[:std_dev] = Math.sqrt(variance)
  end
end
|
63
|
+
|
64
|
+
# Emits the finished group as a single hash holding the group key, the
# number of records, and the statistics returned by `#results`.
#
# @yield [summary]
# @yieldparam [Hash] summary with keys :group, :count and :results
def finalize
  yield({ group: key, count: size, results: results })
end
|
67
|
+
|
68
|
+
register
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|