ul-wukong 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +60 -0
- data/.gitmodules +6 -0
- data/.rspec +2 -0
- data/.travis.yml +19 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +17 -0
- data/Guardfile +12 -0
- data/LICENSE.md +95 -0
- data/NOTES-travis.md +31 -0
- data/README-old.md +422 -0
- data/README.md +1308 -0
- data/Rakefile +28 -0
- data/TODO.md +99 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +6 -0
- data/bin/md5sort +20 -0
- data/bin/setcat +11 -0
- data/bin/tabchar +5 -0
- data/bin/uniq-ord +59 -0
- data/bin/uniqc +3 -0
- data/bin/wu +34 -0
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-date +13 -0
- data/bin/wu-datetime +13 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +186 -0
- data/bin/wu-local +4 -0
- data/bin/wu-plus +9 -0
- data/bin/wu-source +5 -0
- data/bin/wu-sum +31 -0
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/basic/tiny_count.rb +8 -0
- data/examples/basic/word_count/accumulator.rb +26 -0
- data/examples/basic/word_count/tokenizer.rb +13 -0
- data/examples/basic/word_count/word_count.rb +6 -0
- data/examples/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/deploy_pack/Gemfile +6 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/dsl/dataflow/simple.rb +12 -0
- data/examples/dsl/dataflow/telegram.rb +45 -0
- data/examples/dsl/workflow/cherry_pie.dot +97 -0
- data/examples/dsl/workflow/cherry_pie.md +104 -0
- data/examples/dsl/workflow/cherry_pie.png +0 -0
- data/examples/dsl/workflow/cherry_pie.rb +101 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/examples_helper.rb +9 -0
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/loadable.rb +2 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/examples/munging/airline_flights/airplane.rb +0 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +107 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/rake_helper.rb +97 -0
- data/examples/ruby_project/Gemfile +6 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/splitter.rb +94 -0
- data/examples/string_reverser.rb +7 -0
- data/examples/text/pig_latin/pig_latinizer.rb +35 -0
- data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +36 -0
- data/lib/hanuman/graph.rb +97 -0
- data/lib/hanuman/graphvizzer.rb +206 -0
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +128 -0
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/lib/wu/graph/union_find.rb +62 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/lib/wu/munging.rb +71 -0
- data/lib/wu/social/models/twitter.rb +31 -0
- data/lib/wu/wikipedia/models.rb +20 -0
- data/lib/wukong.rb +54 -0
- data/lib/wukong/dataflow.rb +43 -0
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +214 -0
- data/lib/wukong/driver/event_machine_driver.rb +15 -0
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +42 -0
- data/lib/wukong/local/runner.rb +96 -0
- data/lib/wukong/local/stdio_driver.rb +104 -0
- data/lib/wukong/logger.rb +102 -0
- data/lib/wukong/model/faker.rb +136 -0
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +110 -0
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +169 -0
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers.rb +74 -0
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
- data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
- data/lib/wukong/version.rb +3 -0
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/extract.rb +122 -0
- data/lib/wukong/widget/filters.rb +452 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +10 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +368 -0
- data/lib/wukong/widget/reducers/count.rb +73 -0
- data/lib/wukong/widget/reducers/group.rb +128 -0
- data/lib/wukong/widget/reducers/group_concat.rb +98 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +180 -0
- data/lib/wukong/widget/reducers/uniq.rb +91 -0
- data/lib/wukong/widget/serializers.rb +317 -0
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +7 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
- data/spec/examples/dataflow/parsing_spec.rb +14 -0
- data/spec/examples/dataflow/simple_spec.rb +34 -0
- data/spec/examples/dataflow/telegram_spec.rb +43 -0
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +18 -0
- data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
- data/spec/hanuman/graph_spec.rb +119 -0
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +81 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +43 -0
- data/spec/support/example_test_helpers.rb +95 -0
- data/spec/support/hanuman_test_helpers.rb +92 -0
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +94 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/dataflow_spec.rb +87 -0
- data/spec/wukong/driver_spec.rb +154 -0
- data/spec/wukong/local/runner_spec.rb +29 -0
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/model/faker_spec.rb +132 -0
- data/spec/wukong/processor_spec.rb +21 -0
- data/spec/wukong/runner_spec.rb +132 -0
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/filters_spec.rb +79 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +21 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
- data/spec/wukong/widget/serializers_spec.rb +114 -0
- data/spec/wukong/widget/sink_spec.rb +19 -0
- data/spec/wukong/widget/source_spec.rb +65 -0
- data/spec/wukong/wu-local_spec.rb +109 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +35 -0
- metadata +465 -0
@@ -0,0 +1,180 @@
|
|
1
|
+
require_relative("accumulator")
|
2
|
+
require_relative("../utils")
|
3
|
+
|
4
|
+
module Wukong
|
5
|
+
class Processor
|
6
|
+
|
7
|
+
# Sorts input records.
|
8
|
+
#
|
9
|
+
# For many use cases you're better off using native tools like
|
10
|
+
# `/bin/sort` because they are faster and already do what you
|
11
|
+
# need.
|
12
|
+
#
|
13
|
+
# @example When /bin/sort is more than enough on the command-line
|
14
|
+
#
|
15
|
+
# $ cat input
|
16
|
+
# 1 apple
|
17
|
+
# 2 banana
|
18
|
+
# 3 cat
|
19
|
+
# 4 banana
|
20
|
+
# ...
|
21
|
+
# $ cat input | sort -k2
|
22
|
+
# 1 apple
|
23
|
+
# 2 banana
|
24
|
+
# 4 banana
|
25
|
+
# 3 cat
|
26
|
+
# ...
|
27
|
+
#
|
28
|
+
# Other times, you need something that can introspect more on its
|
29
|
+
# input:
|
30
|
+
#
|
31
|
+
# @example When you may prefer the sort widget on the command-line
|
32
|
+
#
|
33
|
+
# $ cat input
|
34
|
+
# {"id": 1, "word": "apple" }
|
35
|
+
# {"id": 2, "word": "cat" }
|
36
|
+
# {"id": 3, "word": "banana"}
|
37
|
+
# ...
|
38
|
+
# $ cat input | wu-local sort --on word
|
39
|
+
# {"id": 1, "word": "apple" }
|
40
|
+
# {"id": 3, "word": "banana"}
|
41
|
+
# {"id": 2, "word": "cat" }
|
42
|
+
# ...
|
43
|
+
#
|
44
|
+
# The sort widget is useful for modeling Hadoop jobs, but don't
|
45
|
+
# forget that [Hadoop does its own
|
46
|
+
# sorting](http://hadoop.apache.org/docs/r0.20.2/mapred_tutorial.html#Sort),
|
47
|
+
# so the sort widget doesn't belong in your map/reduce jobs.
|
48
|
+
#
|
49
|
+
# @example The wrong way to model a Hadoop map/reduce job
|
50
|
+
#
|
51
|
+
# Wukong.dataflow(:my_incorrect_job_dataflow) do
|
52
|
+
# parse | extract(part: 'country') | sort | group
|
53
|
+
# end
|
54
|
+
#
|
55
|
+
# @example The right way to model a Hadoop map/reduce job
|
56
|
+
#
|
57
|
+
# Wukong.dataflow(:mapper) do
|
58
|
+
# parse | extract(part: 'country')
|
59
|
+
# end
|
60
|
+
#
|
61
|
+
# Wukong.dataflow(:reducer) do
|
62
|
+
# group
|
63
|
+
# end
|
64
|
+
class Sort < Accumulator
|
65
|
+
|
66
|
+
description <<EOF
|
67
|
+
This processor sorts input records alphabetically or numerically based
|
68
|
+
on their value or the value of one of their parts.
|
69
|
+
|
70
|
+
NOTE: For many use cases you're better off using native tools like
|
71
|
+
`/bin/sort` because they are faster and already do what you
|
72
|
+
need.
|
73
|
+
|
74
|
+
You can sort simple inputs
|
75
|
+
|
76
|
+
$ cat input
|
77
|
+
1 apple
|
78
|
+
2 banana
|
79
|
+
3 cat
|
80
|
+
4 banana
|
81
|
+
...
|
82
|
+
|
83
|
+
$ cat input | wu-local sort --on=2
|
84
|
+
1 apple
|
85
|
+
2 banana
|
86
|
+
4 banana
|
87
|
+
3 cat
|
88
|
+
...
|
89
|
+
|
90
|
+
as well as complicated ones
|
91
|
+
|
92
|
+
$ cat input
|
93
|
+
{"id": 1, "word": "apple" }
|
94
|
+
{"id": 2, "word": "cat" }
|
95
|
+
{"id": 3, "word": "banana"}
|
96
|
+
...
|
97
|
+
|
98
|
+
$ cat input | wu-local sort --on=word
|
99
|
+
{"id": 1, "word": "apple" }
|
100
|
+
{"id": 3, "word": "banana"}
|
101
|
+
{"id": 2, "word": "cat" }
|
102
|
+
...
|
103
|
+
|
104
|
+
You can also sort in --reverse or using --numeric order instead of
|
105
|
+
lexical.
|
106
|
+
|
107
|
+
The sort widget is useful for modeling Hadoop jobs, but don't
|
108
|
+
forget that [Hadoop does its own
|
109
|
+
sorting](http://hadoop.apache.org/docs/r0.20.2/mapred_tutorial.html#Sort),
|
110
|
+
so the sort widget doesn't belong in your map/reduce jobs.
|
111
|
+
|
112
|
+
This processor will not produce any output till it has received all
|
113
|
+
its input records.
|
114
|
+
EOF
|
115
|
+
|
116
|
+
include DynamicGet
|
117
|
+
field :on, Whatever, :doc => "Part of the record to sort on"
|
118
|
+
field :reverse, :boolean, :default => false, :doc => "Sort in reverse order"
|
119
|
+
field :numeric, :boolean, :default => false, :doc => "Sort numerically instead of lexically"
|
120
|
+
|
121
|
+
# Runs the parent's setup and then starts with an empty buffer in
# which every incoming record is kept until it is time to sort.
def setup
  super
  @records = []
end
|
126
|
+
|
127
|
+
# Gives every record one and the same key, so that all of the input
# lands in a single group which can then be sorted as a whole.
#
# @param [Object] record ignored
# @return [Symbol] always `:__first_group__`
def get_key(record)
  :__first_group__
end
|
134
|
+
|
135
|
+
# Appends `record` to the buffer of records awaiting the sort.
#
# @param [Object] record
def accumulate(record)
  @records.push(record)
end
|
141
|
+
|
142
|
+
# Orders all buffered records with `compare`, flips the result when
# the `reverse` field is set, and yields the records one at a time.
# The buffer itself is left untouched.
#
# @yield [record] each record, in its final sort order
# @yieldparam [Object] record
def finalize
  ordered = @records.sort { |left, right| compare(left, right) }
  ordered = ordered.reverse if reverse
  ordered.each { |record| yield record }
end
|
152
|
+
|
153
|
+
# Looks up, via `get`, the part of `record` named by the `on` field.
#
# @param [Object] record
# @return [Object] the part of the record to sort on
def sortable(record)
  part = on
  get(part, record)
end
|
160
|
+
|
161
|
+
# Compares records `x` and `y` by their sortable parts.
#
# A record whose sortable part is missing (nil or false) orders
# before one that has a sortable part; two such records compare as
# equal, which keeps the comparison consistent whichever way round
# the pair is presented.
#
# When the `numeric` field is set both parts are converted with
# `to_f` before being compared.
#
# @param [Object] x
# @param [Object] y
# @return [1,0,-1,nil] as returned by `<=>`; nil when the two parts
#   cannot be compared with each other
def compare(x, y)
  a = sortable(x)
  b = sortable(y)
  return 0 if !a && !b
  return -1 unless a
  return 1 unless b
  a, b = a.to_f, b.to_f if numeric
  a <=> b
end
|
176
|
+
|
177
|
+
register
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require_relative("accumulator")
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
class Processor
|
5
|
+
|
6
|
+
# A processor which emits only unique records from its input.
|
7
|
+
# It's intended to work just like `uniq`.
|
8
|
+
#
|
9
|
+
# @example Emit unique elements from the input (like `uniq`).
|
10
|
+
#
|
11
|
+
# $ uniq input
|
12
|
+
# apple
|
13
|
+
# banana
|
14
|
+
# pear
|
15
|
+
# $ cat input | wu-local uniq
|
16
|
+
# apple
|
17
|
+
# banana
|
18
|
+
# pear
|
19
|
+
#
|
20
|
+
# @example Emit unique elements from the input with counts (like `uniq -c`).
|
21
|
+
#
|
22
|
+
# $ uniq -c input
|
23
|
+
# 3 apple
|
24
|
+
# 2 banana
|
25
|
+
# 3 pear
|
26
|
+
# $ cat input | wu-local uniq --count --to=tsv
|
27
|
+
# apple 3
|
28
|
+
# banana 5
|
29
|
+
# pear 8
|
30
|
+
|
31
|
+
class Uniq < Accumulator
|
32
|
+
|
33
|
+
field :count, :boolean, doc: "Emit a count for each group of input records", default: false
|
34
|
+
|
35
|
+
description <<EOF
|
36
|
+
This processor uniq's its inputs.
|
37
|
+
|
38
|
+
$ uniq input
|
39
|
+
apple
|
40
|
+
banana
|
41
|
+
pear
|
42
|
+
$ cat input | wu-local uniq
|
43
|
+
apple
|
44
|
+
banana
|
45
|
+
pear
|
46
|
+
|
47
|
+
And it can count as well:
|
48
|
+
|
49
|
+
$ uniq -c input
|
50
|
+
3 apple
|
51
|
+
2 banana
|
52
|
+
3 pear
|
53
|
+
$ cat input | wu-local uniq --count --to=tsv
|
54
|
+
apple 3
|
55
|
+
banana 5
|
56
|
+
pear 8
|
57
|
+
EOF
|
58
|
+
|
59
|
+
# The total size of the input records.
|
60
|
+
attr_accessor :size
|
61
|
+
|
62
|
+
# Runs the parent's setup and then sets the record count to zero.
def setup
  super
  @size = 0
end
|
67
|
+
|
68
|
+
# Counts `record` by adding one to `size`; the record itself is not
# stored.
#
# @param [Object] record
# @return [Integer] the new size
def accumulate(record)
  self.size = size + 1
end
|
74
|
+
|
75
|
+
# Emits the result for the group that just ended: the bare `key`, or
# a `[key, size]` pair when the `count` field is set.  Nothing is
# emitted while `key` still holds the `:__first_group__` placeholder.
#
# Within this class `size` is otherwise zeroed only in `setup`, so it
# is reset here once the group has been emitted; without that every
# count would also include the records of all earlier groups.
#
# @yield [result]
# @yieldparam [Object, Array] result the key, or `[key, size]`
def finalize
  return if key == :__first_group__
  yield(count ? [key, size] : key)
  self.size = 0
end
|
87
|
+
|
88
|
+
register
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -0,0 +1,317 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
SerializerError = Class.new(Error)
|
5
|
+
|
6
|
+
class Serializer < Processor
|
7
|
+
|
8
|
+
# Reports an error that occurred while handling `record`.
#
# An `Errno::EPIPE` is ignored without logging.  Any other error is
# logged at error level as "Class: message", followed by each line of
# its backtrace at debug level.
#
# @param [Object] record the record being processed (not used here)
# @param [Exception] err
def handle_error(record, err)
  return if err.is_a?(Errno::EPIPE)
  log.error "#{err.class}: #{err.message}"
  # An exception that was never raised has a nil backtrace.
  Array(err.backtrace).each { |line| log.debug(line) }
end
|
13
|
+
|
14
|
+
end
|
15
|
+
|
16
|
+
# A widget for serializing inputs to JSON.
|
17
|
+
#
|
18
|
+
# @example Serializing to JSON at the end of a data flow
|
19
|
+
#
|
20
|
+
# Wukong.dataflow(:emits_json) do
|
21
|
+
# ... | to_json
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# @see FromJson
|
25
|
+
class ToJson < Serializer
|
26
|
+
|
27
|
+
description <<EOF
|
28
|
+
Turns input records into JSON strings.
|
29
|
+
|
30
|
+
Pretty print input with the --pretty flag.
|
31
|
+
EOF
|
32
|
+
|
33
|
+
field :pretty, :boolean, default: false, :doc => "Pretty-print output"
|
34
|
+
|
35
|
+
# Serializes `record` to JSON and yields the resulting string.
#
# A non-Hash record that responds to `to_json` is serialized with its
# own `to_json`; anything else goes through `MultiJson.dump`, using the
# record's `to_wire` form when it has one.  The `pretty` field is
# passed along in both cases.  A nil record is rejected.  Any error,
# including one raised by the block, is given to `handle_error`.
#
# @param [Object] record
# @yield [json] the serialized json output
# @yieldparam [String] json
def process(record)
  raise SerializerError.new("Cannot serialize: <nil>") if record.nil?
  # Hashes always go through MultiJson, even when they respond to to_json.
  own_serializer = record.respond_to?(:to_json) && !record.is_a?(Hash)
  json =
    if own_serializer
      record.to_json(pretty: pretty)
    else
      MultiJson.dump(record.try(:to_wire) || record, pretty: pretty)
    end
  yield json
rescue => e
  handle_error(record, e)
end
|
51
|
+
register
|
52
|
+
end
|
53
|
+
|
54
|
+
# A widget for deserializing inputs from JSON.
|
55
|
+
#
|
56
|
+
# @example Deserializing from JSON at the beginning of a data flow
|
57
|
+
#
|
58
|
+
# Wukong.dataflow(:consumes_json) do
|
59
|
+
# from_json | ...
|
60
|
+
# end
|
61
|
+
#
|
62
|
+
# @see ToJson
|
63
|
+
class FromJson < Serializer
|
64
|
+
|
65
|
+
description <<EOF
|
66
|
+
Parse JSON input records into native Ruby objects.
|
67
|
+
|
68
|
+
$ cat input.json
|
69
|
+
{"hi": "there"}
|
70
|
+
$ cat input.json | wu-local from_json
|
71
|
+
{"hi"=>"there"}
|
72
|
+
EOF
|
73
|
+
|
74
|
+
# Deserializes `record` from JSON and yields the resulting object.
#
# A record that responds to `from_json` is asked to parse itself;
# otherwise it is handed to `MultiJson.load`.  Any error, including one
# raised by the block, is given to `handle_error`.
#
# @param [String] record the JSON text
# @yield [obj] the deserialized object
# @yieldparam [Object] obj
def process(record)
  obj = record.respond_to?(:from_json) ? record.from_json : MultiJson.load(record)
  yield obj
rescue => e
  handle_error(record, e)
end
|
89
|
+
register
|
90
|
+
end
|
91
|
+
|
92
|
+
# A widget for serializing inputs to TSV.
|
93
|
+
#
|
94
|
+
# @example Serializing to TSV at the end of a data flow
|
95
|
+
#
|
96
|
+
# Wukong.dataflow(:emits_tsv) do
|
97
|
+
# ... | to_tsv
|
98
|
+
# end
|
99
|
+
#
|
100
|
+
# @see FromTsv
|
101
|
+
class ToTsv < Serializer
|
102
|
+
# Serializes `record` as a line of tab-separated values and yields it.
#
# A record that responds to `to_tsv` serializes itself.  Otherwise the
# record (or its `to_wire` form, when it has one) must respond to
# `map`; its elements are turned into strings and joined with tabs.
# Any error, including one raised by the block, is given to
# `handle_error`.
#
# @param [Object] record
# @yield [tsv] the serialized TSV output
# @yieldparam [String] tsv
def process(record)
  tsv =
    if record.respond_to?(:to_tsv)
      record.to_tsv
    else
      fields = record.try(:to_wire) || record
      raise SerializerError.new("Record must be in Array format to be serialized as TSV") unless fields.respond_to?(:map)
      fields.map(&:to_s).join("\t")
    end
  yield tsv
rescue => e
  handle_error(record, e)
end
|
119
|
+
register
|
120
|
+
end
|
121
|
+
|
122
|
+
# A widget for deserializing inputs from TSV.
|
123
|
+
#
|
124
|
+
# @example Deserializing from TSV at the beginning of a data flow
|
125
|
+
#
|
126
|
+
# Wukong.dataflow(:consumes_tsv) do
|
127
|
+
# from_tsv | ...
|
128
|
+
# end
|
129
|
+
#
|
130
|
+
# @see ToTsv
|
131
|
+
class FromTsv < Serializer
|
132
|
+
# Deserializes `record` from tab-separated values and yields the
# result.
#
# A record that responds to `from_tsv` parses itself; otherwise it is
# split on tabs, keeping trailing empty fields.  Any error, including
# one raised by the block, is given to `handle_error`.
#
# @param [String] record the TSV line
# @yield [obj] the deserialized object
# @yieldparam [Object] obj
def process(record)
  obj = record.respond_to?(:from_tsv) ? record.from_tsv : record.split(/\t/, -1)
  yield obj
rescue => e
  handle_error(record, e)
end
|
147
|
+
register
|
148
|
+
end
|
149
|
+
|
150
|
+
# A widget for serializing inputs to CSV.
|
151
|
+
#
|
152
|
+
# @example Serializing to CSV at the end of a data flow
|
153
|
+
#
|
154
|
+
# Wukong.dataflow(:emits_csv) do
|
155
|
+
# ... | to_csv
|
156
|
+
# end
|
157
|
+
#
|
158
|
+
# @see FromCsv
|
159
|
+
class ToCsv < Serializer
|
160
|
+
# Serializes `record` as a line of comma-separated values and yields
# it.
#
# The elements of `record` are turned into strings and joined with
# commas; fields are not quoted or escaped.  A record that cannot be
# serialized this way is reported through `handle_error` and nothing
# is yielded for it.
#
# @param [#map] record
# @yield [csv] the serialized CSV output
# @yieldparam [String] csv
def process(record)
  begin
    csv = record.map(&:to_s).join(",")
  rescue => e
    # Report the failure instead of dropping the record without a trace.
    handle_error(record, e)
    return
  end
  yield csv
end
|
174
|
+
register
|
175
|
+
end
|
176
|
+
|
177
|
+
# A widget for deserializing inputs from CSV.
|
178
|
+
#
|
179
|
+
# @example Deserializing from CSV at the beginning of a data flow
|
180
|
+
#
|
181
|
+
# Wukong.dataflow(:consumes_csv) do
|
182
|
+
# from_csv | ...
|
183
|
+
# end
|
184
|
+
#
|
185
|
+
# @see ToCsv
|
186
|
+
class FromCsv < Serializer
|
187
|
+
# Deserializes `csv` from comma-separated values and yields the
# resulting fields.
#
# The input is simply split on every comma (quoted fields are not
# recognised).  Trailing empty fields are kept, so the number of
# fields does not depend on whether the last columns are empty.  Input
# that cannot be split is reported through `handle_error` and nothing
# is yielded for it.
#
# @param [String] csv
# @yield [obj] the deserialized object
# @yieldparam [Array<String>] obj
def process(csv)
  begin
    # A negative limit stops split from discarding trailing empty fields.
    record = csv.split(/,/, -1)
  rescue => e
    handle_error(csv, e)
    return
  end
  yield record
end
|
201
|
+
register
|
202
|
+
end
|
203
|
+
|
204
|
+
# A widget for serializing inputs to a delimited format.
|
205
|
+
#
|
206
|
+
# @example Serializing to a delimited format at the end of a data flow
|
207
|
+
#
|
208
|
+
# Wukong.dataflow(:emits_delimited) do
|
209
|
+
# ... | to_delimited(delimiter: "--")
|
210
|
+
# end
|
211
|
+
#
|
212
|
+
# @see FromDelimited
|
213
|
+
class ToDelimited < Serializer
|
214
|
+
field :delimiter, String, :default => "\t", :doc => "Delimiter to use between fields in a record"
|
215
|
+
# Serializes `record` in a delimited format and yields the result.
#
# The elements of `record` are turned into strings and joined with
# the `delimiter` field; fields are not quoted or escaped.  A record
# that cannot be serialized this way is reported through
# `handle_error` and nothing is yielded for it.
#
# @param [#map] record
# @yield [delimited] the serialized delimited output
# @yieldparam [String] delimited
def process(record)
  begin
    delimited = record.map(&:to_s).join(delimiter)
  rescue => e
    # Report the failure instead of dropping the record without a trace.
    handle_error(record, e)
    return
  end
  yield delimited
end
|
229
|
+
register
|
230
|
+
end
|
231
|
+
|
232
|
+
# A widget for deserializing inputs from a delimited format.
|
233
|
+
#
|
234
|
+
# @example Deserializing from a delimited format at the beginning of a data flow
|
235
|
+
#
|
236
|
+
# Wukong.dataflow(:consumes_delimited) do
|
237
|
+
# from_delimited(delimiter: "--") | ...
|
238
|
+
# end
|
239
|
+
#
|
240
|
+
# @see ToDelimited
|
241
|
+
class FromDelimited < Serializer
|
242
|
+
field :delimiter, String, :default => "\t", :doc => "Delimiter to use between fields in a record"
|
243
|
+
# Deserializes `delimited` from a delimited format and yields the
# resulting fields.
#
# The input is split on the `delimiter` field.  Trailing empty fields
# are kept, so the number of fields does not depend on whether the
# last columns are empty.  Input that cannot be split is reported
# through `handle_error` and nothing is yielded for it.
#
# @param [String] delimited
# @yield [obj] the deserialized object
# @yieldparam [Array<String>] obj
def process(delimited)
  begin
    # A negative limit stops split from discarding trailing empty fields.
    record = delimited.split(delimiter, -1)
  rescue => e
    handle_error(delimited, e)
    return
  end
  yield record
end
|
257
|
+
register
|
258
|
+
end
|
259
|
+
|
260
|
+
# A widget for serializing inputs to Ruby's `inspect` format.
|
261
|
+
#
|
262
|
+
# @example Serializing to Ruby's inspect format at the end of a data flow
|
263
|
+
#
|
264
|
+
# Wukong.dataflow(:emits_inspected) do
|
265
|
+
# ... | to_inspect
|
266
|
+
# end
|
267
|
+
class ToInspect < Serializer
|
268
|
+
# Yields the result of calling Ruby's `inspect` on `record`.
#
# @param [Object] record
# @yield [inspected]
# @yieldparam [String] inspected
def process(record)
  inspected = record.inspect
  yield inspected
end
|
276
|
+
register
|
277
|
+
end
|
278
|
+
|
279
|
+
# A widget for turning a record into an instance of some class.
|
280
|
+
# The class must provide a "class method" `receive` which accepts
|
281
|
+
# a Hash argument.
|
282
|
+
class Recordize < Serializer
|
283
|
+
field :model, Whatever, :doc => "Model class to turn records into"
|
284
|
+
|
285
|
+
# Turns `record` into a model instance and yields that instance.
#
# The record (or its `to_wire` form, when it has one) must be a Hash.
# The class chosen by `model_class_for` builds the instance through
# its `receive` class method.  When no class can be determined an
# error is logged and nothing is yielded.  Any error, including one
# raised by the block, is given to `handle_error`.
#
# @param [Hash, #to_wire] record
# @yield [instance]
# @yieldparam [Object] instance whatever the class's `receive` returned
def process(record)
  attributes = record.try(:to_wire) || record
  unless attributes.is_a?(Hash)
    raise SerializerError.new("Can only recordize a Hash-like record")
  end
  if (klass = model_class_for(attributes))
    yield klass.receive(attributes)
  else
    log.error("No default model class and no explicit model for: #{attributes.inspect}")
  end
rescue => e
  handle_error(record, e)
end
|
302
|
+
|
303
|
+
# Picks the class that `record` should be turned into.
#
# A `_type` entry in the record (symbol or string key) is tried first
# and resolved with `constantize`; if that raises a NameError a
# warning is logged and the lookup falls through to the `model` field.
#
# NOTE(review): the class name comes straight from the record; if
# records can come from untrusted input, consider restricting which
# names may be resolved here.
#
# @param [Hash] record
# @return [Object, nil] the resolved class, else the `model` field
#   when it is set, else nil
def model_class_for(record)
  explicit_type = record[:_type] || record["_type"]
  if explicit_type
    begin
      return explicit_type.constantize
    rescue NameError
      log.warn("Could not find a class for <#{explicit_type}>")
    end
  end
  model if model
end
|
313
|
+
|
314
|
+
register
|
315
|
+
end
|
316
|
+
end
|
317
|
+
end
|