ul-wukong 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +60 -0
- data/.gitmodules +6 -0
- data/.rspec +2 -0
- data/.travis.yml +19 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +7 -0
- data/Gemfile +17 -0
- data/Guardfile +12 -0
- data/LICENSE.md +95 -0
- data/NOTES-travis.md +31 -0
- data/README-old.md +422 -0
- data/README.md +1308 -0
- data/Rakefile +28 -0
- data/TODO.md +99 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +6 -0
- data/bin/md5sort +20 -0
- data/bin/setcat +11 -0
- data/bin/tabchar +5 -0
- data/bin/uniq-ord +59 -0
- data/bin/uniqc +3 -0
- data/bin/wu +34 -0
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-date +13 -0
- data/bin/wu-datetime +13 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +186 -0
- data/bin/wu-local +4 -0
- data/bin/wu-plus +9 -0
- data/bin/wu-source +5 -0
- data/bin/wu-sum +31 -0
- data/diagrams/wu_local.dot +39 -0
- data/diagrams/wu_local.dot.png +0 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/basic/string_reverser.rb +23 -0
- data/examples/basic/tiny_count.rb +8 -0
- data/examples/basic/word_count/accumulator.rb +26 -0
- data/examples/basic/word_count/tokenizer.rb +13 -0
- data/examples/basic/word_count/word_count.rb +6 -0
- data/examples/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/deploy_pack/Gemfile +6 -0
- data/examples/deploy_pack/README.md +6 -0
- data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
- data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
- data/examples/deploy_pack/config/environment.rb +1 -0
- data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
- data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
- data/examples/dsl/dataflow/simple.rb +12 -0
- data/examples/dsl/dataflow/telegram.rb +45 -0
- data/examples/dsl/workflow/cherry_pie.dot +97 -0
- data/examples/dsl/workflow/cherry_pie.md +104 -0
- data/examples/dsl/workflow/cherry_pie.png +0 -0
- data/examples/dsl/workflow/cherry_pie.rb +101 -0
- data/examples/empty/.gitkeep +0 -0
- data/examples/examples_helper.rb +9 -0
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/implied_geolocation/README.md +63 -0
- data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
- data/examples/improver/tweet_summary.rb +73 -0
- data/examples/loadable.rb +2 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/examples/munging/airline_flights/airplane.rb +0 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/indexable.rb +75 -0
- data/examples/munging/airline_flights/indexable_spec.rb +90 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +107 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/rake_helper.rb +97 -0
- data/examples/ruby_project/Gemfile +6 -0
- data/examples/ruby_project/README.md +6 -0
- data/examples/ruby_project/a/b/c/.gitkeep +0 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/serverlogs/models/logline.rb +102 -0
- data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
- data/examples/serverlogs/visit_paths/common.rb +4 -0
- data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
- data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
- data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
- data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
- data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
- data/examples/splitter.rb +94 -0
- data/examples/string_reverser.rb +7 -0
- data/examples/text/pig_latin/pig_latinizer.rb +35 -0
- data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/twitter.rb +5 -0
- data/lib/hanuman.rb +36 -0
- data/lib/hanuman/graph.rb +97 -0
- data/lib/hanuman/graphvizzer.rb +206 -0
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +128 -0
- data/lib/hanuman/tree.rb +67 -0
- data/lib/wu/geo.rb +4 -0
- data/lib/wu/geo/geo_grids.numbers +0 -0
- data/lib/wu/geo/geolocated.rb +331 -0
- data/lib/wu/geo/quadtile.rb +69 -0
- data/lib/wu/graph/union_find.rb +62 -0
- data/lib/wu/model/reconcilable.rb +63 -0
- data/lib/wu/munging.rb +71 -0
- data/lib/wu/social/models/twitter.rb +31 -0
- data/lib/wu/wikipedia/models.rb +20 -0
- data/lib/wukong.rb +54 -0
- data/lib/wukong/dataflow.rb +43 -0
- data/lib/wukong/doc_helpers.rb +14 -0
- data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
- data/lib/wukong/doc_helpers/field_handler.rb +91 -0
- data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
- data/lib/wukong/driver.rb +214 -0
- data/lib/wukong/driver/event_machine_driver.rb +15 -0
- data/lib/wukong/driver/wiring.rb +68 -0
- data/lib/wukong/local.rb +42 -0
- data/lib/wukong/local/runner.rb +96 -0
- data/lib/wukong/local/stdio_driver.rb +104 -0
- data/lib/wukong/logger.rb +102 -0
- data/lib/wukong/model/faker.rb +136 -0
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/plugin.rb +48 -0
- data/lib/wukong/processor.rb +110 -0
- data/lib/wukong/rake_helper.rb +6 -0
- data/lib/wukong/runner.rb +169 -0
- data/lib/wukong/runner/boot_sequence.rb +123 -0
- data/lib/wukong/runner/code_loader.rb +52 -0
- data/lib/wukong/runner/command_runner.rb +44 -0
- data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
- data/lib/wukong/runner/help_message.rb +42 -0
- data/lib/wukong/source.rb +33 -0
- data/lib/wukong/source/source_driver.rb +74 -0
- data/lib/wukong/source/source_runner.rb +38 -0
- data/lib/wukong/spec_helpers.rb +74 -0
- data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
- data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
- data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
- data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
- data/lib/wukong/version.rb +3 -0
- data/lib/wukong/widget/echo.rb +55 -0
- data/lib/wukong/widget/extract.rb +122 -0
- data/lib/wukong/widget/filters.rb +452 -0
- data/lib/wukong/widget/logger.rb +56 -0
- data/lib/wukong/widget/operators.rb +82 -0
- data/lib/wukong/widget/reducers.rb +10 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +368 -0
- data/lib/wukong/widget/reducers/count.rb +73 -0
- data/lib/wukong/widget/reducers/group.rb +128 -0
- data/lib/wukong/widget/reducers/group_concat.rb +98 -0
- data/lib/wukong/widget/reducers/improver.rb +71 -0
- data/lib/wukong/widget/reducers/join_xml.rb +37 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +180 -0
- data/lib/wukong/widget/reducers/uniq.rb +91 -0
- data/lib/wukong/widget/serializers.rb +317 -0
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +7 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
- data/spec/examples/dataflow/parsing_spec.rb +14 -0
- data/spec/examples/dataflow/simple_spec.rb +34 -0
- data/spec/examples/dataflow/telegram_spec.rb +43 -0
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +18 -0
- data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
- data/spec/hanuman/graph_spec.rb +119 -0
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +81 -0
- data/spec/hanuman/tree_spec.rb +119 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +43 -0
- data/spec/support/example_test_helpers.rb +95 -0
- data/spec/support/hanuman_test_helpers.rb +92 -0
- data/spec/support/integration_helper.rb +38 -0
- data/spec/support/model_test_helpers.rb +115 -0
- data/spec/support/shared_context_for_graphs.rb +57 -0
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +94 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/wu/model/reconcilable_spec.rb +152 -0
- data/spec/wukong/dataflow_spec.rb +87 -0
- data/spec/wukong/driver_spec.rb +154 -0
- data/spec/wukong/local/runner_spec.rb +29 -0
- data/spec/wukong/local/stdio_driver_spec.rb +73 -0
- data/spec/wukong/local_spec.rb +6 -0
- data/spec/wukong/logger_spec.rb +49 -0
- data/spec/wukong/model/faker_spec.rb +132 -0
- data/spec/wukong/processor_spec.rb +21 -0
- data/spec/wukong/runner_spec.rb +132 -0
- data/spec/wukong/source_spec.rb +6 -0
- data/spec/wukong/widget/extract_spec.rb +101 -0
- data/spec/wukong/widget/filters_spec.rb +79 -0
- data/spec/wukong/widget/logger_spec.rb +23 -0
- data/spec/wukong/widget/operators_spec.rb +25 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +21 -0
- data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
- data/spec/wukong/widget/serializers_spec.rb +114 -0
- data/spec/wukong/widget/sink_spec.rb +19 -0
- data/spec/wukong/widget/source_spec.rb +65 -0
- data/spec/wukong/wu-local_spec.rb +109 -0
- data/spec/wukong/wu-source_spec.rb +32 -0
- data/spec/wukong/wu_spec.rb +14 -0
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +35 -0
- metadata +465 -0
data/bin/wu-local
ADDED
data/bin/wu-plus
ADDED
data/bin/wu-source
ADDED
data/bin/wu-sum
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wukong/streamer/summing_reducer'
|
5
|
+
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# Reducer used by the wu-sum script. It configures which columns are summed,
# keeps the fields handed to #start! as a template line, and on #finalize
# writes the totals back into that template before yielding it.
class Summer < Wukong::Streamer::SummingReducer
  # The argument list most recently passed to #start!; reused as the output row.
  attr_accessor :sample_line

  # Marks column 0 as the only summed column, then runs the parent initializer
  # with the same arguments.
  def initialize(*args)
    self.summing_elements = [0]
    super(*args)
  end

  # Stores the incoming fields as the sample line before delegating to the
  # parent implementation.
  def start!(*args)
    self.sample_line = args
    super(*args)
  end

  # The grouping key is the pair of fields at positions 2 and 3.
  def get_key(*fields)
    fields.values_at(2, 3)
  end

  # Replaces every summed column of the sample line with its total and yields
  # the resulting line. (`sums` is not defined in this class; presumably it is
  # supplied by the parent reducer -- confirm.)
  def finalize
    summing_elements.each do |column|
      sample_line[column] = sums[column]
    end
    yield sample_line
  end
end
|
30
|
+
|
31
|
+
Wukong::Script.new(Summer, nil).run
|
@@ -0,0 +1,39 @@
|
|
1
|
+
digraph WuLocalControlFlow {
|
2
|
+
size ="100,100";
|
3
|
+
stdin [label=<
|
4
|
+
<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="4">
|
5
|
+
<TR><TD><FONT FACE="BOLD" POINT-SIZE="20">STDIN</FONT></TD></TR>
|
6
|
+
<TR><TD><FONT POINT-SIZE="10">The line of input text</FONT></TD></TR>
|
7
|
+
<TR><TD><FONT FACE="MONOSPACE">Shall I compare thee to a summers day?<BR/>Thou art more lovely and more temperate<BR/>...</FONT></TD></TR>
|
8
|
+
|
9
|
+
</TABLE>>];
|
10
|
+
command [shape=diamond,label=<
|
11
|
+
<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="4">
|
12
|
+
<TR><TD><FONT FACE="BOLD" POINT-SIZE="20">Command</FONT></TD></TR>
|
13
|
+
<TR><TD><FONT POINT-SIZE="10">A UNIX process launched on the command line</FONT></TD></TR>
|
14
|
+
<TR><TD><FONT FACE="MONOSPACE">wu-local word_counter</FONT></TD></TR>
|
15
|
+
</TABLE>>];
|
16
|
+
runner [shape=box,label=<
|
17
|
+
<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="4">
|
18
|
+
<TR><TD><FONT FACE="BOLD" POINT-SIZE="20">Runner</FONT></TD></TR>
|
19
|
+
<TR><TD><FONT POINT-SIZE="10">Loads plugins and code, configures and resolves settings, boots plugins, validates command line, then runs.</FONT></TD></TR>
|
20
|
+
<TR><TD><FONT FACE="MONOSPACE">Wukong::Local::LocalRunner</FONT></TD></TR>
|
21
|
+
</TABLE>>];
|
22
|
+
driver [shape=box,label=<
|
23
|
+
<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="4">
|
24
|
+
<TR><TD><FONT FACE="BOLD" POINT-SIZE="20">Driver</FONT></TD></TR>
|
25
|
+
<TR><TD><FONT POINT-SIZE="10">Passes input to processor, handles output</FONT></TD></TR>
|
26
|
+
<TR><TD><FONT FACE="MONOSPACE">Wukong::Local::StdioDriver</FONT></TD></TR>
|
27
|
+
</TABLE>>];
|
28
|
+
stdout [label=<
|
29
|
+
<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="4">
|
30
|
+
<TR><TD><FONT FACE="BOLD" POINT-SIZE="20">STDOUT</FONT></TD></TR>
|
31
|
+
<TR><TD><FONT POINT-SIZE="10">The resulting output lines</FONT></TD></TR>
|
32
|
+
<TR><TD><FONT FACE="MONOSPACE">8<BR/>7<BR/>...</FONT></TD></TR>
|
33
|
+
</TABLE>>];
|
34
|
+
|
35
|
+
command -> runner [label="Implemented By"];
|
36
|
+
runner -> driver [label="Instantiates"];
|
37
|
+
stdin -> driver [label="Reads 1 Line"];
|
38
|
+
driver -> stdout [label="Writes N Lines"];
|
39
|
+
}
|
Binary file
|
data/examples/Gemfile
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Gem dependencies for the examples directory.
#
# The gem source is given as an explicit HTTPS URL; the `:rubygems` symbol
# shorthand is deprecated by Bundler.
source 'https://rubygems.org'

gem "configliere", '~> 0.4'
gem "multi_json",  '>= 1.3.6'
gem "gorillib",    '>= 0.4.2'
gem "uuidtools"
gem "eventmachine"
gem "log4r"

# Libraries used by the example scripts. Each gem is listed once per group
# ("forgery" was previously declared twice here).
group :examples do
  gem "forgery"
  gem "nokogiri"
  gem "sanitize"
  gem "addressable"
  gem "crack"
  gem "oj"
  gem "activesupport"
end

# Tooling for running specs and working on the code.
group :development do
  gem "bundler",     '~> 1.1'
  gem "rake",        '>= 0.9'
  gem "rspec",       '>= 2.8'
  gem "guard",       '>= 1.0'
  gem "guard-rspec", '>= 0.6'
  gem "simplecov",   '>= 0.5'
  gem "pry"
  gem "ap"
  gem "ruby-progressbar"
end

# Documentation generation.
group :docs do
  gem "yard"
  gem "redcarpet"
  gem "addressable"
  gem "htmlentities"
end
|
data/examples/README.md
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# Example processor that reverses every record it receives and logs at each
# lifecycle hook.
Wukong.processor(:string_reverser) do

  # Logs entry, zeroes the processed-record counter, and registers an
  # EventMachine periodic timer (interval 10) that sends the current count
  # through notify under the 'metrics' topic.
  def setup
    log.info("Inside the setup method")
    @count = 0
    EM.add_periodic_timer(10) do
      notify('metrics', count: @count)
    end
  end

  # Counts the record, then yields its reverse followed by an explicit nil.
  def process(record)
    @count += 1
    yield(record.reverse)
    yield(nil)
  end

  # Only logs; nothing is emitted at the end of the flow.
  def finalize
    log.info("Finalizing flow")
  end

  # Only logs.
  def stop
    log.info("Inside the stop method")
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Sums the counts of consecutive "word<TAB>count" records that share the same
# word and yields one "word<TAB>total" line each time the word changes, plus a
# final line from #finalize.
Wukong.processor(:accumulator) do
  # count   - running total for the word currently being accumulated
  # current - the word currently being accumulated (nil when idle)
  attr_accessor :count, :current

  # Returns the processor to its idle state.
  def reset!()
    @current = nil
    @count   = 0
  end

  # Yields "current<TAB>count" when a word is in progress, then resets.
  def report_then_reset!(&blk)
    yield [current, count].join("\t") unless current.nil?
    reset!
  end

  # Adds `seen` to the running total, adopting `word` when idle.
  #
  # @count stays nil until the first reset!. A first record whose word is nil
  # (e.g. a blank input line) matches the idle `current` and therefore skips
  # the reset in #process, so the total is coerced with to_i rather than
  # assuming it has been initialized.
  def accumulate(word, seen)
    @current = word if @current.nil?
    @count = @count.to_i + seen
  end

  # Splits the record on the first tab-separated pair, reports the previous
  # word if this one differs, then accumulates the new count.
  def process(pair, &blk)
    word, seen = pair.split("\t")
    report_then_reset!(&blk) unless word == current
    accumulate(word, seen.to_i)
  end

  # Flushes whatever word is still in progress.
  def finalize(&blk)
    report_then_reset!(&blk)
  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Breaks each record into lowercase tokens and yields a "token<TAB>1" line for
# every token that is at least min_length characters long.
Wukong.processor(:tokenizer) do

  field :min_length, Integer, :default => 1

  # Tokens are the pieces left after splitting the downcased, stripped record
  # on single non-word characters, so runs of punctuation produce empty
  # strings; the length filter is what discards them at the default setting.
  def process(record)
    tokens = record.downcase.strip.split(/\W/)
    kept   = tokens.reject { |token| token.length < min_length }
    kept.each { |token| yield [ token, 1 ].join("\t") }
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'wukong/widgets/sinks/hbase_record_sink.rb'
|
2
|
+
|
3
|
+
# Sketch of a scraping flow: a tailed source feeds a retrying requester, whose
# output goes through the tw_parse processor into an HBase record sink.
Wukong.chain(:friend_graph) do
  # Source stage named :scrapables, configured with a dated path pattern.
  tail(:scrapables) do
    directory 'scrapables/ids-%{t:ymd}.tsv'
  end

  # Requester stage built from 'tw_requester.rb'; declares one typed input,
  # one typed output and a :request_types setting.
  requester = decorator('tw_requester.rb') do
    input  :scrape_url,       Url
    output :raw_json_request, JsonString
    config do
      define :request_types, :default => [:follower_ids, :friend_ids], :doc => 'which requests to make: follower_ids, user_timeline, etc'
    end
  end

  # Wraps the requester with the retry settings given below.
  retriable_requester = retriable do
    with :timeouts => [1,2,3]
    on_failure :sleep
    guest requester
  end

  tail(:scrapables)> retriable_requester > processor('tw_parse.rb') > hbase_record_sink
end

# Placeholder parser. The body is intentionally empty, but the method accepts
# the record so that it matches the process(record) signature used by the
# other processors instead of raising ArgumentError when handed a record.
Wukong.processor(:tw_parse) do
  def process(record)
  end
end
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
require_relative("../app/processors/string_reverser.rb")
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'wukong/widget/many_to_many'
|
2
|
+
require 'gorillib/enumerable/sum'
|
3
|
+
|
4
|
+
#
|
5
|
+
# An example dataflow -- a Fibonacci series generated through a feedback loop
|
6
|
+
#
|
7
|
+
|
8
|
+
# Holds back records in a FIFO queue and only releases the oldest one once
# more than `delay` records are waiting.
Wukong.processor(:delay_buffer) do
  attr_accessor :queue
  field :delay, Integer, position: 0, doc: "number of records to hold in buffer"

  # Starts from an empty queue (after the parent setup has run).
  def setup(*)
    super
    @queue = []
  end

  # Enqueues the record; emits the oldest queued record when the buffer is full.
  def process(rec)
    queue << rec
    return unless ready?
    emit(next_item)
  end

  # Removes and returns the oldest queued record.
  def next_item
    queue.shift
  end

  # True when the queue holds more than `delay` records. Warns when it holds
  # more than one record beyond that limit.
  def ready?
    backlog = queue.size
    warn "Hmm, too many records in queue: #{queue}" if backlog > delay+1
    backlog > delay
  end

  # Emits every record still queued, oldest first, then runs the parent stop.
  def stop
    queue.each { |held| emit(held) }
    super
  end
end
|
39
|
+
|
40
|
+
# Collects records per input channel and emits one batch (the head of every
# channel's queue) as soon as each declared input slot has something waiting.
class Wukong::Batcher < Wukong::Processor
  register_action
  include Hanuman::Slottable
  include Hanuman::OutputSlotted

  # Hash of channel => Array of pending records.
  attr_accessor :queues
  consume :n_1,    Integer, doc: "n-1'th value: the one just emitted"
  consume :tictoc, Integer, doc: "input to drive flow"
  consume :n_2,    Integer, doc: "n-2'nd value: the one before the one just emitted"

  # Runs the parent initializer, then starts with no pending records. The hash
  # creates an empty queue the first time a channel is looked up.
  def initialize(*)
    super
    @queues = Hash.new { |table, channel| table[channel] = [] }
  end

  # Queues the record on its channel and emits a batch if one is complete.
  def process_input(channel, rec)
    queues[channel] << rec
    return unless ready?
    emit(next_item)
  end

  # Takes the oldest record off every known queue, in the hash's key order.
  def next_item
    queues.values.map(&:shift)
  end

  # True when every input slot's queue is non-empty.
  def ready?
    inslots.values.all? { |inslot| !queues[inslot.name].empty? }
  end
end
|
70
|
+
|
71
|
+
# Wires a batcher, a summing map and a fan-out stage into a loop whose output
# is fed back into the batcher's n_1 and n_2 inputs.
Wukong.chain(:fibonacci_series) do

  delay_buffer(1, name: :my_delay)

  # Open design questions about stage naming:
  # * I don't want to have to name everything
  #   - are few/some/most things named?
  # * I must be able to have the same stage type on a graph more than once
  # * If naming things is a general case, I want it to
  #   - be clean, and for it to
  #   - not cause a ruckus when stage type has its own args
  #

  # Main path: batch the inputs, sum each batch, fan the result out.
  batcher(name: :feedback) > map(name: :summer, &:sum) > many_to_many(name: :fibonacci_n)

  # Ticker input for the batcher's tictoc slot.
  spew(6, item: 0, name: :ticker) > feedback.tictoc

  fibonacci_n > feedback.n_1
  fibonacci_n > output
  # NOTE(review): the buffer above is named :my_delay but this routes through
  # :delay -- confirm how the chain resolves the symbol.
  fibonacci_n > :delay > feedback.n_2

  # preload the feedback buffer
  feedback.n_1.process(0)
  [0, 1].each { |seed| feedback.n_2.process(seed) }
end
|
98
|
+
|
99
|
+
# Wukong.dataflow(:dump) do
|
100
|
+
# stdout << Wukong.dataflow(:fibonacci_series).out
|
101
|
+
# end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'wukong/widgets/sinks/hbase_record_sink.rb'
|
2
|
+
|
3
|
+
# Sketch of a scraping flow: a tailed source feeds a retrying requester, whose
# output goes through the tw_parse processor into an HBase record sink.
Wukong.chain(:friend_graph) do
  # Source stage named :scrapables, configured with a dated path pattern.
  tail(:scrapables) do
    directory 'scrapables/ids-%{t:ymd}.tsv'
  end

  # Requester stage built from 'tw_requester.rb'; declares one typed input,
  # one typed output and a :request_types setting.
  requester = decorator('tw_requester.rb') do
    input  :scrape_url,       Url
    output :raw_json_request, JsonString
    config do
      define :request_types, :default => [:follower_ids, :friend_ids], :doc => 'which requests to make: follower_ids, user_timeline, etc'
    end
  end

  # Wraps the requester with the retry settings given below.
  retriable_requester = retriable do
    with :timeouts => [1,2,3]
    on_failure :sleep
    guest requester
  end

  tail(:scrapables)> retriable_requester > processor('tw_parse.rb') > hbase_record_sink
end

# Placeholder parser. The body is intentionally empty, but the method accepts
# the record so that it matches the process(record) signature used by the
# other processors instead of raising ArgumentError when handed a record.
Wukong.processor(:tw_parse) do
  def process(record)
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require File.expand_path('../examples_helper', File.dirname(__FILE__))
|
2
|
+
|
3
|
+
# Minimal dataflow: file source -> reverse each string -> file sink.
Wukong.dataflow(:simple) do
  doc <<-DOC
    A stupidly simple dataflow: reverses each input string
  DOC

  # Resolve both endpoints up front so the pipeline reads as a single line.
  source_path = Pathname.path_to(:data, 'text/jabberwocky.txt')
  sink_path   = Pathname.path_to(:tmp, 'dataflow/simple_output.rb')

  file_source(source_path) > map { |text| text.reverse } > file_sink(sink_path)

end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require File.expand_path('../examples_helper', File.dirname(__FILE__))
|
2
|
+
|
3
|
+
# "Telegram Problem"
|
4
|
+
#
|
5
|
+
# The "Telegram Problem", originally described by Peter Naur:
|
6
|
+
# * accepts lines of text
|
7
|
+
# * generates output lines that are shorter than a given length (or contain only one word)
|
8
|
+
# * without splitting any of the words in the text (if a word is longer than the line width, emit it on its own line).
|
9
|
+
|
10
|
+
# Re-flows a stream of words into lines no longer than break_length, never
# splitting a word. An empty-string word acts as a paragraph break.
Wukong.processor(:recompose) do
  field :break_length, Integer
  # The line being built. Every word is appended with a leading space, so a
  # non-empty line always starts with one extra space (removed in #flush!).
  attr_accessor :line

  def initialize(*)
    super
    @line = ""
  end

  # Handles one word:
  # * "" flushes the pending line and emits an empty line;
  # * otherwise the pending line is flushed first if adding the word would
  #   exceed break_length, and the word is either emitted alone (when it is at
  #   least break_length long) or appended to the pending line.
  def process(word)
    if word == ""
      flush!
      emit("")
      return
    end

    candidate = "#{line} #{word}".lstrip
    flush! if candidate.length > break_length

    if word.length < break_length
      line << " " << word
    else
      emit word
    end
  end

  # Emits the pending line without its leading space (if there is any content)
  # and starts a fresh one.
  def flush!
    emit line[1..-1] unless line.blank?
    self.line = ""
  end

  # Emits whatever is still pending.
  def stop
    flush!
  end
end
|
35
|
+
|
36
|
+
# Telegram dataflow: reads a text file, splits each line into words (a blank
# line becomes a single empty word), flattens the word lists, re-flows them at
# 80 characters with recompose, and writes the result to a file.
ExampleUniverse.dataflow(:telegram) do
  input  :default, file_source(Pathname.path_to(:data, 'text/rectification_of_names.txt'))
  output :dump,    file_sink(  Pathname.path_to(:tmp,  'output/dataflow/telegram/names.txt'))

  input(:default) >
    map { |text|
      if text.blank? then [""] else text.strip.split(/\s+/m) end
    } >
    flatten >
    recompose(:break_length => 80) >
    output(:dump)
end
|