wukong 3.0.0.pre → 3.0.0.pre2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
data/Rakefile
CHANGED
@@ -1,12 +1,28 @@
|
|
1
|
-
require 'bundler'
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
2
3
|
|
3
4
|
require 'rspec/core/rake_task'
|
5
|
+
RSpec::Core::RakeTask.new(:specs)
|
6
|
+
|
4
7
|
require 'yard'
|
8
|
+
YARD::Rake::YardocTask.new
|
5
9
|
|
6
|
-
RSpec
|
7
|
-
|
10
|
+
desc 'Run RSpec with code coverage'
|
11
|
+
task :cov do
|
12
|
+
ENV['WUKONG_COV'] = true
|
13
|
+
Rake::Task[:specs].execute
|
8
14
|
end
|
9
15
|
|
10
|
-
YARD::Rake::YardocTask.new
|
11
|
-
|
12
16
|
task :default => :specs
|
17
|
+
|
18
|
+
desc "Create a TAGS file for this project"
|
19
|
+
task :tags do
|
20
|
+
files = [%w[Gemfile Guardfile Rakefile README.md].map { |b| File.join(File.dirname(__FILE__), b) }]
|
21
|
+
%w[bin examples lib spec].each do |dir|
|
22
|
+
files << Dir[File.join(File.dirname(__FILE__), "#{dir}/**/*.rb")]
|
23
|
+
end
|
24
|
+
files.each do |arry|
|
25
|
+
sh "etags", '-a', *arry unless arry.empty?
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
data/TODO.md
CHANGED
@@ -4,21 +4,21 @@
|
|
4
4
|
- hooks up source to flow,
|
5
5
|
- if iterated source, drives it, otherwise sits in the flow
|
6
6
|
|
7
|
-
* these set the contract for the inbound
|
7
|
+
* these set the contract for the inbound products
|
8
8
|
|
9
9
|
### slots
|
10
10
|
|
11
11
|
Typical case: one input, `:input`, one output `:output`
|
12
12
|
|
13
|
-
* there are as many
|
13
|
+
* there are as many products as
|
14
14
|
- the total number of action stage outputs
|
15
|
-
- the concrete input
|
15
|
+
- the concrete input products
|
16
16
|
* The number of rsrc->action edges is at most the total number of input slots
|
17
|
-
- (you cannot wire multiple
|
17
|
+
- (you cannot wire multiple products to the same input slot)
|
18
18
|
|
19
19
|
|
20
20
|
1. action stage B wires up to an action stage A (which really means "the full set of A's outputs")
|
21
|
-
2. I wire action A's output as production
|
21
|
+
2. I wire action A's output as production product X
|
22
22
|
3.
|
23
23
|
|
24
24
|
4. How do I address other stages?
|
@@ -71,7 +71,7 @@ __________________________________________________________________________
|
|
71
71
|
| foo |
|
72
72
|
----------
|
73
73
|
|
74
|
-
create a
|
74
|
+
create a product with no action? action with anonymous product, wired up later?
|
75
75
|
|
76
76
|
|
77
77
|
* connections:
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding:UTF-8
|
3
|
+
|
4
|
+
if ARGV.include?('--help')
|
5
|
+
puts <<USAGE
|
6
|
+
wu-clean-encoding cleans malformed characters from stdin.
|
7
|
+
|
8
|
+
If a character is malformed, as defined by valid_encoding?,
|
9
|
+
it is replaced with a '�'.
|
10
|
+
|
11
|
+
wu-clean-encoding was built to work with UTF-8, and no
|
12
|
+
guarantees are provided for other encodings.
|
13
|
+
USAGE
|
14
|
+
exit(0)
|
15
|
+
end
|
16
|
+
|
17
|
+
ARGF.each do |line|
|
18
|
+
if line.valid_encoding?
|
19
|
+
$stdout.write line
|
20
|
+
else
|
21
|
+
repaired_line = []
|
22
|
+
line.each_char do |char|
|
23
|
+
if char.valid_encoding?
|
24
|
+
repaired_line << char
|
25
|
+
else
|
26
|
+
repaired_line << "�"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
$stdout.write repaired_line.join
|
30
|
+
end
|
31
|
+
end
|
data/bin/wu-lign
CHANGED
@@ -161,8 +161,8 @@ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type,
|
|
161
161
|
case type
|
162
162
|
when :mixed, nil then lambda{|s| "%-#{width}s" % s }
|
163
163
|
when :str then lambda{|s| "%-#{width}s" % s }
|
164
|
-
when :int then lambda{|s| "%#{width}d" % s.gsub(
|
165
|
-
when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.to_f }
|
164
|
+
when :int then lambda{|s| "%#{width}d" % s.gsub(/[^\d\-\+]+/, "").to_i }
|
165
|
+
when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.gsub(/[^\d\.eE\-\+]+/, "").to_f }
|
166
166
|
else raise "oops type #{type}" end
|
167
167
|
end
|
168
168
|
|
data/bin/wu-local
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
settings = Wukong::Local::Configuration
|
6
|
+
settings.use(:commandline)
|
7
|
+
|
8
|
+
def settings.usage
|
9
|
+
"usage: #{File.basename($0)} PROCESSOR|FLOW [ --param=value | -p value | --param | -p]"
|
10
|
+
end
|
11
|
+
|
12
|
+
settings.description = <<-EOF
|
13
|
+
wu-local is a tool for running Wukong processors and flows locally on
|
14
|
+
the command-line. Use wu-local by passing it a processor and feeding
|
15
|
+
in some data:
|
16
|
+
|
17
|
+
$ echo 'UNIX is Clever and Fun...' | wu-local tokenizer.rb
|
18
|
+
UNIX
|
19
|
+
is
|
20
|
+
Clever
|
21
|
+
and
|
22
|
+
Fun
|
23
|
+
|
24
|
+
If your processors have named fields you can pass them in as
|
25
|
+
arguments:
|
26
|
+
|
27
|
+
$ echo 'UNIX is clever and fun...' | wu-local tokenizer.rb --min_length=4
|
28
|
+
UNIX
|
29
|
+
Clever
|
30
|
+
|
31
|
+
You can chain processors and calls to wu-local together:
|
32
|
+
|
33
|
+
$ echo 'UNIX is clever and fun...' | wu-local tokenizer.rb --min_length=4 | wu-local downcaser.rb
|
34
|
+
unix
|
35
|
+
clever
|
36
|
+
|
37
|
+
Which is a good way to develop a combined data flow which you can
|
38
|
+
again test locally:
|
39
|
+
|
40
|
+
$ echo 'UNIX is clever and fun...' | wu-local tokenize_and_downcase_big_words.rb
|
41
|
+
unix
|
42
|
+
clever
|
43
|
+
EOF
|
44
|
+
|
45
|
+
settings.define :run, description: "Name of the processor or dataflow to use. Defaults to basename of the given path.", flag: 'r'
|
46
|
+
|
47
|
+
require 'wukong/boot' ; Wukong.boot!(settings)
|
48
|
+
|
49
|
+
thing = settings.rest.first
|
50
|
+
case
|
51
|
+
when thing.nil?
|
52
|
+
settings.dump_help
|
53
|
+
exit(1)
|
54
|
+
when Wukong.registry.registered?(thing.to_sym)
|
55
|
+
processor = thing.to_sym
|
56
|
+
when File.exist?(thing)
|
57
|
+
load thing
|
58
|
+
processor = settings.run || File.basename(thing, '.rb')
|
59
|
+
else
|
60
|
+
settings.dump_help
|
61
|
+
exit(2)
|
62
|
+
end
|
63
|
+
# p settings
|
64
|
+
begin
|
65
|
+
Wukong::LocalDriver.run(processor.to_sym, settings)
|
66
|
+
rescue Wukong::Error => e
|
67
|
+
$stderr.puts e.message
|
68
|
+
exit(3)
|
69
|
+
end
|
data/bin/wu-server
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'configliere'
|
3
|
+
require 'wukong'
|
4
|
+
require 'eventmachine'
|
5
|
+
require 'em-synchrony'
|
6
|
+
require 'multi_json'
|
7
|
+
|
8
|
+
Settings({
|
9
|
+
host: "localhost",
|
10
|
+
port: 9500
|
11
|
+
})
|
12
|
+
|
13
|
+
Settings.use :commandline
|
14
|
+
Settings.resolve!
|
15
|
+
|
16
|
+
# Load the file on the command line
|
17
|
+
wu_file = ARGV.shift
|
18
|
+
load wu_file
|
19
|
+
$processor = File.basename(wu_file, '.rb').to_sym
|
20
|
+
|
21
|
+
class Wukong::Server
|
22
|
+
def prepare(options = {})
|
23
|
+
dataflow_class = Wukong.dataflow(:server){ send(options[:processor]) }
|
24
|
+
flow_builder = Wukong.registry.retrieve(:server)
|
25
|
+
flow = flow_builder.build(processor: $processor)
|
26
|
+
|
27
|
+
@buffer = []
|
28
|
+
@processor = flow.stages.values.first
|
29
|
+
@processor.emitter = ->(value){ @buffer << value }
|
30
|
+
end
|
31
|
+
|
32
|
+
def process(record)
|
33
|
+
@process.process(record)
|
34
|
+
end
|
35
|
+
|
36
|
+
def cleanup
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
# EventMachine server
|
42
|
+
|
43
|
+
class WukongMachine < EM::Protocols::LineAndTextProtocol
|
44
|
+
def post_init
|
45
|
+
puts "[server] Client connected"
|
46
|
+
@wukong = WukongInterface.new
|
47
|
+
@wukong.prepare(processor: $processor)
|
48
|
+
end
|
49
|
+
|
50
|
+
def receive_data(data)
|
51
|
+
@buffer = []
|
52
|
+
input = MultiJson.load data
|
53
|
+
|
54
|
+
op = proc { @wukong.process(input) }
|
55
|
+
callback = proc { send_data MultiJson.dump(@buffer) + "\n" }
|
56
|
+
EM.defer(op, callback)
|
57
|
+
|
58
|
+
rescue MultiJson::DecodeError => ex
|
59
|
+
STDERR.puts "[server] Dropped: Malformed request"
|
60
|
+
end
|
61
|
+
|
62
|
+
def unbind
|
63
|
+
puts "[server] Client disconnected."
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
EM::run {
|
68
|
+
EM::start_server(Settings[:host], Settings[:port], WukongMachine)
|
69
|
+
puts "Listening on #{Settings[:host]}:#{Settings[:port]}"
|
70
|
+
}
|
data/examples/Gemfile
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
source "https://rubygems.org" # explicit HTTPS URL; the :rubygems symbol is deprecated by Bundler
|
2
|
+
|
3
|
+
gem "configliere", '~> 0.4'
|
4
|
+
gem "multi_json", '>= 1.3.6'
|
5
|
+
gem "vayacondios-client", '>= 0.0.3'
|
6
|
+
gem "gorillib", '>= 0.4.2'
|
7
|
+
gem "uuidtools"
|
8
|
+
gem "eventmachine"
|
9
|
+
gem "log4r"
|
10
|
+
|
11
|
+
group :examples do
|
12
|
+
gem "forgery"
|
13
|
+
gem "nokogiri"
|
14
|
+
# gem "sanitize"
|
15
|
+
gem "addressable"
|
16
|
+
# gem "forgery" -- already listed earlier in this group; Bundler warns on duplicate entries
|
17
|
+
gem "crack"
|
18
|
+
gem "oj"
|
19
|
+
gem "activesupport"
|
20
|
+
end
|
21
|
+
|
22
|
+
group :development do
|
23
|
+
gem "bundler", '~> 1.1'
|
24
|
+
gem "rake", '>= 0.9'
|
25
|
+
gem "rspec", '>= 2.8'
|
26
|
+
gem "guard", '>= 1.0'
|
27
|
+
gem "guard-rspec", '>= 0.6'
|
28
|
+
gem "simplecov", '>= 0.5'
|
29
|
+
gem "pry"
|
30
|
+
gem "ap"
|
31
|
+
end
|
32
|
+
|
33
|
+
group :docs do
|
34
|
+
gem "yard"
|
35
|
+
gem "redcarpet"
|
36
|
+
gem "addressable"
|
37
|
+
gem "htmlentities"
|
38
|
+
end
|
data/examples/README.md
ADDED
@@ -1,23 +1,30 @@
|
|
1
|
+
#
|
2
|
+
# Parses logs in either the [Apache Common Log Format](http://en.wikipedia.org/wiki/Common_Log_Format)
|
3
|
+
# or [Apache Combined Log Format](http://httpd.apache.org/docs/2.2/logs.html#combined)
|
4
|
+
#
|
5
|
+
# Common: `%h %l %u %t "%r" %>s %b`
|
6
|
+
# Combined: `%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"`
|
7
|
+
#
|
1
8
|
class ApacheLogLine
|
2
9
|
include Gorillib::Model
|
3
10
|
|
4
|
-
field :
|
5
|
-
field :
|
6
|
-
field :
|
11
|
+
field :client, Hostname
|
12
|
+
field :rfc_1413, String
|
13
|
+
field :userid, String
|
7
14
|
field :log_timestamp, Time
|
8
15
|
field :http_method, String
|
9
|
-
field :
|
16
|
+
field :rsrc, String
|
10
17
|
field :protocol, String
|
11
18
|
field :response_code, Integer
|
12
19
|
field :size, Integer
|
13
|
-
field :referer, String
|
14
|
-
field :user_agent, String
|
20
|
+
# field :referer, String
|
21
|
+
# field :user_agent, String
|
15
22
|
|
16
23
|
def page_type
|
17
24
|
case
|
18
|
-
when
|
19
|
-
when
|
20
|
-
when
|
25
|
+
when rsrc =~ /\.(css|js)$/ then :asset
|
26
|
+
when rsrc =~ /\.(png|gif|ico)$/ then :image
|
27
|
+
when rsrc =~ /\.(pl|s?html?|asp|jsp|cgi)$/ then :page
|
21
28
|
else :other
|
22
29
|
end
|
23
30
|
end
|
@@ -25,37 +32,69 @@ class ApacheLogLine
|
|
25
32
|
#
|
26
33
|
# Regular expression to parse an apache log line.
|
27
34
|
#
|
35
|
+
# local - - [24/Oct/1994:13:43:13 -0600] "GET index.html HTTP/1.0" 200 3185
|
28
36
|
# 83.240.154.3 - - [07/Jun/2008:20:37:11 +0000] "GET /faq HTTP/1.1" 200 569 "http://infochimps.org/search?query=CAC" "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
37
|
+
# whidbey.whidbey.com - - [04/Sep/1995:00:30:18 -0400] "GET /pub/sshay/images/btthumb.jpg" 200 4624
|
38
|
+
# jgbustam-ppp.clark.net - - [04/Sep/1995:00:00:28 -0400] "GET /pub/jgbustam/famosos/alpha.html HTTP/1.0" 304 -
|
29
39
|
#
|
30
|
-
|
31
|
-
(\S+) #
|
32
|
-
\s(\S+) #
|
33
|
-
\s(\S+) #
|
34
|
-
|
40
|
+
COMMON_LOG_RE = Regexp.compile(%r{\A
|
41
|
+
(\S+) # client 83.240.154.3
|
42
|
+
\s(\S+) # rfc_1413 -
|
43
|
+
\s(\S+) # userid -
|
44
|
+
\s\[([\w\:\+\-\ \/]+)\] # date part [07/Jun/2008:20:37:11 +0000]
|
35
45
|
\s\"(?:(\S+) # http_method "GET
|
36
|
-
\s(
|
37
|
-
|
38
|
-
\s(\d
|
39
|
-
\s(\d
|
40
|
-
|
41
|
-
|
42
|
-
\z}x)
|
46
|
+
\s(.+?) # rsrc /faq
|
47
|
+
(?:\s(HTTP/\d+\.\d+))?|-)\" # protocol HTTP/1.1"
|
48
|
+
\s(\d+|-) # response_code 200
|
49
|
+
\s(\d+|-) # size 569
|
50
|
+
\z
|
51
|
+
}x)
|
43
52
|
|
44
|
-
|
53
|
+
COMBINED_LOG_RE = Regexp.compile(%r{\A
|
54
|
+
(\S+) # client 83.240.154.3
|
55
|
+
\s(\S+) # rfc_1413 -
|
56
|
+
\s(\S+) # userid -
|
57
|
+
\s\[([\w\:\+\-\ \/]+)\] # date part [07/Jun/2008:20:37:11 +0000]
|
58
|
+
\s\"(?:(\S+) # http_method "GET
|
59
|
+
\s([^\"]+?) # rsrc /faq
|
60
|
+
(?:\s(HTTP/\d+\.\d+))?|-)\" # protocol HTTP/1.1"
|
61
|
+
\s(\d+|-) # response_code 200
|
62
|
+
\s(\d+|-) # size 569
|
63
|
+
(?:\s\"([^\"]*)\") # referer "http://infochimps.org/search?query=CAC"
|
64
|
+
(?:\s\"([^\"]*)\") # ua "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.16) Gecko/2009120208 Firefox/3.0.16"
|
65
|
+
\z
|
66
|
+
}x)
|
67
|
+
|
68
|
+
# LOG_RE = Regexp.compile(%r{\A(\S+)\s})
|
69
|
+
|
70
|
+
MONTHS = { 'Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5, 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10, 'Nov' => 11, 'Dec' => 12, }
|
45
71
|
|
46
72
|
# Converts a time like `10/Apr/2007:10:58:27 +0300` to something parseable
|
47
73
|
def receive_log_timestamp(raw_ts)
|
74
|
+
return super(nil) if raw_ts.nil?
|
48
75
|
match = %r{(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)\s([\+\-\w]+)}.match(raw_ts)
|
49
|
-
warn "Can't parse date #{raw_ts}"
|
76
|
+
unless match then warn "Can't parse date #{raw_ts}" ; return super(nil) ; end
|
77
|
+
#
|
50
78
|
day, month_name, year, hour, min, sec, tz = match.captures
|
51
79
|
month = MONTHS[month_name]
|
52
|
-
|
80
|
+
tz.insert(3, ':') # -0600 to -06:00
|
81
|
+
#
|
82
|
+
# super "#{year}-#{month}-#{day} #{hour}:#{min}:#{sec} #{tz}"
|
83
|
+
super Time.new(year.to_i, month, day.to_i, hour.to_i, min.to_i, sec.to_i, tz)
|
84
|
+
end
|
85
|
+
|
86
|
+
# @return [String, nil] the log_timestamp in the common log format (nil when the timestamp is blank)
|
87
|
+
def unparsed_log_timestamp
|
88
|
+
return if log_timestamp.blank?
|
89
|
+
log_timestamp.strftime("%d/%b/%Y:%H:%M:%S %z")
|
53
90
|
end
|
54
91
|
|
55
92
|
# Use the regex to break line into fields
|
56
93
|
# Emit each record as flat line
|
57
94
|
def self.make(line)
|
58
|
-
m =
|
95
|
+
m = COMMON_LOG_RE.match(line) or return
|
59
96
|
from_tuple *m.captures
|
97
|
+
rescue ArgumentError => err
|
98
|
+
raise unless err.message =~ /invalid byte sequence in UTF-8/
|
60
99
|
end
|
61
100
|
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'wukong/widget/many_to_many'
|
2
|
+
require 'gorillib/enumerable/sum'
|
3
|
+
|
4
|
+
#
|
5
|
+
# An example dataflow -- builds the Fibonacci series by summing the values fed back from the previous outputs
|
6
|
+
#
|
7
|
+
|
8
|
+
Wukong.processor(:delay_buffer) do
|
9
|
+
attr_accessor :queue
|
10
|
+
field :delay, Integer, position: 0, doc: "number of records to hold in buffer"
|
11
|
+
|
12
|
+
def process(rec)
|
13
|
+
queue << rec
|
14
|
+
emit(next_item) if ready?
|
15
|
+
end
|
16
|
+
|
17
|
+
def next_item
|
18
|
+
queue.shift
|
19
|
+
end
|
20
|
+
|
21
|
+
# true if there are records at the end of the delay stage
|
22
|
+
def ready?
|
23
|
+
warn "Hmm, too many records in queue: #{queue}" if queue.size > delay+1
|
24
|
+
queue.size > delay
|
25
|
+
end
|
26
|
+
|
27
|
+
# resets to an empty state
|
28
|
+
def setup(*)
|
29
|
+
super
|
30
|
+
@queue = Array.new
|
31
|
+
end
|
32
|
+
|
33
|
+
# emits all remaining elements of the queue
|
34
|
+
def stop
|
35
|
+
queue.each{|rec| emit(rec) }
|
36
|
+
super
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
class Wukong::Batcher < Wukong::Processor
|
41
|
+
register_action
|
42
|
+
include Hanuman::Slottable
|
43
|
+
include Hanuman::OutputSlotted
|
44
|
+
|
45
|
+
attr_accessor :queues
|
46
|
+
consume :n_1, Integer, doc: "n-1'th value: the one just emitted"
|
47
|
+
consume :tictoc, Integer, doc: "input to drive flow"
|
48
|
+
consume :n_2, Integer, doc: "n-2'nd value: the one before the one just emitted"
|
49
|
+
|
50
|
+
# resets to an empty state, calls super
|
51
|
+
def initialize(*)
|
52
|
+
super
|
53
|
+
@queues = Hash.new{|h,k| h[k] = Array.new } # autovivifying
|
54
|
+
end
|
55
|
+
|
56
|
+
def process_input(channel, rec)
|
57
|
+
queues[channel] << rec
|
58
|
+
emit(next_item) if ready?
|
59
|
+
end
|
60
|
+
|
61
|
+
def next_item
|
62
|
+
queues.map{|_, queue| queue.shift }
|
63
|
+
end
|
64
|
+
|
65
|
+
# true if there is at least one record in each queue
|
66
|
+
def ready?
|
67
|
+
inslots.values.all?{|inslot| queues[inslot.name].length > 0 }
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
Wukong.chain(:fibonacci_series) do
|
72
|
+
|
73
|
+
delay_buffer(1, name: :my_delay)
|
74
|
+
|
75
|
+
# * I don't want to have to name everything
|
76
|
+
# - are few/some/most things named?
|
77
|
+
# * I must be able to have the same stage type on a graph more than once
|
78
|
+
# * If naming things is a general case, I want it to
|
79
|
+
# - be clean, and for it to
|
80
|
+
# - not cause a ruckus when stage type has its own args
|
81
|
+
#
|
82
|
+
|
83
|
+
batcher(name: :feedback) >
|
84
|
+
map(name: :summer, &:sum) >
|
85
|
+
many_to_many(name: :fibonacci_n)
|
86
|
+
|
87
|
+
spew(6, item: 0, name: :ticker) > feedback.tictoc
|
88
|
+
|
89
|
+
fibonacci_n > feedback.n_1
|
90
|
+
fibonacci_n > output
|
91
|
+
fibonacci_n > :delay > feedback.n_2
|
92
|
+
|
93
|
+
# preload the feedback buffer
|
94
|
+
feedback.n_1.process(0)
|
95
|
+
feedback.n_2.process(0)
|
96
|
+
feedback.n_2.process(1)
|
97
|
+
end
|
98
|
+
|
99
|
+
# Wukong.dataflow(:dump) do
|
100
|
+
# stdout << Wukong.dataflow(:fibonacci_series).out
|
101
|
+
# end
|