wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,108 @@
|
|
1
|
+
module Wukong
  module SpecHelpers
    # This module defines methods to be included into the
    # Wukong::Processor class.
    #
    # The `given*` methods accumulate input records on the processor and
    # return the processor itself so calls can be chained; the `*output`
    # methods run the processor over those records via a SpecDriver and
    # return what it emitted.
    module ProcessorSpecMethods

      # An array of accumulated records to process come match-time.
      #
      # Lazily initialized, so this is an empty Array (never nil) when
      # no records have been given yet.
      #
      # @return [Array]
      def given_records
        @given_records ||= []
      end

      # Give a collection of records to the processor.
      #
      # @param [Array] records
      # @return [self] the receiver, for chaining
      def given *records
        given_records.concat(records)
        self # for chaining
      end

      # Give a collection of records to the processor but turn each
      # to JSON first.
      #
      # @param [Array] records
      # @return [self] the receiver, for chaining
      def given_json *records
        given(*records.map { |record| MultiJson.dump(record) })
      end

      # Give a collection of records to the processor but join each
      # in a delimited format first.
      #
      # Each record is given separately (one delimited String per
      # record), mirroring how #given_json gives one JSON String per
      # record.
      #
      # @param [String] delimiter placed between the fields of a record
      # @param [Array<Array>] records each record is a collection of fields
      # @return [self] the receiver, for chaining
      def given_delimited delimiter, *records
        lines = records.map { |record| record.map(&:to_s).join(delimiter) }
        given(*lines)
      end

      # Give a collection of records to the processor but join each
      # in TSV format first.
      #
      # @param [Array<Array>] records
      # @return [self] the receiver, for chaining
      def given_tsv *records
        given_delimited("\t", *records)
      end

      # Give a collection of records to the processor but join each
      # in CSV format first.
      #
      # NOTE(review): fields are joined with a bare comma; no quoting or
      # escaping is applied.
      #
      # @param [Array<Array>] records
      # @return [self] the receiver, for chaining
      def given_csv *records
        given_delimited(",", *records)
      end

      # Return the output of the processor on the given records.
      #
      # Calling this method, like passing the processor to an `emit`
      # matcher, will trigger processing of all the given records.
      #
      # Returns a SpecDriver, which is a subclass of array, so the
      # usual matchers like `include` and so on should work, as well
      # as explicitly indexing to introspect on particular records.
      #
      # @return [SpecDriver]
      def output
        SpecDriver.new(self).run
      end

      # Return the output of the processor on the given records,
      # splitting each output record on the given `delimiter` first.
      #
      # @param [String] delimiter
      # @see #output
      # @return [Array<Array<String>>] one array of fields per output record
      def delimited_output(delimiter)
        output.map { |record| record.split(delimiter) }
      end

      # Return the output of the processor on the given records,
      # parsing as TSV first.
      #
      # @see #output
      # @see #delimited_output
      # @return [Array<Array<String>>]
      def tsv_output
        delimited_output("\t")
      end

      # Return the output of the processor on the given records,
      # parsing as CSV first.
      #
      # @see #output
      # @see #delimited_output
      # @return [Array<Array<String>>]
      def csv_output
        delimited_output(",")
      end

      # Return the output of the processor on the given records,
      # parsing each output record as JSON first.
      #
      # @see #output
      # @return [Array] one parsed value per output record
      def json_output
        output.map { |record| MultiJson.load(record) }
      end

    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# Shared examples for anything that claims to be a Wukong processor.
#
# Pass the registered name via the `:named` option.  When it is
# present, examples are defined checking that the name can be
# retrieved from `Wukong.registry` and that a processor created under
# that name responds to each of the methods listed below.  When it is
# absent, a warning is printed and no examples are defined.
shared_examples_for 'a processor' do |options={}|
  processor_name = options[:named]
  if processor_name
    it "is registered with the name '#{processor_name}'" do
      Wukong.registry.retrieve(processor_name.to_sym).should_not be_nil
    end
    # One example per method a processor is expected to respond to.
    [:setup, :process, :finalize, :stop, :notify].each do |hook|
      it{ create_processor(processor_name).should respond_to(hook) }
    end
  else
    warn "Must supply a name for a processor you want to test"
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Wukong
  module SpecHelpers
    # An Array subclass that drives a processor through its given
    # records and collects every record the processor yields into
    # itself.
    class SpecDriver < Array

      # The processor being driven.
      attr_reader :processor

      # @param [Object] processor must respond to `given_records`,
      #   `process`, `finalize` and `stop` for #run to work
      def initialize processor
        super()
        @processor = processor
      end

      # Feed each of the processor's given records to `process`, then
      # call `finalize` and `stop`, appending everything yielded along
      # the way to this driver.
      #
      # A processor that was never given any records (its
      # `given_records` is nil) is treated as having no input: only
      # `finalize` and `stop` are called.
      #
      # @return [SpecDriver, false] self, or false when there is no
      #   processor to drive
      def run
        return false unless processor
        inputs = processor.given_records || []
        inputs.each do |input|
          processor.process(input) { |emitted| self << emitted }
        end
        processor.finalize { |emitted| self << emitted }
        processor.stop
        self
      end

    end
  end
end
|
@@ -0,0 +1,195 @@
|
|
1
|
+
require_relative('spec_driver')
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
module SpecHelpers
|
5
|
+
|
6
|
+
module SpecMatchers
|
7
|
+
|
8
|
+
def emit *expected
|
9
|
+
EmitMatcher.new(*expected)
|
10
|
+
end
|
11
|
+
|
12
|
+
def emit_json *expected
|
13
|
+
JsonMatcher.new(*expected)
|
14
|
+
end
|
15
|
+
|
16
|
+
def emit_delimited delimiter, *expected
|
17
|
+
DelimiterMatcher.new(delimiter, *expected)
|
18
|
+
end
|
19
|
+
|
20
|
+
def emit_tsv *expected
|
21
|
+
TsvMatcher.new(*expected)
|
22
|
+
end
|
23
|
+
|
24
|
+
def emit_csv *expected
|
25
|
+
CsvMatcher.new(*expected)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
class EmitMatcher
|
30
|
+
|
31
|
+
attr_accessor :driver, :expected, :reason, :expected_record, :actual_record, :mismatched_index
|
32
|
+
|
33
|
+
def matches?(processor)
|
34
|
+
self.driver = SpecDriver.new(processor)
|
35
|
+
driver.run
|
36
|
+
if actual_size != expected_size
|
37
|
+
self.reason = :size
|
38
|
+
return false
|
39
|
+
end
|
40
|
+
return true if just_count?
|
41
|
+
expected.each_with_index do |expectation, index|
|
42
|
+
actual = output[index]
|
43
|
+
if actual != expectation
|
44
|
+
self.reason = :element
|
45
|
+
self.expected_record = expectation
|
46
|
+
self.actual_record = actual
|
47
|
+
self.mismatched_index = index
|
48
|
+
return false
|
49
|
+
end
|
50
|
+
end
|
51
|
+
true
|
52
|
+
end
|
53
|
+
|
54
|
+
def initialize *expected
|
55
|
+
self.expected = expected
|
56
|
+
end
|
57
|
+
|
58
|
+
def failure_message
|
59
|
+
if reason == :size
|
60
|
+
"Expected #{expected_size} records, got #{actual_size}:\n\n#{pretty_output}"
|
61
|
+
else
|
62
|
+
"Expected the #{ordinalize(mismatched_index)} record to be#{parse_modifier}\n\n#{expected_record}\n\nbut got\n\n#{pretty_output}"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def negative_failure_message
|
67
|
+
if reason == :size
|
68
|
+
"Expected to NOT get #{expected_size} records:\n\n#{output}"
|
69
|
+
else
|
70
|
+
"Expected the #{ordinalize(mismatched_index)} record to NOT be#{parse_modifier}\n\n#{pretty_output}"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def records
|
75
|
+
@just_count = true
|
76
|
+
self # chaining
|
77
|
+
end
|
78
|
+
alias_method :record, :records
|
79
|
+
|
80
|
+
private
|
81
|
+
|
82
|
+
def just_count?
|
83
|
+
@just_count
|
84
|
+
end
|
85
|
+
|
86
|
+
def actual_size
|
87
|
+
driver.size
|
88
|
+
end
|
89
|
+
|
90
|
+
def expected_size
|
91
|
+
just_count? ? expected.first.to_i : expected.size
|
92
|
+
end
|
93
|
+
|
94
|
+
def output
|
95
|
+
driver
|
96
|
+
end
|
97
|
+
|
98
|
+
def parse_modifier
|
99
|
+
end
|
100
|
+
|
101
|
+
def pretty_output
|
102
|
+
[].tap do |pretty|
|
103
|
+
output.each_with_index do |record, index|
|
104
|
+
s = (record.is_a?(String) ? record : record.inspect)
|
105
|
+
prefix = case
|
106
|
+
when output.size > 1 && index == mismatched_index
|
107
|
+
" => "
|
108
|
+
when output.size > 1
|
109
|
+
" "
|
110
|
+
else
|
111
|
+
''
|
112
|
+
end
|
113
|
+
pretty << [prefix,s].join('')
|
114
|
+
end
|
115
|
+
end.join("\n")
|
116
|
+
end
|
117
|
+
|
118
|
+
# http://stackoverflow.com/questions/1081926/how-do-i-format-a-date-in-ruby-to-include-rd-as-in-3rd
|
119
|
+
def ordinalize array_index
|
120
|
+
n = array_index + 1
|
121
|
+
if (11..13).include?(n % 100)
|
122
|
+
"#{n}th"
|
123
|
+
else
|
124
|
+
case n % 10
|
125
|
+
when 1; "#{n}st"
|
126
|
+
when 2; "#{n}nd"
|
127
|
+
when 3; "#{n}rd"
|
128
|
+
else "#{n}th"
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
# An EmitMatcher whose #output is every driver record parsed as JSON.
class JsonMatcher < EmitMatcher
  # Parses each record from the driver with MultiJson.
  #
  # @return [Array] the parsed records
  # @raise [Error] if any record cannot be parsed
  def output
    driver.map do |raw|
      begin
        MultiJson.load(raw)
      rescue
        raise Error, "Could not parse output of processor as JSON: \n\n#{raw}"
      end
    end
  end

  # Text spliced into failure messages to note the JSON parsing step.
  #
  # @return [String]
  def parse_modifier
    ' (after parsing as JSON)'
  end
end
|
148
|
+
|
149
|
+
# An EmitMatcher whose #output is every driver record split into
# fields on a delimiter.
class DelimitedMatcher < EmitMatcher

  # The delimiter each record is split on.
  attr_accessor :delimiter

  # @param [String] delimiter what to split each record on
  # @param [Array] expected forwarded unchanged to EmitMatcher
  def initialize(delimiter, *expected)
    self.delimiter = delimiter
    super(*expected)
  end

  # Converts each record from the driver to a String and splits it
  # on the delimiter.
  #
  # @return [Array<Array<String>>] one array of fields per record
  # @raise [Error] if a record cannot be split
  def output
    driver.map do |record|
      begin
        record.to_s.split(delimiter)
      rescue
        # #delimited_type supplies its own quoting where needed, so
        # nothing extra is added around it here.
        raise Error.new("Could not parse as #{delimited_type}: \n\n#{record}")
      end
    end
  end

  # Human-readable name of the format, used in messages.
  #
  # @return [String]
  def delimited_type
    "'#{delimiter}-delimited'"
  end

  # Text spliced into failure messages to note the splitting step.
  #
  # @return [String]
  def parse_modifier
    " (after parsing as #{delimited_type})"
  end
end
|
176
|
+
|
177
|
+
# A DelimitedMatcher fixed to a tab delimiter.
class TsvMatcher < DelimitedMatcher
  # @param [Array] expected forwarded to DelimitedMatcher after the
  #   tab delimiter
  def initialize(*expected)
    super("\t", *expected)
  end

  # @return [String] the format name used in messages
  def delimited_type
    "TSV"
  end
end
|
185
|
+
|
186
|
+
# A DelimitedMatcher fixed to a comma delimiter.
class CsvMatcher < DelimitedMatcher
  # @param [Array] expected forwarded to DelimitedMatcher after the
  #   comma delimiter
  def initialize(*expected)
    super(",", *expected)
  end

  # @return [String] the format name used in messages
  def delimited_type
    "CSV"
  end
end
|
194
|
+
end
|
195
|
+
end
|
data/lib/wukong/version.rb
CHANGED
@@ -0,0 +1,311 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# A widget which filters input records according to some
|
5
|
+
# criterion.
|
6
|
+
class Filter < Processor

  # Yields `record` when #select? accepts it; does nothing otherwise.
  #
  # @param [Object] record an input record
  # @yield [record] only for records that pass the filter
  # @yieldparam [Object] record
  # @see #select?
  # @see #reject?
  def process(record)
    return unless select?(record)

    yield record
  end

  # Decides whether `record` passes.  This base implementation
  # accepts everything; subclasses override it.
  #
  # @param [Object] record
  # @return [true, false]
  # @see #reject?
  def select?(record)
    true
  end

  # The logical inverse of #select?.
  #
  # @param [Object] record
  # @return [true, false]
  # @see #select?
  def reject?(record)
    !select?(record)
  end

  register
end
|
40
|
+
|
41
|
+
# A widget which passes all records, i.e. - it acts just like
|
42
|
+
# `cat`.
|
43
|
+
#
|
44
|
+
# @example Pass all records unmodified on the command line
|
45
|
+
#
|
46
|
+
# $ cat input
|
47
|
+
# 1
|
48
|
+
# 2
|
49
|
+
# 3
|
50
|
+
# $ cat input | wu-local identity
|
51
|
+
# 1
|
52
|
+
# 2
|
53
|
+
# 3
|
54
|
+
#
|
55
|
+
# @example Pass all records unmodified in a dataflow
|
56
|
+
#
|
57
|
+
# Wukong.dataflow(:uses_identity) do
|
58
|
+
# ... | identity | ...
|
59
|
+
# end
|
60
|
+
#
|
61
|
+
# @see Filter
|
62
|
+
# @see Null
|
63
|
+
class Identity < Filter
  # No overrides: the inherited Filter#select? returns true for
  # every record, so every record is yielded.
  register
end
|
66
|
+
|
67
|
+
# A widget which doesn't pass any records, i.e. - it acts just
|
68
|
+
# like <tt>/dev/null</tt>.
|
69
|
+
#
|
70
|
+
# @example Filter all records on the command line
|
71
|
+
#
|
72
|
+
# $ cat input
|
73
|
+
# 1
|
74
|
+
# 2
|
75
|
+
# 3
|
76
|
+
# $ cat input | wu-local null
|
77
|
+
#
|
78
|
+
# @example Filter all records from a dataflow
|
79
|
+
#
|
80
|
+
# Wukong.dataflow(:uses_null) do
|
81
|
+
# ... | null | ...
|
82
|
+
# end
|
83
|
+
#
|
84
|
+
# @see Filter
|
85
|
+
# @see Identity
|
86
|
+
class Null < Filter
  # Rejects every record unconditionally, so nothing is ever
  # yielded by #process.
  #
  # @param [Object] record ignored
  # @return [false]
  def select?(record)
    false
  end

  register
end
|
97
|
+
|
98
|
+
# A widget which only passes records if they match a regular
|
99
|
+
# expression.
|
100
|
+
#
|
101
|
+
# @example Passing records which match a given expression on the command-line
|
102
|
+
#
|
103
|
+
# $ cat input
|
104
|
+
# apple
|
105
|
+
# banana
|
106
|
+
# cat
|
107
|
+
# $ cat input | wu-local regexp --match='^a'
|
108
|
+
# apple
|
109
|
+
#
|
110
|
+
# @example Passing records which match a given expression in a dataflow
|
111
|
+
#
|
112
|
+
# Wukong.dataflow(:uses_regexp) do
|
113
|
+
# ... | regexp(match: /^a/) | ...
|
114
|
+
# end
|
115
|
+
#
|
116
|
+
# @see Filter
|
117
|
+
# @see NotRegexpFilter
|
118
|
+
class RegexpFilter < Filter

  # The regular expression to use to match records.
  field :match, Regexp

  # Selects a `record` only if its String form matches this
  # widget's `match` field.  When no `match` is set, every record
  # is selected.
  #
  # @param [Object] record compared via its #to_s
  # @return [true, false]
  def select?(record)
    return true unless match

    # `=~` yields an Integer index or nil; coerce so the result is
    # the strict boolean the documented return type describes.
    !!(match =~ record.to_s)
  end

  register(:regexp)
end
|
134
|
+
|
135
|
+
# A widget which only passes records if they *don't* match a
|
136
|
+
# regular expression.
|
137
|
+
#
|
138
|
+
# @example Passing records which don't match a given expression on the command-line
|
139
|
+
#
|
140
|
+
# $ cat input
|
141
|
+
# apple
|
142
|
+
# banana
|
143
|
+
# cat
|
144
|
+
# $ cat input | wu-local not_regexp --match='^a'
|
145
|
+
# banana
|
146
|
+
# cat
|
147
|
+
#
|
148
|
+
# @example Passing records which don't match a given expression in a dataflow
|
149
|
+
#
|
150
|
+
# Wukong.dataflow(:uses_not_regexp) do
|
151
|
+
# ... | not_regexp(match: /^a/) | ...
|
152
|
+
# end
|
153
|
+
#
|
154
|
+
# @see Filter
|
155
|
+
# @see RegexpFilter
|
156
|
+
class NotRegexpFilter < RegexpFilter
  # Selects a `record` only if its String form does NOT match this
  # widget's `match` field.  When no `match` is set, every record
  # is selected.
  #
  # @param [Object] record compared via its #to_s
  # @return [true, false]
  def select?(record)
    return true unless match

    !(match =~ record.to_s)
  end

  register(:not_regexp)
end
|
168
|
+
|
169
|
+
# A widget which only lets a certain number of records through.
|
170
|
+
#
|
171
|
+
# @example Letting the first 3 records through on the command-line
|
172
|
+
#
|
173
|
+
# $ cat input
|
174
|
+
# 1
|
175
|
+
# 2
|
176
|
+
# 3
|
177
|
+
# 4
|
178
|
+
# $ cat input | wu-local limit --max=3
|
179
|
+
# 1
|
180
|
+
# 2
|
181
|
+
# 3
|
182
|
+
#
|
183
|
+
# @example Letting the first 3 records through in a dataflow
|
184
|
+
#
|
185
|
+
# Wukong.dataflow(:uses_limit) do
|
186
|
+
# ... | limit(max: 3) | ...
|
187
|
+
# end
|
188
|
+
#
|
189
|
+
# @see Filter
|
190
|
+
class Limit < Filter

  # Upper bound on how many records may pass; unlimited by default.
  field :max, Integer, :default => Float::INFINITY

  # How many records have been seen so far.
  attr_accessor :count

  # Starts the record count at zero.
  def setup
    @count = 0
  end

  # Selects a record while fewer than `max` records have been seen.
  # Every call bumps the count, whether or not the record passes.
  #
  # @param [Object] record
  # @return [true, false]
  def select?(record)
    under_limit = count < max
    self.count = count + 1
    under_limit
  end

  register
end
|
215
|
+
|
216
|
+
# A widget which samples a certain fraction of input records.
|
217
|
+
#
|
218
|
+
# @example Sampling records on the command line
|
219
|
+
#
|
220
|
+
# $ cat input
|
221
|
+
# 1
|
222
|
+
# 2
|
223
|
+
# 3
|
224
|
+
# 4
|
225
|
+
# $ cat input | wu-local sample --fraction=0.5
|
226
|
+
# 1
|
227
|
+
# 3
|
228
|
+
#
|
229
|
+
# @example Sampling records in a dataflow
|
230
|
+
#
|
231
|
+
# Wukong.dataflow(:uses_sample) do
|
232
|
+
# ... | sample(fraction: 0.5) | ...
|
233
|
+
# end
|
234
|
+
#
|
235
|
+
# @see Filter
|
236
|
+
# @see Limit
|
237
|
+
class Sample < Filter

  # The fraction of records to let pass, meaningful between 0.0
  # and 1.0.  Defaults to 1.0.
  field :fraction, Float, :default => 1.0

  # Selects a `record` at random: it passes when a fresh draw from
  # `rand` is below `fraction`.
  #
  # @param [Object] record
  # @return [true, false]
  def select?(record)
    draw = rand
    draw < fraction
  end

  register
end
|
253
|
+
|
254
|
+
# A widget useful for creating filters on the fly in a dataflow.
|
255
|
+
#
|
256
|
+
# When writing a filtering processor out as a class, just use the
|
257
|
+
# DSL for creating processors:
|
258
|
+
#
|
259
|
+
# @example Creating a select filter the usual way
|
260
|
+
#
|
261
|
+
# Wukong.processor(:my_filter, Wukong::Processor::Filter) do
|
262
|
+
# def select? record
|
263
|
+
# record.length > 3
|
264
|
+
# end
|
265
|
+
# end
|
266
|
+
#
|
267
|
+
# When in a dataflow, sometimes it's easier to create a processor
|
268
|
+
# like this on the fly.
|
269
|
+
#
|
270
|
+
# @example Creating a select filter on the fly in a dataflow
|
271
|
+
#
|
272
|
+
# Wukong.dataflow(:my_flow) do
|
273
|
+
# ... | select { |record| record.length > 3 } | ...
|
274
|
+
# end
|
275
|
+
#
|
276
|
+
# @see Filter
|
277
|
+
# @see Reject
|
278
|
+
class Select < Filter

  # Selects the given `record` by delegating to `perform_action`,
  # which is populated from the block used to create this filter
  # in the dataflow DSL.
  #
  # @param [Object] record
  # @return [Object] whatever `perform_action` returns; its
  #   truthiness decides whether the record passes
  # @see Processor#perform_action
  def select?(record)
    perform_action(record)
  end

  register
end
|
293
|
+
|
294
|
+
# A widget useful for creating filters on the fly in a dataflow.
|
295
|
+
#
|
296
|
+
# @see Select
|
297
|
+
class Reject < Filter
  # Selects the given `record` only when `perform_action` returns a
  # falsy value for it, i.e. records the action accepts are dropped.
  #
  # @param [Object] record
  # @return [true, false]
  # @see Processor#perform_action
  def select?(record)
    !perform_action(record)
  end

  register
end
|
309
|
+
|
310
|
+
end
|
311
|
+
end
|