wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,60 @@
|
|
1
|
+
# Core extension used when tokenizing FlatPack format strings.
class String
  # Collects the MatchData object for every match of +regex+ found by
  # String#scan, in the order the matches occur.
  #
  # @param regex [Regexp] pattern handed to #scan
  # @return [Array<MatchData>] one entry per match; empty when nothing matches
  def match_all(regex)
    found = []
    # scan sets the last-match data before each block call
    scan(regex) { found << Regexp.last_match }
    found
  end
end
|
6
|
+
|
7
|
+
module Wukong
  module FlatPack

    # Creates a 'simple' token from the supplied string and position.
    #
    # @param str [String] a single token string; it is matched against
    #   Language::NAMED_SIMPLE_TYPE_RE
    # @param position [Integer] value stored as the token's position
    # @return [Object] the token built by Tokens.token_for_indicator, with
    #   position, length (nil when the token string has no length) and
    #   modifier assigned
    def self.simple_token_from_string(str, position)
      pieces = str.match(Language::NAMED_SIMPLE_TYPE_RE)
      # Tokens is resolved lexically (Wukong::FlatPack::Tokens); the previous
      # reference went through an undefined-looking `Flat` namespace.
      token = Tokens.token_for_indicator(pieces[:type])
      token.position = position
      token.length   = pieces[:length] && pieces[:length].to_i
      token.modifier = pieces[:modifier]
      token
    end

    # Creates a fixed point token. Strict input formatting is
    # enforced if the strict param is true.
    #
    # @param str [String] a single token string; it is matched against
    #   Language::NAMED_FIXED_POINT_RE
    # @param position [Integer] value stored as the token's position
    # @param strict [Boolean] stored on the token as its strict flag
    # @return [Tokens::FixedPointToken] token with length set and power set
    #   to an Integer, or nil when the token string carries no power part
    def self.fixed_point_token_from_string(str, position, strict=true)
      pieces = str.match(Language::NAMED_FIXED_POINT_RE)
      token = Tokens::FixedPointToken.new
      token.position = position
      token.strict   = strict
      token.power    = pieces[:power] && pieces[:power].to_i
      token.length   = pieces[:length].to_i
      token
    end

    # Validates the supplied format string and creates a parser from it.
    #
    # @param str [String] format string to validate and tokenize
    # @param delimiter_width [Integer] when non-zero, an IgnoreToken of this
    #   length (position -1) is inserted between consecutive tokens
    # @param strict_fixed_point [Boolean] strict flag passed to every fixed
    #   point token
    # @return [Parser, nil] nil when Language.string_in_lang rejects +str+
    def self.create_parser(str, delimiter_width=0, strict_fixed_point=true)
      return nil unless Language.string_in_lang str

      lang = []
      str.match_all(Language::CAPTURE_TOKEN_RE).each do |match|
        token_str = match[0]
        case token_str
        when Language::TOTAL_SIMPLE_TYPE_RE
          lang << simple_token_from_string(token_str, match.begin(0))
        when Language::TOTAL_FIXED_POINT_RE
          lang << fixed_point_token_from_string(token_str, match.begin(0), strict_fixed_point)
        when Language::TOTAL_DATE_RE
          # TODO: Implement. Date tokens currently add nothing to the parser
          # (a delimiter is still appended below, as before).
        end

        next if delimiter_width == 0

        delimiter = Tokens::IgnoreToken.new
        delimiter.position = -1
        delimiter.length = delimiter_width
        lang << delimiter
      end

      # pop off the delimiter that follows the final token
      lang.pop if delimiter_width != 0
      Parser.new(lang)
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Wukong
  module FlatPack
    # Regex building blocks that define the format-string language: simple
    # tokens, date tokens and fixed point tokens.
    module Language

      # language definition
      SIMPLE_TYPES = %w{i f s b _}
      SIMPLE_TYPE_RE = "[#{SIMPLE_TYPES.join}]"

      MODIFIERS = %w{+ *}
      MODIFIER_RE = "[#{MODIFIERS.join}]"

      # a simple type optionally followed by either a modifier or a length
      SIMPLE_TOKEN_RE = "#{SIMPLE_TYPE_RE}(?:#{MODIFIER_RE}|[0-9]+)?"

      DATE_TYPES = %w{a A b B c d H I j m M p S U w W x X Y Z}
      DATE_TYPES_RE = "[#{DATE_TYPES.join} ]" # the extra space is supposed to be there
      DATE_TOKEN_RE = "%#{DATE_TYPES_RE}*%"

      FIXED_POINT_TYPE = 'D'
      FIXED_POINT_SEP = 'e'
      FIXED_POINT_TOKEN_RE = "#{FIXED_POINT_TYPE}\\d+(?:#{FIXED_POINT_SEP}\\d+)?"

      TOKENS = [SIMPLE_TOKEN_RE, DATE_TOKEN_RE, FIXED_POINT_TOKEN_RE]
      TOKEN_RE = "#{TOKENS.join('|')}"
      CAPTURE_TOKEN_RE = /(#{TOKENS.join('|')})/

      # One or more tokens, each optionally followed by spaces. Anchored with
      # \A and \Z rather than ^ and $: the latter match at every line
      # boundary, so a multi-line string passed validation as soon as any
      # single line was valid. \Z still tolerates one trailing newline.
      LANGUAGE_RE = /\A(?:(#{TOKEN_RE}) *)+\Z/

      # total regexes, i.e. regexes that must match the whole string
      TOTAL_SIMPLE_TYPE_RE = /\A#{SIMPLE_TOKEN_RE}\Z/
      TOTAL_FIXED_POINT_RE = /\A#{FIXED_POINT_TOKEN_RE}\Z/
      TOTAL_DATE_RE = /\A#{DATE_TOKEN_RE}\Z/

      # named regexes used for parsing tokens
      NAMED_SIMPLE_TYPE_RE = /(?<type>#{SIMPLE_TYPE_RE})(?:(?<length>[0-9]+)|(?<modifier>#{MODIFIER_RE}))?/
      NAMED_FIXED_POINT_RE = /#{FIXED_POINT_TYPE}(?<length>\d+)(?:#{FIXED_POINT_SEP}(?<power>\d+))?/
      # The format group repeats, mirroring DATE_TOKEN_RE, so that every date
      # token the language accepts can also be taken apart here.
      NAMED_DATE_RE = /%(?<format>#{DATE_TYPES_RE}*)%/

      # Returns true if the supplied string is in
      # Flat's formatting language, as determined
      # by the LANGUAGE_RE regex.
      #
      # @param str [String, nil] candidate format string
      # @return [Boolean] false for nil or for any string LANGUAGE_RE rejects
      def self.string_in_lang(str)
        !(str =~ LANGUAGE_RE).nil?
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Wukong
  module FlatPack
    # Splits a record into typed fields using an ordered list of tokens.
    # Each token must respond to #re (a regex source fragment) and
    # #translate(str) (conversion of the matched text).
    class Parser
      attr_accessor :re
      attr_accessor :lang

      # @param lang [Array] ordered tokens describing one record
      def initialize(lang)
        @lang = lang
        @re = re_from_language @lang
      end

      # returns true if the supplied string is in the parser's language
      def string_in_lang? str
        !(str =~ @re).nil?
      end

      # Creates a regular expression from the supplied language: every
      # token's fragment becomes one capture group, in order, between
      # ^ and $ anchors.
      #
      # @param lang [Array] tokens responding to #re
      # @return [Regexp]
      def re_from_language lang
        Regexp.new("^#{lang.map { |token| "(#{token.re})" }.join}$")
      end

      # Translates each captured field of +str+ with its token.
      #
      # @param str [String] record to parse
      # @param trim [Boolean] when true, String results are stripped
      # @return [Array, nil] translated fields with every :ignore entry
      #   removed, or nil when +str+ does not match the parser's regex
      def parse(str, trim=false)
        return nil unless string_in_lang? str

        fields = str.match(@re).captures.each_with_index.map do |val, index|
          field = lang[index].translate(val)
          field.strip! if trim && field.is_a?(String)
          field
        end
        fields - [:ignore]
      end

      # Appends the TSV form of every line of +in_filename+ to +out_filename+.
      # Both files are opened in block form so they are closed (and the
      # output flushed) even when a line fails to convert.
      #
      # @param in_filename [String] path of the file to read
      # @param out_filename [String] path of the file to append to
      # @param trim [Boolean] passed through to #line_to_tsv
      # @return [nil]
      # @raise [ArgumentError] when a line does not match the parser's format
      def file_to_tsv(in_filename, out_filename, trim=true)
        File.open(in_filename, 'r') do |infile|
          File.open(out_filename, 'a') do |outfile|
            infile.each_line do |line|
              outfile.write(line_to_tsv(line, trim))
            end
          end
        end
        nil
      end

      # Converts one record into a tab-separated, newline-terminated line.
      #
      # @param line [String] record to parse
      # @param trim [Boolean] passed through to #parse
      # @return [String]
      # @raise [ArgumentError] when +line+ does not match the parser's format
      #   (previously this surfaced as a NoMethodError on nil)
      def line_to_tsv(line, trim=true)
        fields = parse(line, trim)
        if fields.nil?
          raise ArgumentError, "line does not match the parser's format: #{line.inspect}"
        end
        fields.join("\t") + "\n"
      end
    end
  end
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
module Wukong
|
2
|
+
module FlatPack
|
3
|
+
module Tokens
|
4
|
+
TOKEN_CLASSES = {}
|
5
|
+
|
6
|
+
def self.token_for_indicator(indicator)
|
7
|
+
return TOKEN_CLASSES[indicator].new
|
8
|
+
end
|
9
|
+
|
10
|
+
class Token
|
11
|
+
attr_accessor :position
|
12
|
+
attr_accessor :length
|
13
|
+
attr_accessor :indicator
|
14
|
+
|
15
|
+
def self.indicator= indicator
|
16
|
+
TOKEN_CLASSES[indicator] = self
|
17
|
+
@indicator = indicator
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class FixedPointToken < Token
|
22
|
+
attr_accessor :power
|
23
|
+
attr_accessor :strict
|
24
|
+
|
25
|
+
self.indicator = 'D'
|
26
|
+
|
27
|
+
#TODO: Allow negative powers
|
28
|
+
def re
|
29
|
+
strict ? "(?:(?:\\+|-)\\d{#{@length-1}}|\\d{#{@length}})" : ".{#{@length}}"
|
30
|
+
end
|
31
|
+
|
32
|
+
def translate str
|
33
|
+
return nil if str.strip == ""
|
34
|
+
base = str.to_f
|
35
|
+
return base / (10**@power)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
class BasicToken < Token
|
40
|
+
attr_accessor :modifier
|
41
|
+
|
42
|
+
def re token= '.'
|
43
|
+
if not @length.nil?
|
44
|
+
return "#{token}{#{@length}}"
|
45
|
+
elsif not @modifier.nil?
|
46
|
+
return "#{token}#{@modifier}"
|
47
|
+
else
|
48
|
+
return token
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
class IntToken < BasicToken
|
55
|
+
self.indicator = 'i'
|
56
|
+
RE = '(?:\+|-)?\\d'
|
57
|
+
|
58
|
+
def re
|
59
|
+
if not @length.nil?
|
60
|
+
return "(?:(?:\\+|-)\\d{#{@length-1}}|\\d{#{@length}})"
|
61
|
+
elsif not @modifier.nil?
|
62
|
+
return "#{RE}#{@modifier}"
|
63
|
+
else
|
64
|
+
return RE
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def translate str
|
69
|
+
return Integer(str)
|
70
|
+
rescue ArgumentError => err
|
71
|
+
return str.to_i
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
class StringToken < BasicToken
|
76
|
+
self.indicator = 's'
|
77
|
+
|
78
|
+
def translate str
|
79
|
+
return str
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
class FloatToken < BasicToken
|
84
|
+
self.indicator = 'f'
|
85
|
+
#TODO: Implement floats
|
86
|
+
|
87
|
+
def get_re
|
88
|
+
#TODO: Implement
|
89
|
+
end
|
90
|
+
|
91
|
+
def translate
|
92
|
+
#TODO: Implement
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
# Token for single-character boolean fields (indicator 'b').
class BoolToken < BasicToken
  self.indicator = 'b'
  TRUE_TOKENS = ['t','y','1']
  FALSE_TOKENS = ['f','n','0']

  #TODO: Add back multi-char options and think through allowing padding
  #TODO: Allow users to override true and false

  # Builds the regex source for this token.
  #
  # @return [String] a non-capturing alternation listing the true tokens,
  #   their upper-case forms, the false tokens and their upper-case forms,
  #   in that order
  def re
    alternatives = [TRUE_TOKENS, FALSE_TOKENS].flat_map do |tokens|
      tokens + tokens.map(&:upcase)
    end
    "(?:#{alternatives.join('|')})"
  end

  # Converts the matched field text into a boolean.
  #
  # @param str [String] matched field text (compared case-insensitively)
  # @return [true, false, nil] nil when the text is neither a true nor a
  #   false token
  def translate(str)
    key = str.downcase
    return true if TRUE_TOKENS.include?(key)
    return false if FALSE_TOKENS.include?(key)

    nil
  end
end
|
119
|
+
|
120
|
+
# Token for fields that should be skipped (indicator '_').
class IgnoreToken < BasicToken
  self.indicator = '_'

  # ignore symbols are removed from the final output
  #
  # @param str [String] matched field text (unused)
  # @return [Symbol] always :ignore
  def translate(str)
    :ignore
  end
end
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
data/lib/wukong/processor.rb
CHANGED
@@ -1,142 +1,88 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
Settings.define :monitor_interval, :default => 50_000, :type => Integer
|
1
|
+
require 'log4r'
|
4
2
|
|
5
3
|
module Wukong
|
6
|
-
class
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
4
|
+
# Stage builder for processors.
class ProcessorBuilder < Hanuman::StageBuilder
  # Picks the namespace for a processor.
  #
  # @param args [Array] only the first element is inspected
  # @return [Class] the first argument when it is a Class, otherwise
  #   Wukong::Processor
  def namespace(*args)
    candidate = args.first
    return candidate if candidate.is_a?(Class)

    Wukong::Processor
  end
end
|
11
9
|
|
12
|
-
|
13
|
-
|
10
|
+
# The Processor is the basic unit of computation in Wukong. A
|
11
|
+
# processor can be thought of as an arbitrary function that takes
|
12
|
+
# certain inputs and produces certain (or no) outputs.
|
13
|
+
#
|
14
|
+
# A Processor can be written and tested purely in Ruby and on your
|
15
|
+
# local machine. You can glue processors together
|
16
|
+
class Processor < Hanuman::Stage
|
17
|
+
|
18
|
+
field :action, Whatever
|
19
|
+
field :log, Whatever, :default => -> { log = Log4r::Logger.new(self.class.to_s) ; log.outputters = Log4r::StdoutOutputter.new('stdout', formatter: Log4r::PatternFormatter.new(pattern: "%d [%l] %c: %m")) ; log }
|
20
|
+
field :notifier, Vayacondios::NotifierFactory, :default => Vayacondios.default_notifier
|
14
21
|
|
15
|
-
|
16
|
-
|
22
|
+
def self.describe desc
|
23
|
+
@description = desc
|
17
24
|
end
|
18
25
|
|
19
|
-
|
20
|
-
|
21
|
-
self.count += 1
|
22
|
-
if (count % Settings.monitor_interval.to_i == 0)
|
23
|
-
log.info "emit\t%-23s\t%-47s\t%s" % [self.class, self.inspect, record.inspect]
|
24
|
-
end
|
25
|
-
output.process(record)
|
26
|
-
rescue Wukong::ProcessorError
|
27
|
-
raise
|
28
|
-
rescue StandardError => err
|
29
|
-
next_block = output.name rescue "(bad stage)"
|
30
|
-
log.warn "#{self}: error emitting #{next_block}: #{err.message}"
|
31
|
-
raise Wukong::ProcessorError, err.message, err.backtrace
|
26
|
+
def self.description
|
27
|
+
@description
|
32
28
|
end
|
33
29
|
|
34
|
-
def
|
35
|
-
BadRecord.make(*args)
|
30
|
+
def self.consumes label
|
36
31
|
end
|
37
32
|
|
38
|
-
def self.
|
39
|
-
register_action(name, &block)
|
33
|
+
def self.produces label
|
40
34
|
end
|
35
|
+
|
36
|
+
# This is a placeholder method intended to be overridden
|
37
|
+
def perform_action(*args) ; end
|
41
38
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
self.log = Log
|
46
|
-
|
47
|
-
config :error_handler, Vayacondios::NotifierFactory, :default => ->{ Vayacondios::NotifierFactory.receive(type: 'log', log: self.log) }
|
48
|
-
|
49
|
-
def bad_record(record, options = {})
|
50
|
-
error_handler.notify(record, options.merge(level: 'error'))
|
39
|
+
# The action attribute is turned into the perform action method:
# the given callable becomes this instance's own perform_action.
#
# @param action [Proc] callable used as the body of perform_action
def receive_action(action)
  define_singleton_method(:perform_action, &action)
end
|
52
|
-
end
|
53
43
|
|
54
|
-
|
55
|
-
#
|
56
|
-
def
|
57
|
-
|
44
|
+
# Builds this processor's notifier and stores it in @notifier.
#
# Valid notifier types are currently :http or :log
# This processor's log is passed to vayacondios
#
# @param type [Hash, Object] either a Hash of notifier options or a bare
#   notifier type. For a Hash, :type defaults to 'log' and :log defaults to
#   this processor's log; keys supplied by the caller take precedence.
def receive_notifier(type)
  # Both branches start from the same defaults so the processor's log is
  # handed over in the Hash form as well, as the comment above promises.
  defaults = { type: 'log', log: log }
  options  = type.is_a?(Hash) ? defaults.merge(type) : defaults.merge(type: type)
  @notifier = Vayacondios::NotifierFactory.receive(options)
end
|
59
|
-
register_processor
|
60
|
-
end
|
61
|
-
|
62
|
-
class Null < Processor
|
63
|
-
self.register_processor
|
64
53
|
|
65
|
-
#
|
66
|
-
def
|
67
|
-
|
54
|
+
# Send information to Vayacondios; data goes in, the right thing happens
|
55
|
+
def notify(topic, cargo)
|
56
|
+
notifier.notify(topic, cargo)
|
68
57
|
end
|
69
|
-
end
|
70
58
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
# contents. If you'll always emit exactly one record out per record in,
|
75
|
-
# you may prefer Wukong::Widget::Map.
|
76
|
-
#
|
77
|
-
# @example regenerate a wordbag with counts matching the original
|
78
|
-
# foreach{|rec| rec.count.times{ emit(rec.word) } }
|
79
|
-
#
|
80
|
-
# @see Project
|
81
|
-
# @see Map
|
82
|
-
class Foreach < Processor
|
83
|
-
self.register_processor
|
84
|
-
|
85
|
-
# @param [Proc] proc used for body of process method
|
86
|
-
# @yield ... or supply it as a &block arg.
|
87
|
-
def initialize(prc=nil, &block)
|
88
|
-
prc ||= block or raise "Please supply a proc or a block to #{self.class}.new"
|
89
|
-
define_singleton_method(:process, prc)
|
90
|
-
end
|
91
|
-
|
92
|
-
def self.make(workflow, *args, &block)
|
93
|
-
obj = new(*args, &block)
|
94
|
-
workflow.add_stage obj
|
95
|
-
obj
|
59
|
+
# This method is called after the processor class has been instantiated
|
60
|
+
# but before any records are given to it to process
|
61
|
+
def setup
|
96
62
|
end
|
97
|
-
end
|
98
|
-
|
99
|
-
#
|
100
|
-
# Evaluates the block and emits the result if non-nil
|
101
|
-
#
|
102
|
-
# @example turn a record into a tuple
|
103
|
-
# map{|rec| rec.attributes.values }
|
104
|
-
#
|
105
|
-
# @example pass along first matching term, drop on the floor otherwise
|
106
|
-
# map{|str| str[/\b(love|hate|happy|sad)\b/] }
|
107
|
-
#
|
108
|
-
class Map < Processor
|
109
|
-
self.register_processor
|
110
|
-
attr_reader :blk
|
111
63
|
|
112
|
-
#
|
113
|
-
#
|
114
|
-
def
|
115
|
-
|
64
|
+
# This method is called once per record
|
65
|
+
# Override this in your subclass
|
66
|
+
def process(record, &emit)
|
67
|
+
yield record
|
116
68
|
end
|
117
69
|
|
118
|
-
|
119
|
-
|
120
|
-
|
70
|
+
# This method is called to signal the last record has been
|
71
|
+
# received but that further processing may still be done, events
|
72
|
+
# still be yielded, &c.
|
73
|
+
#
|
74
|
+
# This can be used within an aggregating processor (like a reducer
|
75
|
+
# in a map/reduce job) to start processing the final aggregate of
|
76
|
+
# records since the "last record" has already been received.
|
77
|
+
def finalize
|
121
78
|
end
|
122
79
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
80
|
+
# This method is called after all records have been passed. It
|
81
|
+
# signals that processing should stop.
|
82
|
+
|
83
|
+
# This method is called after all records have been processed
|
84
|
+
def stop
|
127
85
|
end
|
128
|
-
end
|
129
86
|
|
130
|
-
#
|
131
|
-
# Flatten emits each item in an enumerable as its own record
|
132
|
-
#
|
133
|
-
# @example turn a document into all its words
|
134
|
-
# input > map{|line| line.split(/\W+/) } > flatten > output
|
135
|
-
class Flatten < Processor
|
136
|
-
self.register_processor
|
137
|
-
|
138
|
-
def process(iter)
|
139
|
-
iter.each{|*args| emit(*args) }
|
140
|
-
end
|
141
87
|
end
|
142
88
|
end
|