wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,61 @@
|
|
1
|
+
require_relative("accumulator")
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
class Processor
|
5
|
+
|
6
|
+
# A processor which counts the total number of its input records.
|
7
|
+
#
|
8
|
+
# On its own, this widget is really just a poor man's `wc -l`.
|
9
|
+
# It's really intended to serve as a superclass for more complex
|
10
|
+
# accumulators.
|
11
|
+
#
|
12
|
+
# @example Count the total number of input records on the command-line.
|
13
|
+
#
|
14
|
+
# $ wc -l input
|
15
|
+
# 283 input
|
16
|
+
# $ cat input | wu-local count
|
17
|
+
# 283
|
18
|
+
class Count < Accumulator
|
19
|
+
|
20
|
+
# The total size of the input records.
|
21
|
+
attr_accessor :size
|
22
|
+
|
23
|
+
# Initializes the count to 0.
|
24
|
+
def setup
|
25
|
+
super()
|
26
|
+
@size = 0
|
27
|
+
end
|
28
|
+
|
29
|
+
# Accumulate a `record` by incrementing the total size.
|
30
|
+
#
|
31
|
+
# @param [Object] record
|
32
|
+
def accumulate record
|
33
|
+
self.size += 1
|
34
|
+
end
|
35
|
+
|
36
|
+
# Keeps all records in the same group so that one count is
|
37
|
+
# emitted at the end.
|
38
|
+
#
|
39
|
+
# Overriding this method and returning different keys for
|
40
|
+
# different records is the beginning of constructing a "group
|
41
|
+
# by" type widget.
|
42
|
+
#
|
43
|
+
# @param [Object] record
|
44
|
+
# @return [:__first_group__]
|
45
|
+
# @see Group
|
46
|
+
def get_key record
|
47
|
+
:__first_group__
|
48
|
+
end
|
49
|
+
|
50
|
+
# Yields the total size.
|
51
|
+
#
|
52
|
+
# @yield [size]
|
53
|
+
# @yieldparam [Integer] size
|
54
|
+
def finalize
|
55
|
+
yield self.size
|
56
|
+
end
|
57
|
+
|
58
|
+
register
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require_relative("../utils")
|
2
|
+
require_relative("count")
|
3
|
+
|
4
|
+
module Wukong
|
5
|
+
class Processor
|
6
|
+
|
7
|
+
# Groups sorted input records and emits each group with a count.
|
8
|
+
#
|
9
|
+
# Allows you to use several ways of extracting the key that
|
10
|
+
# defines the group.
|
11
|
+
#
|
12
|
+
# **Note:** The input records must be previously sorted by the
|
13
|
+
# same key used for grouping in order to ensure that groups are
|
14
|
+
# not split up.
|
15
|
+
#
|
16
|
+
# @example Group simple string values on the command-line.
|
17
|
+
#
|
18
|
+
# $ cat input
|
19
|
+
# apple
|
20
|
+
# cat
|
21
|
+
# banana
|
22
|
+
# apple
|
23
|
+
# ...
|
24
|
+
# $ cat input | wu-local sort | wu-local group
|
25
|
+
# apple 4
|
26
|
+
# banana 2
|
27
|
+
# cat 5
|
28
|
+
# ...
|
29
|
+
#
|
30
|
+
# @example Group using a nested key within a JSON string on the command-line
|
31
|
+
#
|
32
|
+
# $ cat input
|
33
|
+
# {"id": 1, "word": "apple" }
|
34
|
+
# {"id": 2, "word": "cat" }
|
35
|
+
# {"id": 3, "word": "banana"}
|
36
|
+
# ...
|
37
|
+
# $ cat input | wu-local sort --on=word | wu-local group --by=word
|
38
|
+
# apple 4
|
39
|
+
# banana 2
|
40
|
+
# cat 5
|
41
|
+
# ...
|
42
|
+
#
|
43
|
+
# A group fits nicely at the end of a dataflow. Since it requires
|
44
|
+
# a sort, it is blocking.
|
45
|
+
#
|
46
|
+
# @example Using a group at the end of a dataflow
|
47
|
+
#
|
48
|
+
# Wukong.dataflow(:makes_groups) do
|
49
|
+
# ... | sort(on: 'field') | group(by: 'field')
|
50
|
+
# end
|
51
|
+
#
|
52
|
+
# @see Sort
|
53
|
+
class Group < Count
|
54
|
+
|
55
|
+
include DynamicGet
|
56
|
+
field :by, Whatever
|
57
|
+
|
58
|
+
# Get the key which defines the group for this `record`.
|
59
|
+
#
|
60
|
+
# @param [Object] record
|
61
|
+
# @return [Object]
|
62
|
+
def get_key(record)
|
63
|
+
get(self.by, record)
|
64
|
+
end
|
65
|
+
|
66
|
+
# Reset the size counter for a new group.
|
67
|
+
#
|
68
|
+
# @param [Object] record
|
69
|
+
def start record
|
70
|
+
self.size = 0
|
71
|
+
end
|
72
|
+
|
73
|
+
# Yields the current group along with its size
|
74
|
+
#
|
75
|
+
# @yield [key, size]
|
76
|
+
# @yieldparam [Object] key the key defining the group
|
77
|
+
# @yieldparam [Integer] size the size of the group
|
78
|
+
def finalize
|
79
|
+
yield [key, size].map(&:to_s).join("\t")
|
80
|
+
end
|
81
|
+
|
82
|
+
register
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require_relative("group")
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
class Processor
|
5
|
+
|
6
|
+
# Concatenates the elements of a group, yielding the group key,
|
7
|
+
# the count, and its members.
|
8
|
+
#
|
9
|
+
# @example Concatenating elements of a group on the command-line.
|
10
|
+
#
|
11
|
+
# $ cat input
|
12
|
+
# {"id": 1, "parent_id": 4}
|
13
|
+
# {"id": 2, "parent_id": 3}
|
14
|
+
# {"id": 3, "parent_id": 3}
|
15
|
+
# ...
|
16
|
+
# $ cat input | wu-local group_concat --by=parent_id
|
17
|
+
# 4 1 {"id": 1, "parent_id": 4}
|
18
|
+
# 3 2 {"id": 2, "parent_id": 3} {"id": 3, "parent_id": 3}
|
19
|
+
# ...
|
20
|
+
#
|
21
|
+
# GroupConcat takes all the same options as Group.
|
22
|
+
#
|
23
|
+
# @see Group
|
24
|
+
class GroupConcat < Group
|
25
|
+
|
26
|
+
# The members of the current group.
|
27
|
+
attr_accessor :members
|
28
|
+
|
29
|
+
# Initializes the empty members array.
|
30
|
+
def setup
|
31
|
+
super()
|
32
|
+
@members = []
|
33
|
+
end
|
34
|
+
|
35
|
+
# Resets the members array at the start of a new group.
|
36
|
+
#
|
37
|
+
# @param [Object] record
|
38
|
+
def start record
|
39
|
+
super(record)
|
40
|
+
self.members = []
|
41
|
+
end
|
42
|
+
|
43
|
+
# Accumulate each record, adding it to the current members.
|
44
|
+
#
|
45
|
+
# @param [Object] record
|
46
|
+
def accumulate record
|
47
|
+
super(record)
|
48
|
+
self.members << record
|
49
|
+
end
|
50
|
+
|
51
|
+
# Yields the group, including its key, its size, and each
|
52
|
+
# member.
|
53
|
+
#
|
54
|
+
# @yield [key, size, *members]
|
55
|
+
# @yieldparam [Object] key the key defining the group
|
56
|
+
# @yieldparam [Integer] size the number of members in the group
|
57
|
+
# @yieldparam [Array<Object>] members the members of the group
|
58
|
+
def finalize
|
59
|
+
group = [key, size]
|
60
|
+
group.concat(members)
|
61
|
+
yield group.map(&:to_s).join("\t")
|
62
|
+
end
|
63
|
+
|
64
|
+
register
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
|
70
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require_relative("group")
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
class Processor
|
5
|
+
class Moments < Group
|
6
|
+
|
7
|
+
field :group_by, Whatever
|
8
|
+
|
9
|
+
attr_accessor :measurements
|
10
|
+
|
11
|
+
field :of, Array, :default => []
|
12
|
+
field :std_dev, :boolean, :default => true
|
13
|
+
|
14
|
+
def get_key record
|
15
|
+
super(record) unless (self.group_by || self.by)
|
16
|
+
get(self.group_by || self.by, record)
|
17
|
+
end
|
18
|
+
|
19
|
+
def receive_of o
|
20
|
+
@of = case o
|
21
|
+
when String then o.split(',')
|
22
|
+
when Array then o
|
23
|
+
else []
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def start record
|
28
|
+
super(record)
|
29
|
+
@measurements = {}.tap do |m|
|
30
|
+
self.of.each do |property|
|
31
|
+
m[property] = []
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def accumulate record
|
37
|
+
super(record)
|
38
|
+
self.of.each do |property|
|
39
|
+
if raw = get(property, record)
|
40
|
+
self.measurements[property] << (raw.to_f rescue next)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def results
|
46
|
+
{}.tap do |r|
|
47
|
+
measurements.each_pair do |property, values|
|
48
|
+
r[property] = {}
|
49
|
+
next if values.empty?
|
50
|
+
count = values.size.to_f
|
51
|
+
r[property][:count] = count.to_i
|
52
|
+
|
53
|
+
mean = values.inject(0.0) { |sum, value| sum += value } / count
|
54
|
+
r[property][:mean] = mean
|
55
|
+
if std_dev
|
56
|
+
variance = values.inject(0.0) { |sum, value| diff = (value - mean) ; sum += diff * diff } / count
|
57
|
+
std = Math.sqrt(variance)
|
58
|
+
r[property][:std_dev] = std
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
def finalize
|
65
|
+
yield({:group => key, :count => size}.merge(:results => results))
|
66
|
+
end
|
67
|
+
|
68
|
+
register
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
@@ -0,0 +1,130 @@
|
|
1
|
+
require_relative("accumulator")
|
2
|
+
require_relative("../utils")
|
3
|
+
|
4
|
+
module Wukong
|
5
|
+
class Processor
|
6
|
+
|
7
|
+
# Sorts input records.
|
8
|
+
#
|
9
|
+
# For many use cases you're better off using native tools like
|
10
|
+
# `/bin/sort` because they are faster and already do what you
|
11
|
+
# need.
|
12
|
+
#
|
13
|
+
# @example When /bin/sort is more than enough on the command-line
|
14
|
+
#
|
15
|
+
# $ cat input
|
16
|
+
# 1 apple
|
17
|
+
# 2 banana
|
18
|
+
# 3 cat
|
19
|
+
# 4 banana
|
20
|
+
# ...
|
21
|
+
# $ cat input | sort -k2
|
22
|
+
# 1 apple
|
23
|
+
# 2 banana
|
24
|
+
# 4 banana
|
25
|
+
# 3 cat
|
26
|
+
# ...
|
27
|
+
#
|
28
|
+
# Other times, you need something that can introspect more on its
|
29
|
+
# input:
|
30
|
+
#
|
31
|
+
# @example When you may prefer the sort widget on the command-line
|
32
|
+
#
|
33
|
+
# $ cat input
|
34
|
+
# {"id": 1, "word": "apple" }
|
35
|
+
# {"id": 2, "word": "cat" }
|
36
|
+
# {"id": 3, "word": "banana"}
|
37
|
+
# ...
|
38
|
+
# $ cat input | wu-local sort --on word
|
39
|
+
# {"id": 1, "word": "apple" }
|
40
|
+
# {"id": 3, "word": "banana"}
|
41
|
+
# {"id": 2, "word": "cat" }
|
42
|
+
# ...
|
43
|
+
#
|
44
|
+
# The sort widget is useful for modeling Hadoop jobs, but don't
|
45
|
+
# forget that [Hadoop does its own
|
46
|
+
# sorting](http://hadoop.apache.org/docs/r0.20.2/mapred_tutorial.html#Sort),
|
47
|
+
# so the sort widget doesn't belong in your map/reduce jobs.
|
48
|
+
#
|
49
|
+
# @example The wrong way to model a Hadoop map/reduce job
|
50
|
+
#
|
51
|
+
# Wukong.dataflow(:my_incorrect_job_dataflow) do
|
52
|
+
# parse | extract(part: 'country') | sort | group
|
53
|
+
# end
|
54
|
+
#
|
55
|
+
# @example The right way to model a Hadoop map/reduce job
|
56
|
+
#
|
57
|
+
# Wukong.dataflow(:mapper) do
|
58
|
+
# parse | extract(part: 'country')
|
59
|
+
# end
|
60
|
+
#
|
61
|
+
# Wukong.dataflow(:reducer) do
|
62
|
+
# group
|
63
|
+
# end
|
64
|
+
class Sort < Accumulator

  include DynamicGet
  field :on,      Whatever
  field :reverse, :boolean, :default => false
  field :numeric, :boolean, :default => false

  # Initializes the array that will hold every record until they
  # can be sorted.
  def setup
    super()
    @records = []
  end

  # Keeps all the records in a single group so they can be sorted
  # together.
  #
  # @param [Object] record (ignored)
  # @return [:__first_group__]
  def get_key(record)
    :__first_group__
  end

  # Stores the `record` for later sorting.
  #
  # @param [Object] record
  def accumulate(record)
    @records << record
  end

  # Sorts all the stored records and yields them one at a time, in
  # descending order when `reverse` is set.
  #
  # @yield [record] each record in correct sort order
  # @yieldparam [Object] record
  def finalize
    sorted = @records.sort { |x, y| compare(x, y) }
    sorted.reverse! if reverse
    sorted.each { |record| yield record }
  end

  # Extracts the sortable part of the input `record`.
  #
  # @param [Object] record
  # @return [Object] the part of the record to sort on
  def sortable(record)
    get(self.on, record)
  end

  # Compares records `x` and `y` using their sortable parts.
  #
  # A record without a sortable part orders before one that has it;
  # two records that both lack it compare as equal (previously this
  # case returned -1 in both directions, which is not a consistent
  # ordering).  Uses numeric comparison (`to_f`) when `numeric` is
  # set.
  #
  # @param [Object] x
  # @param [Object] y
  # @return [1,0,-1] depends on which of x or y is considered greater
  def compare(x, y)
    a = sortable(x)
    b = sortable(y)
    return 0  unless a || b
    return -1 unless a
    return 1  unless b
    if numeric
      a = a.to_f
      b = b.to_f
    end
    a <=> b
  end

  register
end
|
129
|
+
end
|
130
|
+
end
|
@@ -0,0 +1,287 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# An empty parent class for all Serializers to subclass.
|
5
|
+
class Serializer < Processor; end
|
7
|
+
|
8
|
+
# A widget for serializing inputs to JSON.
|
9
|
+
#
|
10
|
+
# @example Serializing to JSON at the end of a data flow
|
11
|
+
#
|
12
|
+
# Wukong.dataflow(:emits_json) do
|
13
|
+
# ... | to_json
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# @see FromJson
|
17
|
+
class ToJson < Serializer
  # Dumps `record` with MultiJson and yields the resulting string.
  #
  # If dumping raises, the record is dropped silently: nothing is
  # yielded and nil is returned.  Errors raised by the consumer of
  # the yielded value are not rescued.
  #
  # @param [Object] record
  # @yield [json] the serialized json output
  # @yieldparam [String] json
  def process(record)
    json = begin
      ::MultiJson.dump(record)
    rescue StandardError
      # FIXME -- should we log here or what?
      return
    end
    yield json
  end
  register
end
|
34
|
+
|
35
|
+
# A widget for deserializing inputs from JSON.
|
36
|
+
#
|
37
|
+
# @example Deserializing from JSON at the beginning of a data flow
|
38
|
+
#
|
39
|
+
# Wukong.dataflow(:consumes_json) do
|
40
|
+
# from_json | ...
|
41
|
+
# end
|
42
|
+
#
|
43
|
+
# @see ToJson
|
44
|
+
class FromJson < Serializer
  # Parses `json` with MultiJson and yields the resulting object.
  #
  # If parsing raises, the input is dropped silently: nothing is
  # yielded and nil is returned.  Errors raised by the consumer of
  # the yielded value are not rescued.
  #
  # @param [String] json
  # @yield [obj] the deserialized object
  # @yieldparam [Object] obj
  def process(json)
    obj = begin
      ::MultiJson.load(json)
    rescue StandardError
      # FIXME -- should we log here or what?
      return
    end
    yield obj
  end
  register
end
|
61
|
+
|
62
|
+
# A widget for serializing inputs to TSV.
|
63
|
+
#
|
64
|
+
# @example Serializing to TSV at the end of a data flow
|
65
|
+
#
|
66
|
+
# Wukong.dataflow(:emits_tsv) do
|
67
|
+
# ... | to_tsv
|
68
|
+
# end
|
69
|
+
#
|
70
|
+
# @see FromTsv
|
71
|
+
class ToTsv < Serializer
  # Joins the string form of every element of `record` with tabs
  # and yields the resulting line.
  #
  # If building the line raises (for instance because `record`
  # does not respond to `map`), nothing is yielded and nil is
  # returned.  Tabs inside the elements are not escaped.
  #
  # @param [Object] record
  # @yield [tsv] the serialized TSV output
  # @yieldparam [String] tsv
  def process(record)
    tsv = begin
      record.map(&:to_s).join("\t")
    rescue StandardError
      # FIXME -- should we log here or what?
      return
    end
    yield tsv
  end
  register
end
|
88
|
+
|
89
|
+
# A widget for deserializing inputs from TSV.
|
90
|
+
#
|
91
|
+
# @example Deserializing from TSV at the beginning of a data flow
|
92
|
+
#
|
93
|
+
# Wukong.dataflow(:consumes_tsv) do
|
94
|
+
# from_tsv | ...
|
95
|
+
# end
|
96
|
+
#
|
97
|
+
# @see ToTsv
|
98
|
+
class FromTsv < Serializer
  # Splits `tsv` on tab characters and yields the array of fields.
  #
  # Trailing empty fields are preserved (`"a\t\t"` yields
  # `["a", "", ""]`) so that every row of a file keeps the same
  # number of columns; a plain `split` would silently drop them.
  #
  # If splitting raises (for instance because `tsv` is not a
  # String), nothing is yielded and nil is returned.
  #
  # @param [String] tsv
  # @yield [obj] the deserialized object
  # @yieldparam [Array<String>] obj
  def process(tsv)
    record = begin
      # A negative limit keeps trailing empty fields.
      tsv.split(/\t/, -1)
    rescue StandardError
      # FIXME -- should we log here or what?
      return
    end
    yield record
  end
  register
end
|
115
|
+
|
116
|
+
# A widget for serializing inputs to CSV.
|
117
|
+
#
|
118
|
+
# @example Serializing to CSV at the end of a data flow
|
119
|
+
#
|
120
|
+
# Wukong.dataflow(:emits_csv) do
|
121
|
+
# ... | to_csv
|
122
|
+
# end
|
123
|
+
#
|
124
|
+
# @see FromCsv
|
125
|
+
class ToCsv < Serializer
  # Serializes `record` as one line of CSV and yields it.
  #
  # Each element is converted with `to_s`.  Elements containing a
  # comma, a double quote or a line break are wrapped in double
  # quotes with embedded quotes doubled (RFC 4180); previously they
  # were joined verbatim, which produced rows with the wrong number
  # of columns.  All other elements are emitted unchanged.
  #
  # If building the line raises (for instance because `record`
  # does not respond to `map`), nothing is yielded and nil is
  # returned.
  #
  # @param [Object] record
  # @yield [csv] the serialized CSV output
  # @yieldparam [String] csv
  def process(record)
    csv = begin
      record.map { |field| csv_field(field) }.join(",")
    rescue StandardError
      # FIXME -- should we log here or what?
      return
    end
    yield csv
  end

  # Renders a single value as a CSV field, quoting only when needed.
  #
  # @param [Object] field
  # @return [String]
  def csv_field(field)
    text = field.to_s
    return text unless text =~ /[",\r\n]/
    '"' + text.gsub('"', '""') + '"'
  end
  private :csv_field

  register
end
|
142
|
+
|
143
|
+
# A widget for deserializing inputs from CSV.
|
144
|
+
#
|
145
|
+
# @example Deserializing from CSV at the beginning of a data flow
|
146
|
+
#
|
147
|
+
# Wukong.dataflow(:consumes_csv) do
|
148
|
+
# from_csv | ...
|
149
|
+
# end
|
150
|
+
#
|
151
|
+
# @see ToCsv
|
152
|
+
class FromCsv < Serializer
  # Splits `csv` on commas and yields the array of fields.
  #
  # Trailing empty fields are preserved (`"a,,"` yields
  # `["a", "", ""]`) so that every row keeps the same number of
  # columns; a plain `split` would silently drop them.
  #
  # This is a plain comma split: double-quoted fields are NOT
  # interpreted, so a quoted field containing a comma is still
  # broken into several fields.
  #
  # If splitting raises (for instance because `csv` is not a
  # String), nothing is yielded and nil is returned.
  #
  # @param [String] csv
  # @yield [obj] the deserialized object
  # @yieldparam [Array<String>] obj
  def process(csv)
    record = begin
      # A negative limit keeps trailing empty fields.
      csv.split(/,/, -1)
    rescue StandardError
      # FIXME -- should we log here or what?
      return
    end
    yield record
  end
  register
end
|
169
|
+
|
170
|
+
# A widget for serializing inputs to a delimited format.
|
171
|
+
#
|
172
|
+
# @example Serializing to a delimited format at the end of a data flow
|
173
|
+
#
|
174
|
+
# Wukong.dataflow(:emits_delimited) do
|
175
|
+
# ... | to_delimited(delimiter: "--")
|
176
|
+
# end
|
177
|
+
#
|
178
|
+
# @see FromDelimited
|
179
|
+
class ToDelimited < Serializer
  # String placed between fields; defaults to a tab.
  field :delimiter, String, :default => "\t"

  # Joins the string form of every element of `record` with
  # `delimiter` and yields the resulting line.
  #
  # If building the line raises (for instance because `record`
  # does not respond to `map`), nothing is yielded and nil is
  # returned.
  #
  # @param [Object] record
  # @yield [delimited] the serialized delimited output
  # @yieldparam [String] delimited
  def process(record)
    delimited = begin
      record.map(&:to_s).join(delimiter)
    rescue StandardError
      # FIXME -- should we log here or what?
      return
    end
    yield delimited
  end
  register
end
|
197
|
+
|
198
|
+
# A widget for deserializing inputs from a delimited format.
|
199
|
+
#
|
200
|
+
# @example Deserializing from a delimited format at the beginning of a data flow
|
201
|
+
#
|
202
|
+
# Wukong.dataflow(:consumes_delimited) do
|
203
|
+
# from_delimited(delimiter: "--") | ...
|
204
|
+
# end
|
205
|
+
#
|
206
|
+
# @see ToDelimited
|
207
|
+
class FromDelimited < Serializer
  # String that separates fields; defaults to a tab.
  field :delimiter, String, :default => "\t"

  # Splits `delimited` on `delimiter` and yields the array of fields.
  #
  # Trailing empty fields are preserved so that every row keeps
  # the same number of columns; a plain `split` would silently
  # drop them.
  #
  # If splitting raises (for instance because `delimited` is not
  # a String), nothing is yielded and nil is returned.
  #
  # @param [String] delimited
  # @yield [obj] the deserialized object
  # @yieldparam [Array<String>] obj
  def process(delimited)
    record = begin
      # A negative limit keeps trailing empty fields.
      delimited.split(delimiter, -1)
    rescue StandardError
      # FIXME -- should we log here or what?
      return
    end
    yield record
  end
  register
end
|
225
|
+
|
226
|
+
# A widget for serializing inputs to Ruby's `inspect` format.
|
227
|
+
#
|
228
|
+
# @example Serializing to Ruby's inspect format at the end of a data flow
|
229
|
+
#
|
230
|
+
# Wukong.dataflow(:emits_inspected) do
|
231
|
+
# ... | to_inspect
|
232
|
+
# end
|
233
|
+
class ToInspect < Serializer
  # Yields the `inspect` form of the input.
  #
  # A single argument is inspected on its own; any other number of
  # arguments is inspected as one array.
  #
  # @param [Array<Object>] args the input record(s)
  # @yield [inspected]
  # @yieldparam [String] inspected
  def process(*args)
    inspected = if args.size == 1
                  args.first.inspect
                else
                  args.inspect
                end
    yield inspected
  end
  register
end
|
244
|
+
|
245
|
+
# A widget for pretty printing input records.
|
246
|
+
#
|
247
|
+
# @example Pretty printing JSON on the command-line
|
248
|
+
#
|
249
|
+
# $ cat input
|
250
|
+
# {"id": 1, "word": "apple" }
|
251
|
+
# $ cat input | wu-local pretty
|
252
|
+
# {
|
253
|
+
# "id":1,
|
254
|
+
# "word":"apple"
|
255
|
+
# }
|
256
|
+
class Pretty < Serializer
  # Yields a pretty-printed version of `record` when it is a
  # String that looks like a JSON object (a line starting with
  # optional whitespace and `{`); otherwise yields `record.to_s`.
  #
  # @param [Object] record
  # @yield [pretty]
  # @yieldparam [String] pretty the pretty-printed record
  def process(record)
    looks_like_json = record.is_a?(String) && record =~ /^\s*\{/
    yield(looks_like_json ? pretty_json(record) : record.to_s)
  end

  # Re-serializes `json` through MultiJson with the :pretty
  # option.  If parsing or dumping raises, the input is returned
  # unchanged.
  #
  # @param [String] json ugly JSON
  # @return [String] prettier JSON
  def pretty_json(json)
    MultiJson.dump(MultiJson.load(json), :pretty => true)
  rescue StandardError
    json
  end

  register
end
|
285
|
+
|
286
|
+
end
|
287
|
+
end
|