wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,156 @@
|
|
1
|
+
require_relative('utils')
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
class Processor
|
5
|
+
|
6
|
+
# A widget that will log all incoming records.
|
7
|
+
#
|
8
|
+
# @example Logging records from the command line
|
9
|
+
#
|
10
|
+
# $ cat input
|
11
|
+
# 1
|
12
|
+
# 2
|
13
|
+
# 3
|
14
|
+
# $ cat input | wu-local logger
|
15
|
+
# 2012-11-28 18:20:46 [INFO] Logger: 1
|
16
|
+
# 2012-11-28 18:20:46 [INFO] Logger: 2
|
17
|
+
# 2012-11-28 18:20:46 [INFO] Logger: 3
|
18
|
+
#
|
19
|
+
# @example Logging records within a dataflow
|
20
|
+
#
|
21
|
+
# Wukong.dataflow(:uses_logger) do
|
22
|
+
# ... | logger
|
23
|
+
# end
|
24
|
+
class Logger < Processor
|
25
|
+
# The level to use for logging.
|
26
|
+
field :level, Symbol, :default => :info
|
27
|
+
|
28
|
+
# Process a given `record` by logging it.
|
29
|
+
#
|
30
|
+
# @param [Object] record
|
31
|
+
def process(record)
|
32
|
+
log.send(level, record)
|
33
|
+
end
|
34
|
+
register
|
35
|
+
end
|
36
|
+
|
37
|
+
# A widget that extracts parts of incoming records.
|
38
|
+
#
|
39
|
+
# This widget can extract part of the following kinds of objects:
|
40
|
+
#
|
41
|
+
# - Hash
|
42
|
+
# - Array
|
43
|
+
# - JSON string
|
44
|
+
# - delimited string ("\t" or "," or other)
|
45
|
+
# - models
|
46
|
+
#
|
47
|
+
# In each case it will attempt to appropriately parse its
|
48
|
+
# <tt>:part</tt> argument.
|
49
|
+
#
|
50
|
+
# @example Extracting a column from an input TSV record on the command-line
|
51
|
+
#
|
52
|
+
# $ cat input
|
53
|
+
# snap crackle pop
|
54
|
+
# 1 2 3
|
55
|
+
# $ cat input | wu-local extract --part=2
|
56
|
+
# crackle
|
57
|
+
# 2
|
58
|
+
#
|
59
|
+
# @example Extracting a column from delimited data with a different delimiter
|
60
|
+
#
|
61
|
+
# $ cat input
|
62
|
+
# snap,crackle,pop
|
63
|
+
# 1,2,3
|
64
|
+
# $ cat input | wu-local extract --part=2 --delimiter=,
|
65
|
+
# crackle
|
66
|
+
# 2
|
67
|
+
#
|
68
|
+
# @example Extracting a field from within some JSON record on the command-line
|
69
|
+
#
|
70
|
+
# $ cat input
|
71
|
+
# {"id": 1, "text": "hi there"}
|
72
|
+
# {"id": 2, "text": "goodbye"}
|
73
|
+
# $ cat input | wu-local extract --part="text"
|
74
|
+
# hi there
|
75
|
+
# goodbye
|
76
|
+
#
|
77
|
+
# This even works on nested keys using a dot ('.') to separate the
|
78
|
+
# keys:
|
79
|
+
#
|
80
|
+
# @example Extracting a nested field from within some JSON record on the command-line
|
81
|
+
#
|
82
|
+
# $ cat input
|
83
|
+
# {"id": 1, "data": {"text": "hi there"}}
|
84
|
+
# {"id": 2, "data": {"text": "goodbye"}}
|
85
|
+
# $ cat input | wu-local extract --part="data.text"
|
86
|
+
# hi there
|
87
|
+
# goodbye
|
88
|
+
#
|
89
|
+
# Objects like Hashes, Arrays, and models, which would have to
|
90
|
+
# be serialized within a command-line flow, can also be extracted from
|
91
|
+
# within a dataflow:
|
92
|
+
#
|
93
|
+
# @example Extracting a field from within a Hash in a dataflow
|
94
|
+
#
|
95
|
+
# Wukong.dataflow(:uses_extract) do
|
96
|
+
# ... | extract(part: 'data.text') | ...
|
97
|
+
# end
|
98
|
+
#
|
99
|
+
# @see DynamicGet
|
100
|
+
class Extract < Processor
|
101
|
+
include DynamicGet
|
102
|
+
|
103
|
+
# The part to extract.
|
104
|
+
field :part, Whatever, :default => nil
|
105
|
+
|
106
|
+
# Extract a `part` of a `record`.
|
107
|
+
#
|
108
|
+
# @param [Object] record
|
109
|
+
# @yield [part]
|
110
|
+
# @yieldparam [Object] part the part extracted from the record
|
111
|
+
def process record
|
112
|
+
yield get(self.part, record)
|
113
|
+
end
|
114
|
+
register
|
115
|
+
end
|
116
|
+
|
117
|
+
class Topic < Processor
|
118
|
+
field :topic, Symbol
|
119
|
+
def process(record)
|
120
|
+
yield perform_action(record)
|
121
|
+
end
|
122
|
+
|
123
|
+
def perform_action(record)
|
124
|
+
assign_topic(record, topic)
|
125
|
+
end
|
126
|
+
|
127
|
+
def assign_topic(record, topic_name)
|
128
|
+
record.define_singleton_method(:topic){ topic_name }
|
129
|
+
record
|
130
|
+
end
|
131
|
+
register
|
132
|
+
end
|
133
|
+
|
134
|
+
# Until further notice, this processor is unusable due to the invocation of yield
|
135
|
+
# class Foreach < Processor
|
136
|
+
# def process(record, &blk)
|
137
|
+
# perform_action(record, &blk)
|
138
|
+
# end
|
139
|
+
# register
|
140
|
+
# end
|
141
|
+
|
142
|
+
class Map < Processor
|
143
|
+
def process(record)
|
144
|
+
yield perform_action(record)
|
145
|
+
end
|
146
|
+
register
|
147
|
+
end
|
148
|
+
|
149
|
+
class Flatten < Processor
|
150
|
+
def process(records)
|
151
|
+
records.respond_to?(:each) ? records.each{ |record| yield(record) } : yield(records)
|
152
|
+
end
|
153
|
+
register
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
require_relative("reducers/accumulator")
|
2
|
+
require_relative("reducers/sort")
|
3
|
+
require_relative("reducers/count")
|
4
|
+
require_relative("reducers/group")
|
5
|
+
require_relative("reducers/group_concat")
|
6
|
+
require_relative("reducers/moments")
|
7
|
+
require_relative("reducers/bin")
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# A base widget for building more complex accumulative widgets.
|
5
|
+
class Accumulator < Processor
|
6
|
+
|
7
|
+
# The current key used to define the current group being
|
8
|
+
# accumulated.
|
9
|
+
attr_accessor :key
|
10
|
+
|
11
|
+
# The current group of records.
|
12
|
+
attr_accessor :group
|
13
|
+
|
14
|
+
# Sets up this accumulator by defining an initial key (with a
|
15
|
+
# value that is unlikely to be found in real data) and calling
|
16
|
+
# `#start` with no record.
|
17
|
+
def setup
|
18
|
+
@key = :__first_group__
|
19
|
+
start(nil)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Processes the `record`.
|
23
|
+
#
|
24
|
+
# If the record is part of the current group (has a key that is
|
25
|
+
# the same as the current key) then will call `accumulate` with
|
26
|
+
# the record.
|
27
|
+
#
|
28
|
+
# If the record has a different key, will call `finalize` and
|
29
|
+
# then call `start` with the record.
|
30
|
+
#
|
31
|
+
# @param [Object] record
|
32
|
+
# @yield [finalized_record] each record yielded by `finalize`
|
33
|
+
# @yieldparam [Object] finalized_record
|
34
|
+
# @see #accumulate
|
35
|
+
# @see #finalize
|
36
|
+
# @see #get_key
|
37
|
+
# @see #start
|
38
|
+
def process(record)
|
39
|
+
this_key = get_key(record)
|
40
|
+
if this_key != self.key
|
41
|
+
finalize { |record| yield record } unless self.key == :__first_group__
|
42
|
+
self.key = this_key
|
43
|
+
start record
|
44
|
+
end
|
45
|
+
accumulate(record)
|
46
|
+
end
|
47
|
+
|
48
|
+
# Starts accumulation for a new group of records with a new key.
|
49
|
+
# This is where you can reset counters, clear caches, &c.
|
50
|
+
#
|
51
|
+
# @param [Object] record
|
52
|
+
def start record
|
53
|
+
end
|
54
|
+
|
55
|
+
# Gets the key from the given +record+. By default a record's
|
56
|
+
# key is just the record itself.
|
57
|
+
#
|
58
|
+
# @param [Object] record
|
59
|
+
# @return [Object] the record's key
|
60
|
+
def get_key record
|
61
|
+
record
|
62
|
+
end
|
63
|
+
|
64
|
+
# Accumulates another +record+.
|
65
|
+
#
|
66
|
+
# Does nothing by default, intended for you to override.
|
67
|
+
#
|
68
|
+
# @param [Object] record
|
69
|
+
def accumulate record
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,318 @@
|
|
1
|
+
module Wukong
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
# A widget for binning input data. Will emit one record per bin: its lower edge, upper edge, and count.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# @example Binning some input data on the command-line
|
9
|
+
#
|
10
|
+
# $ cat input
|
11
|
+
# 0.94628
|
12
|
+
# 0.03480
|
13
|
+
# 0.74418
|
14
|
+
# ...
|
15
|
+
# $ cat input | wu-local bin
|
16
|
+
#
|
17
|
+
# 0.02935 0.12638500000000003 7
|
18
|
+
# 0.12638500000000003 0.22342000000000004 11
|
19
|
+
# 0.22342000000000004 0.32045500000000005 15
|
20
|
+
#
|
21
|
+
# @example Control how the bins are defined and displayed
|
22
|
+
#
|
23
|
+
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1
|
24
|
+
# 0.0 0.1 10.0
|
25
|
+
# 0.1 0.2 12.0
|
26
|
+
# 0.2 0.3 8.0
|
27
|
+
# ...
|
28
|
+
#
|
29
|
+
# @example Include an additional column of normalized (fractional) counts
|
30
|
+
#
|
31
|
+
# $ cat input | wu-local bin --min=0.0 --max=1.0 --num_bins=10 --precision=1 --normalize
|
32
|
+
# 0.0 0.1 10.0 0.3
|
33
|
+
# 0.1 0.2 12.0 0.36
|
34
|
+
# 0.2 0.3 8.0 0.24
|
35
|
+
# ...
|
36
|
+
#
|
37
|
+
# @example Make a log-log histogram
|
38
|
+
#
|
39
|
+
# $ cat input | wu-local bin --log_bins --log_counts
|
40
|
+
# 1.000 3.162 1.099
|
41
|
+
# 3.162 10.000 1.946
|
42
|
+
# 10.000 31.623 3.045
|
43
|
+
# 31.623 100.000 4.234
|
44
|
+
#
|
45
|
+
# This widget works nicely with the Extract widget at the end of a
|
46
|
+
# data flow:
|
47
|
+
#
|
48
|
+
# @example Use the bin at the end of a dataflow
|
49
|
+
#
|
50
|
+
# Wukong.dataflow(:bins_at_end) do
|
51
|
+
# ... | extract(part: 'age') | bin(num_bins: 10)
|
52
|
+
# end
|
53
|
+
#
|
54
|
+
# @see Accumulator
|
55
|
+
# @see Extract
|
56
|
+
class Bin < Accumulator
|
57
|
+
|
58
|
+
field :num_bins, Integer
|
59
|
+
field :edges, Array
|
60
|
+
field :min, Float
|
61
|
+
field :max, Float
|
62
|
+
|
63
|
+
field :format_string, String
|
64
|
+
field :precision, Integer, :default => 3
|
65
|
+
|
66
|
+
include DynamicGet
|
67
|
+
field :by, Whatever
|
68
|
+
|
69
|
+
field :log_bins, :boolean, :default => false
|
70
|
+
field :log_counts, :boolean, :default => false
|
71
|
+
field :base, Float, :default => Math::E
|
72
|
+
|
73
|
+
field :normalize, :boolean, :default => false
|
74
|
+
|
75
|
+
# The accumulated values
|
76
|
+
attr_accessor :values
|
77
|
+
|
78
|
+
# The bins (pairs of edges)
|
79
|
+
attr_accessor :bins
|
80
|
+
|
81
|
+
# The value counts within each bin.
|
82
|
+
attr_accessor :counts
|
83
|
+
|
84
|
+
# The total number of accumulated values.
|
85
|
+
attr_accessor :total_count
|
86
|
+
|
87
|
+
# Resets every piece of accumulated state (values, bins, counts and
# the running total) after delegating to the parent's setup.
#
# When explicit `edges` were supplied the bins are rebuilt from
# them; otherwise, if `min`, `max` and `num_bins` are all known, the
# edges (and therefore the bins) are computed up front.  In any
# other case binning is deferred until the values have been seen.
def setup
  super()
  self.values, self.bins, self.counts = [], [], []
  self.total_count = 0
  return set_bins_and_counts_from_edges! unless edges.nil?
  set_edges_from_min_max_and_num_bins! if min && max && num_bins
end
|
101
|
+
|
102
|
+
# Every record maps to one constant key, so that (as far as the
# Accumulator is concerned) all input belongs to a single group.
#
# @param [Object] record ignored
# @return [:__first__group__]
def get_key(record)
  :__first__group__
end
|
110
|
+
|
111
|
+
# Folds one `record` into the histogram.
#
# The numeric value is pulled out of the record first; records that
# yield no value are skipped without being counted.  If bins already
# exist the value is counted immediately.  Otherwise the value is
# stored for later binning and `min` / `max` are widened to cover it.
#
# @param [Object] record
def accumulate(record)
  value = value_from(record)
  return unless value
  self.total_count += 1
  return add_to_some_bin(value) if bins?
  self.min = value if min.nil? || value < min
  self.max = value if max.nil? || value > max
  values << value
end
|
132
|
+
|
133
|
+
# Emits each bin with its edges and count.  Adds the normalized
# count if requested.
#
# Will bin the values if we haven't done so on the fly already.
#
# Each emitted row is built from a copy of the bin's edges, so the
# stored `bins` are left untouched and calling this method again
# yields the same output.
#
# @yield [line] one tab-separated line per bin
# @yieldparam [String] line the formatted lower edge, upper edge and
#   (logarithmic if requested) count, followed by the (logarithmic
#   if requested) normalized count when `normalize` is set and at
#   least one value was counted
def finalize
  bin! unless bins?
  counts.each_with_index do |count, index|
    # `+` builds a new array; the original code appended to the
    # stored bin in place, corrupting `bins` for any later use.
    row = bins[index] + [log_count_if_necessary(count)]
    if normalize && total_count > 0
      row << log_count_if_necessary(count.to_f / total_count.to_f)
    end
    yield row.map { |n| format(n) }.join("\t")
  end
end
|
154
|
+
|
155
|
+
# Formats `n` so it's readable and compact.
#
# If this widget is given an explicit `format_string` then it
# will be used here (the value of `format_string` should have a
# slot for a float).
#
# Otherwise zero is rendered as "0.0", large (|n| > 1000) or small
# (|n| < 0.001) numbers are formatted in scientific notation, and
# everything in between is printed in fixed-point, all with the
# given `precision`.
#
# NOTE: within this class this method shadows Kernel#format.
#
# @param [Numeric] n
# @return [String]
def format n
  return format_string % n if format_string
  # Always hand back a String, as documented (the zero case used to
  # return the Float 0.0).
  return "0.0" if n == 0.0

  magnitude = n.abs
  if magnitude > 1000 || magnitude < 0.001
    "%#{precision}.#{precision}E" % n
  else
    "%#{precision}.#{precision}f" % n
  end
end
|
179
|
+
|
180
|
+
# Bins the accumulated values.
#
# Picks a number of bins from the total count when none was given,
# computes the edges from `min`, `max` and `num_bins`, then drains
# the stored values into their bins.
#
# Does nothing when `min` or `max` is still unknown (i.e. no value
# was ever accumulated and none was configured), since no edges can
# be computed in that case.
#
# @see #bins?
def bin!
  return if min.nil? || max.nil?
  set_num_bins_from_total_count! unless self.num_bins
  set_edges_from_min_max_and_num_bins!
  until values.empty?
    value = values.shift
    add_to_some_bin(value.to_f) if value
  end
end
|
191
|
+
|
192
|
+
# Does this widget have a populated list of bins?
#
# Always answers with a strict boolean (never `nil`), even when
# `bins` has not been initialized yet.
#
# @return [true, false]
def bins?
  !!bins && !bins.empty?
end
|
198
|
+
|
199
|
+
# Extracts the number to be binned from `record`.
#
# The raw value is looked up with `get` using the `by` field and
# then coerced with `to_f`.  A missing value, or one whose coercion
# raises, produces `nil`.
#
# @param [Object] record
# @return [Float, nil]
def value_from(record)
  extracted = get(self.by, record)
  return unless extracted
  begin
    extracted.to_f
  rescue StandardError
    nil
  end
end
|
208
|
+
|
209
|
+
# Passes `val` through untouched unless `log_counts` is set, in
# which case its logarithm (where one can be taken) is returned.
#
# @param [Float] val
# @return [Float] the original value or its logarithm if required
def log_count_if_necessary(val)
  return val unless log_counts
  log_if_possible(val)
end
|
217
|
+
|
218
|
+
# Takes the logarithm of `val` to the configured `base`.
#
# Non-positive values (zero included) have no logarithm and are
# handed back unchanged.
#
# @param [Float] val
# @return [Float]
def log_if_possible(val)
  return val unless val > 0
  Math.log(val, base)
end
|
227
|
+
|
228
|
+
private
|
229
|
+
|
230
|
+
# :nodoc
# Coerces the incoming minimum to a Float and stores it, refusing a
# value that is not strictly below an already-known maximum.
def receive_min(new_min)
  candidate = new_min.to_f
  if max && candidate >= max
    raise Error.new("The minimum value must be strictly less than the maximum value")
  end
  @min = candidate
end
|
235
|
+
|
236
|
+
# :nodoc
# Coerces the incoming maximum to a Float and stores it, refusing a
# value that is not strictly above an already-known minimum.
def receive_max(new_max)
  candidate = new_max.to_f
  if min && candidate <= min
    raise Error.new("The maximum value must be strictly greater than the minimum value")
  end
  @max = candidate
end
|
241
|
+
|
242
|
+
# :nodoc
# Coerces the requested number of bins to an Integer and stores it;
# anything that does not come out greater than zero is rejected.
def receive_num_bins(n)
  requested = n.to_i
  unless requested > 0
    raise Error.new("The number of bins must be a postive-definite integer")
  end
  @num_bins = requested
end
|
247
|
+
|
248
|
+
# :nodoc
# Stores explicit bin edges and rebuilds the bins from them.
#
# Accepts either a comma-separated String or an Array; each entry is
# coerced with `to_f` and the result is sorted ascending.  Any other
# input (including nil) clears the edges instead of raising, and in
# that case the bins are left as they are.
#
# Returns the stored edges (an Array of Floats) or nil.
def receive_edges es
  raw = case es
        when String then es.split(',')
        when Array  then es
        end
  # `raw` is nil for unsupported input; calling `map` on it directly
  # would raise NoMethodError.
  @edges = raw && raw.map(&:to_f).sort
  set_bins_and_counts_from_edges! if @edges
  @edges
end
|
257
|
+
|
258
|
+
# :nodoc
# Chooses the number of bins as the integer part of the square root
# of the total count, but never fewer than one, so that a count of
# zero cannot produce zero bins.
def set_num_bins_from_total_count!
  self.num_bins = [Math.sqrt(total_count).to_i, 1].max
end
|
262
|
+
|
263
|
+
# :nodoc
# Derives the bins from the current `edges`: each pair of
# neighbouring edges becomes one [lower, upper] bin, and every bin
# starts with a count of zero.
def set_bins_and_counts_from_edges!
  @bins   = edges.each_cons(2).to_a
  @counts = Array.new(bins.length, 0)
end
|
272
|
+
|
273
|
+
# :nodoc
# Computes evenly spaced edges between `min` and `max` and rebuilds
# the bins from them.
#
# With `log_bins` the spacing is even in log space (to `base`): the
# endpoints are passed through `log_if_possible`, the edges are
# spaced evenly between the results, and each edge is mapped back by
# raising `base` to it.  Note that `log_if_possible` hands back
# non-positive endpoints un-logged.
#
# Each interior edge is computed directly from its index rather than
# by repeatedly adding the bin width, so floating-point drift cannot
# create an extra edge just below the maximum.
def set_edges_from_min_max_and_num_bins!
  if log_bins
    bin_min = log_if_possible(min)
    bin_max = log_if_possible(max)
  else
    bin_min = min
    bin_max = max
  end

  bin_diff = (bin_max - bin_min).to_f / num_bins

  # A non-positive (or NaN) width means the range is degenerate; use
  # the two endpoints only.
  interior = if bin_diff > 0
               (1...num_bins).map { |i| bin_min + i * bin_diff }
             else
               []
             end
  e = [bin_min] + interior.select { |edge| edge < bin_max } + [bin_max]

  # Undo the logarithm with the same base that was used to take it.
  self.edges = log_bins ? e.map { |n| base ** n } : e
  set_bins_and_counts_from_edges!
end
|
301
|
+
|
302
|
+
# :nodoc:
# Increments the count of the bin containing `value`.
#
# Bins are half-open, [lower, upper).  A value that falls in none of
# them is clamped to the nearest end: below the lowest edge it is
# counted in the first bin, otherwise (at or above the highest edge,
# e.g. the maximal element) in the last bin.  Does nothing when
# there are no bins.
def add_to_some_bin value
  return if bins.empty?
  # OPTIMIZE: linear scan over the bins.
  index = bins.index { |lower, upper| value >= lower && value < upper }
  index ||= (value < bins.first.first ? 0 : bins.length - 1)
  counts[index] += 1
end
|
314
|
+
|
315
|
+
register
|
316
|
+
end
|
317
|
+
end
|
318
|
+
end
|