wukong 3.0.0.pre → 3.0.0.pre2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +46 -33
- data/.gitmodules +3 -0
- data/.rspec +1 -1
- data/.travis.yml +8 -1
- data/.yardopts +0 -13
- data/Guardfile +4 -6
- data/{LICENSE.textile → LICENSE.md} +43 -55
- data/README-old.md +422 -0
- data/README.md +279 -418
- data/Rakefile +21 -5
- data/TODO.md +6 -6
- data/bin/wu-clean-encoding +31 -0
- data/bin/wu-lign +2 -2
- data/bin/wu-local +69 -0
- data/bin/wu-server +70 -0
- data/examples/Gemfile +38 -0
- data/examples/README.md +9 -0
- data/examples/dataflow/apache_log_line.rb +64 -25
- data/examples/dataflow/fibonacci_series.rb +101 -0
- data/examples/dataflow/parse_apache_logs.rb +37 -7
- data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
- data/examples/dataflow/simple.rb +4 -4
- data/examples/geo.rb +4 -0
- data/examples/geo/geo_grids.numbers +0 -0
- data/examples/geo/geolocated.rb +331 -0
- data/examples/geo/quadtile.rb +69 -0
- data/examples/geo/spec/geolocated_spec.rb +247 -0
- data/examples/geo/tile_fetcher.rb +77 -0
- data/examples/graph/minimum_spanning_tree.rb +61 -61
- data/examples/jabberwocky.txt +36 -0
- data/examples/models/wikipedia.rb +20 -0
- data/examples/munging/Gemfile +8 -0
- data/examples/munging/airline_flights/airline.rb +57 -0
- data/examples/munging/airline_flights/airline_flights.rake +83 -0
- data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
- data/examples/munging/airline_flights/airport.rb +211 -0
- data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
- data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
- data/examples/munging/airline_flights/flight.rb +156 -0
- data/examples/munging/airline_flights/models.rb +4 -0
- data/examples/munging/airline_flights/parse.rb +26 -0
- data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
- data/examples/munging/airline_flights/route.rb +35 -0
- data/examples/munging/airline_flights/tasks.rake +83 -0
- data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
- data/examples/munging/airline_flights/topcities.rb +167 -0
- data/examples/munging/airports/40_wbans.txt +40 -0
- data/examples/munging/airports/filter_weather_reports.rb +37 -0
- data/examples/munging/airports/join.pig +31 -0
- data/examples/munging/airports/to_tsv.rb +33 -0
- data/examples/munging/airports/usa_wbans.pig +19 -0
- data/examples/munging/airports/usa_wbans.txt +2157 -0
- data/examples/munging/airports/wbans.pig +19 -0
- data/examples/munging/airports/wbans.txt +2310 -0
- data/examples/munging/geo/geo_json.rb +54 -0
- data/examples/munging/geo/geo_models.rb +69 -0
- data/examples/munging/geo/geonames_models.rb +78 -0
- data/examples/munging/geo/iso_codes.rb +172 -0
- data/examples/munging/geo/reconcile_countries.rb +124 -0
- data/examples/munging/geo/tasks.rake +71 -0
- data/examples/munging/rake_helper.rb +62 -0
- data/examples/munging/weather/.gitignore +1 -0
- data/examples/munging/weather/Gemfile +4 -0
- data/examples/munging/weather/Rakefile +28 -0
- data/examples/munging/weather/extract_ish.rb +13 -0
- data/examples/munging/weather/models/weather.rb +119 -0
- data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
- data/examples/munging/wikipedia/README.md +34 -0
- data/examples/munging/wikipedia/Rakefile +193 -0
- data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
- data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
- data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
- data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
- data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
- data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
- data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
- data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
- data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
- data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
- data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
- data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
- data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
- data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
- data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
- data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
- data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
- data/examples/munging/wikipedia/pig_style_guide.md +25 -0
- data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
- data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
- data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
- data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
- data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
- data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
- data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
- data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
- data/examples/munging/wikipedia/utils/namespaces.json +1 -0
- data/examples/rake_helper.rb +85 -0
- data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
- data/examples/server_logs/logline.rb +95 -0
- data/examples/server_logs/models.rb +66 -0
- data/examples/server_logs/page_counts.pig +48 -0
- data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
- data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
- data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
- data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
- data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
- data/examples/string_reverser.rb +26 -0
- data/examples/text/pig_latin.rb +2 -2
- data/examples/text/regional_flavor/README.md +14 -0
- data/examples/text/regional_flavor/article_wordbags.pig +39 -0
- data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
- data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
- data/examples/word_count/accumulator.rb +26 -0
- data/examples/word_count/tokenizer.rb +13 -0
- data/examples/word_count/word_count.rb +6 -0
- data/examples/workflow/cherry_pie.dot +97 -0
- data/examples/workflow/cherry_pie.png +0 -0
- data/examples/workflow/cherry_pie.rb +61 -26
- data/lib/hanuman.rb +34 -7
- data/lib/hanuman/graph.rb +55 -31
- data/lib/hanuman/graphvizzer.rb +199 -178
- data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
- data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
- data/lib/hanuman/link.rb +35 -0
- data/lib/hanuman/registry.rb +46 -0
- data/lib/hanuman/stage.rb +76 -32
- data/lib/wukong.rb +23 -24
- data/lib/wukong/boot.rb +87 -0
- data/lib/wukong/configuration.rb +8 -0
- data/lib/wukong/dataflow.rb +45 -78
- data/lib/wukong/driver.rb +99 -0
- data/lib/wukong/emitter.rb +22 -0
- data/lib/wukong/model/faker.rb +24 -24
- data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
- data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
- data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
- data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
- data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
- data/lib/wukong/processor.rb +60 -114
- data/lib/wukong/spec_helpers.rb +81 -0
- data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
- data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
- data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
- data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
- data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
- data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
- data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
- data/lib/wukong/version.rb +2 -1
- data/lib/wukong/widget/filters.rb +311 -0
- data/lib/wukong/widget/processors.rb +156 -0
- data/lib/wukong/widget/reducers.rb +7 -0
- data/lib/wukong/widget/reducers/accumulator.rb +73 -0
- data/lib/wukong/widget/reducers/bin.rb +318 -0
- data/lib/wukong/widget/reducers/count.rb +61 -0
- data/lib/wukong/widget/reducers/group.rb +85 -0
- data/lib/wukong/widget/reducers/group_concat.rb +70 -0
- data/lib/wukong/widget/reducers/moments.rb +72 -0
- data/lib/wukong/widget/reducers/sort.rb +130 -0
- data/lib/wukong/widget/serializers.rb +287 -0
- data/lib/wukong/widget/sink.rb +10 -52
- data/lib/wukong/widget/source.rb +7 -113
- data/lib/wukong/widget/utils.rb +46 -0
- data/lib/wukong/widgets.rb +6 -0
- data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
- data/spec/examples/dataflow/parsing_spec.rb +12 -11
- data/spec/examples/dataflow/simple_spec.rb +32 -6
- data/spec/examples/dataflow/telegram_spec.rb +36 -36
- data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
- data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
- data/spec/examples/munging/airline_flights_spec.rb +202 -0
- data/spec/examples/text/pig_latin_spec.rb +13 -16
- data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
- data/spec/hanuman/graph_spec.rb +27 -2
- data/spec/hanuman/hanuman_spec.rb +10 -0
- data/spec/hanuman/registry_spec.rb +123 -0
- data/spec/hanuman/stage_spec.rb +61 -7
- data/spec/spec_helper.rb +29 -19
- data/spec/support/hanuman_test_helpers.rb +14 -12
- data/spec/support/shared_context_for_reducers.rb +37 -0
- data/spec/support/shared_examples_for_builders.rb +101 -0
- data/spec/support/shared_examples_for_shortcuts.rb +57 -0
- data/spec/support/wukong_test_helpers.rb +37 -11
- data/spec/wukong/dataflow_spec.rb +77 -55
- data/spec/wukong/local_runner_spec.rb +24 -24
- data/spec/wukong/model/faker_spec.rb +132 -131
- data/spec/wukong/runner_spec.rb +8 -8
- data/spec/wukong/widget/filters_spec.rb +61 -0
- data/spec/wukong/widget/processors_spec.rb +126 -0
- data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
- data/spec/wukong/widget/reducers/count_spec.rb +11 -0
- data/spec/wukong/widget/reducers/group_spec.rb +20 -0
- data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
- data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
- data/spec/wukong/widget/serializers_spec.rb +92 -0
- data/spec/wukong/widget/sink_spec.rb +15 -15
- data/spec/wukong/widget/source_spec.rb +65 -41
- data/spec/wukong/wukong_spec.rb +10 -0
- data/wukong.gemspec +17 -10
- metadata +359 -335
- data/.document +0 -5
- data/VERSION +0 -1
- data/bin/hdp-bin +0 -44
- data/bin/hdp-bzip +0 -23
- data/bin/hdp-cat +0 -3
- data/bin/hdp-catd +0 -3
- data/bin/hdp-cp +0 -3
- data/bin/hdp-du +0 -86
- data/bin/hdp-get +0 -3
- data/bin/hdp-kill +0 -3
- data/bin/hdp-kill-task +0 -3
- data/bin/hdp-ls +0 -11
- data/bin/hdp-mkdir +0 -2
- data/bin/hdp-mkdirp +0 -12
- data/bin/hdp-mv +0 -3
- data/bin/hdp-parts_to_keys.rb +0 -77
- data/bin/hdp-ps +0 -3
- data/bin/hdp-put +0 -3
- data/bin/hdp-rm +0 -32
- data/bin/hdp-sort +0 -40
- data/bin/hdp-stream +0 -40
- data/bin/hdp-stream-flat +0 -22
- data/bin/hdp-stream2 +0 -39
- data/bin/hdp-sync +0 -17
- data/bin/hdp-wc +0 -67
- data/bin/wu-flow +0 -10
- data/bin/wu-map +0 -17
- data/bin/wu-red +0 -17
- data/bin/wukong +0 -17
- data/data/CREDITS.md +0 -355
- data/data/graph/airfares.tsv +0 -2174
- data/data/text/gift_of_the_magi.txt +0 -225
- data/data/text/jabberwocky.txt +0 -36
- data/data/text/rectification_of_names.txt +0 -33
- data/data/twitter/a_atsigns_b.tsv +0 -64
- data/data/twitter/a_follows_b.tsv +0 -53
- data/data/twitter/tweet.tsv +0 -167
- data/data/twitter/twitter_user.tsv +0 -55
- data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
- data/docpages/INSTALL.textile +0 -92
- data/docpages/LICENSE.textile +0 -107
- data/docpages/README-elastic_map_reduce.textile +0 -377
- data/docpages/README-performance.textile +0 -90
- data/docpages/README-wulign.textile +0 -65
- data/docpages/UsingWukong-part1-get_ready.textile +0 -17
- data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
- data/docpages/UsingWukong-part3-parsing.textile +0 -138
- data/docpages/_config.yml +0 -39
- data/docpages/avro/avro_notes.textile +0 -56
- data/docpages/avro/performance.textile +0 -36
- data/docpages/avro/tethering.textile +0 -19
- data/docpages/bigdata-tips.textile +0 -143
- data/docpages/code/api_response_example.txt +0 -20
- data/docpages/code/parser_skeleton.rb +0 -38
- data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
- data/docpages/favicon.ico +0 -0
- data/docpages/gem.css +0 -16
- data/docpages/hadoop-tips.textile +0 -83
- data/docpages/index.textile +0 -92
- data/docpages/intro.textile +0 -8
- data/docpages/moreinfo.textile +0 -174
- data/docpages/news.html +0 -24
- data/docpages/pig/PigLatinExpressionsList.txt +0 -122
- data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
- data/docpages/pig/commandline_params.txt +0 -26
- data/docpages/pig/cookbook.html +0 -481
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +0 -1103
- data/docpages/pig/piglatin_ref2.html +0 -14340
- data/docpages/pig/setup.html +0 -505
- data/docpages/pig/skin/basic.css +0 -166
- data/docpages/pig/skin/breadcrumbs.js +0 -237
- data/docpages/pig/skin/fontsize.js +0 -166
- data/docpages/pig/skin/getBlank.js +0 -40
- data/docpages/pig/skin/getMenu.js +0 -45
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +0 -54
- data/docpages/pig/skin/profile.css +0 -181
- data/docpages/pig/skin/screen.css +0 -587
- data/docpages/pig/tutorial.html +0 -1059
- data/docpages/pig/udf.html +0 -1509
- data/docpages/tutorial.textile +0 -283
- data/docpages/usage.textile +0 -195
- data/docpages/wutils.textile +0 -263
- data/examples/dataflow/complex.rb +0 -11
- data/examples/dataflow/donuts.rb +0 -13
- data/examples/tiny_count/jabberwocky_output.tsv +0 -92
- data/examples/word_count.rb +0 -48
- data/examples/workflow/fiddle.rb +0 -24
- data/lib/away/escapement.rb +0 -129
- data/lib/away/exe.rb +0 -11
- data/lib/away/experimental.rb +0 -5
- data/lib/away/from_file.rb +0 -52
- data/lib/away/job.rb +0 -56
- data/lib/away/job/rake_compat.rb +0 -17
- data/lib/away/registry.rb +0 -79
- data/lib/away/runner.rb +0 -276
- data/lib/away/runner/execute.rb +0 -121
- data/lib/away/script.rb +0 -161
- data/lib/away/script/hadoop_command.rb +0 -240
- data/lib/away/source/file_list_source.rb +0 -15
- data/lib/away/source/looper.rb +0 -18
- data/lib/away/task.rb +0 -219
- data/lib/hanuman/action.rb +0 -21
- data/lib/hanuman/chain.rb +0 -4
- data/lib/hanuman/graphviz.rb +0 -74
- data/lib/hanuman/resource.rb +0 -6
- data/lib/hanuman/slot.rb +0 -87
- data/lib/hanuman/slottable.rb +0 -220
- data/lib/wukong/bad_record.rb +0 -15
- data/lib/wukong/event.rb +0 -44
- data/lib/wukong/local_runner.rb +0 -55
- data/lib/wukong/mapred.rb +0 -3
- data/lib/wukong/universe.rb +0 -48
- data/lib/wukong/widget/filter.rb +0 -81
- data/lib/wukong/widget/gibberish.rb +0 -123
- data/lib/wukong/widget/monitor.rb +0 -26
- data/lib/wukong/widget/reducer.rb +0 -66
- data/lib/wukong/widget/stringifier.rb +0 -50
- data/lib/wukong/workflow.rb +0 -22
- data/lib/wukong/workflow/command.rb +0 -42
- data/old/config/emr-example.yaml +0 -48
- data/old/examples/README.txt +0 -17
- data/old/examples/contrib/jeans/README.markdown +0 -165
- data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
- data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
- data/old/examples/contrib/jeans/data/sizes +0 -3
- data/old/examples/contrib/jeans/normalize.rb +0 -20
- data/old/examples/contrib/jeans/sizes.rb +0 -55
- data/old/examples/corpus/bnc_word_freq.rb +0 -44
- data/old/examples/corpus/bucket_counter.rb +0 -47
- data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
- data/old/examples/corpus/sentence_bigrams.rb +0 -53
- data/old/examples/corpus/sentence_coocurrence.rb +0 -66
- data/old/examples/corpus/stopwords.rb +0 -138
- data/old/examples/corpus/words_to_bigrams.rb +0 -53
- data/old/examples/emr/README.textile +0 -110
- data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
- data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
- data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
- data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
- data/old/examples/network_graph/adjacency_list.rb +0 -74
- data/old/examples/network_graph/breadth_first_search.rb +0 -72
- data/old/examples/network_graph/gen_2paths.rb +0 -68
- data/old/examples/network_graph/gen_multi_edge.rb +0 -112
- data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
- data/old/examples/pagerank/README.textile +0 -6
- data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
- data/old/examples/pagerank/pagerank.rb +0 -72
- data/old/examples/pagerank/pagerank_initialize.rb +0 -42
- data/old/examples/pagerank/run_pagerank.sh +0 -21
- data/old/examples/sample_records.rb +0 -33
- data/old/examples/server_logs/apache_log_parser.rb +0 -15
- data/old/examples/server_logs/nook.rb +0 -48
- data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
- data/old/examples/server_logs/user_agent.rb +0 -40
- data/old/examples/simple_word_count.rb +0 -82
- data/old/examples/size.rb +0 -61
- data/old/examples/stats/avg_value_frequency.rb +0 -86
- data/old/examples/stats/binning_percentile_estimator.rb +0 -140
- data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
- data/old/examples/stats/rank_and_bin.rb +0 -173
- data/old/examples/stupidly_simple_filter.rb +0 -40
- data/old/examples/word_count.rb +0 -75
- data/old/graph/graphviz_builder.rb +0 -580
- data/old/graph_easy/Attributes.pm +0 -4181
- data/old/graph_easy/Graphviz.pm +0 -2232
- data/old/wukong.rb +0 -18
- data/old/wukong/and_pig.rb +0 -38
- data/old/wukong/bad_record.rb +0 -18
- data/old/wukong/datatypes.rb +0 -24
- data/old/wukong/datatypes/enum.rb +0 -127
- data/old/wukong/datatypes/fake_types.rb +0 -17
- data/old/wukong/decorator.rb +0 -28
- data/old/wukong/encoding/asciize.rb +0 -108
- data/old/wukong/extensions.rb +0 -16
- data/old/wukong/extensions/array.rb +0 -18
- data/old/wukong/extensions/blank.rb +0 -93
- data/old/wukong/extensions/class.rb +0 -189
- data/old/wukong/extensions/date_time.rb +0 -53
- data/old/wukong/extensions/emittable.rb +0 -69
- data/old/wukong/extensions/enumerable.rb +0 -79
- data/old/wukong/extensions/hash.rb +0 -167
- data/old/wukong/extensions/hash_keys.rb +0 -16
- data/old/wukong/extensions/hash_like.rb +0 -150
- data/old/wukong/extensions/hashlike_class.rb +0 -47
- data/old/wukong/extensions/module.rb +0 -2
- data/old/wukong/extensions/pathname.rb +0 -27
- data/old/wukong/extensions/string.rb +0 -65
- data/old/wukong/extensions/struct.rb +0 -17
- data/old/wukong/extensions/symbol.rb +0 -11
- data/old/wukong/filename_pattern.rb +0 -74
- data/old/wukong/helper.rb +0 -7
- data/old/wukong/helper/stopwords.rb +0 -195
- data/old/wukong/helper/tokenize.rb +0 -35
- data/old/wukong/logger.rb +0 -38
- data/old/wukong/periodic_monitor.rb +0 -72
- data/old/wukong/schema.rb +0 -269
- data/old/wukong/script.rb +0 -286
- data/old/wukong/script/avro_command.rb +0 -5
- data/old/wukong/script/cassandra_loader_script.rb +0 -40
- data/old/wukong/script/emr_command.rb +0 -168
- data/old/wukong/script/hadoop_command.rb +0 -237
- data/old/wukong/script/local_command.rb +0 -41
- data/old/wukong/store.rb +0 -10
- data/old/wukong/store/base.rb +0 -27
- data/old/wukong/store/cassandra.rb +0 -10
- data/old/wukong/store/cassandra/streaming.rb +0 -75
- data/old/wukong/store/cassandra/struct_loader.rb +0 -21
- data/old/wukong/store/cassandra_model.rb +0 -91
- data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
- data/old/wukong/store/chunked_flat_file_store.rb +0 -48
- data/old/wukong/store/conditional_store.rb +0 -57
- data/old/wukong/store/factory.rb +0 -8
- data/old/wukong/store/flat_file_store.rb +0 -89
- data/old/wukong/store/key_store.rb +0 -51
- data/old/wukong/store/null_store.rb +0 -15
- data/old/wukong/store/read_thru_store.rb +0 -22
- data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
- data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
- data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
- data/old/wukong/streamer.rb +0 -30
- data/old/wukong/streamer/accumulating_reducer.rb +0 -83
- data/old/wukong/streamer/base.rb +0 -126
- data/old/wukong/streamer/counting_reducer.rb +0 -25
- data/old/wukong/streamer/filter.rb +0 -20
- data/old/wukong/streamer/instance_streamer.rb +0 -15
- data/old/wukong/streamer/json_streamer.rb +0 -21
- data/old/wukong/streamer/line_streamer.rb +0 -12
- data/old/wukong/streamer/list_reducer.rb +0 -31
- data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
- data/old/wukong/streamer/record_streamer.rb +0 -14
- data/old/wukong/streamer/reducer.rb +0 -11
- data/old/wukong/streamer/set_reducer.rb +0 -14
- data/old/wukong/streamer/struct_streamer.rb +0 -48
- data/old/wukong/streamer/summing_reducer.rb +0 -29
- data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
- data/old/wukong/typed_struct.rb +0 -12
- data/spec/away/encoding_spec.rb +0 -32
- data/spec/away/exe_spec.rb +0 -20
- data/spec/away/flow_spec.rb +0 -82
- data/spec/away/graph_spec.rb +0 -6
- data/spec/away/job_spec.rb +0 -15
- data/spec/away/rake_compat_spec.rb +0 -9
- data/spec/away/script_spec.rb +0 -81
- data/spec/hanuman/graphviz_spec.rb +0 -29
- data/spec/hanuman/slot_spec.rb +0 -2
- data/spec/support/examples_helper.rb +0 -10
- data/spec/support/streamer_test_helpers.rb +0 -6
- data/spec/support/wukong_widget_helpers.rb +0 -66
- data/spec/wukong/processor_spec.rb +0 -109
- data/spec/wukong/widget/filter_spec.rb +0 -99
- data/spec/wukong/widget/stringifier_spec.rb +0 -51
- data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,156 @@
|
|
1
|
+
# Raw data:
|
2
|
+
# Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Can
|
3
|
+
# 2007,1,1,1,1232,1225,1341,1340,WN,2891,N351,69,75,54,1,7,SMF,ONT,389,4,11,0,,0,0,0,0,0,0
|
4
|
+
|
5
|
+
# One row of the raw airline on-time data. Fields are declared in the column
# order of the source CSV (position 1 = Year ... position 29 = late-aircraft
# delay). Use #to_airline_flight to convert a raw row into an AirlineFlight.
class RawAirlineFlight
  include Gorillib::Model

  field :date_year, Integer, position: 1, doc: "Year (1987-2008)"
  field :date_month, Integer, position: 2, doc: "Month (1-12)"
  field :date_day, Integer, position: 3, doc: "Day of month (1-31)"
  field :day_of_week, Integer, position: 4, doc: "Day of week -- 1 (Monday) - 7 (Sunday)"
  #
  field :act_dep_tod, String, position: 5, doc: "time of day for actual departure (local, hhmm)", blankish: [nil, '', 'NA']
  field :crs_dep_tod, String, position: 6, doc: "time of day for scheduled departure (local, hhmm)"
  field :act_arr_tod, String, position: 7, doc: "time of day for actual arrival (local, hhmm). Not adjusted for wrap-around.", blankish: [nil, '', 'NA']
  field :crs_arr_tod, String, position: 8, doc: "time of day for scheduled arrival (local, hhmm). Not adjusted for wrap-around."
  #
  field :unique_carrier, String, position: 9, doc: "unique carrier code", validates: { length: { in: 0..5 } }
  field :flight_num, Integer, position: 10, doc: "flight number"
  field :tail_num, String, position: 11, doc: "plane tail number", validates: { length: { in: 0..8 } }
  #
  field :act_duration, Integer, position: 12, doc: "actual flight time, in minutes", blankish: [nil, '', 'NA']
  field :crs_duration, Integer, position: 13, doc: "CRS flight time, in minutes"
  field :air_duration, Integer, position: 14, doc: "Air time, in minutes", blankish: [nil, '', 'NA']
  field :arr_delay, Integer, position: 15, doc: "arrival delay, in minutes", blankish: [nil, '', 'NA']
  field :dep_delay, Integer, position: 16, doc: "departure delay, in minutes", blankish: [nil, '', 'NA']
  field :from_airport, String, position: 17, doc: "Origin IATA airport code", validates: { length: { in: 0..3 } }
  field :into_airport, String, position: 18, doc: "Destination IATA airport code", validates: { length: { in: 0..3 } }
  field :distance_mi, Integer, position: 19, doc: "Flight distance, in miles"
  field :taxi_in_duration, Integer, position: 20, doc: "taxi in time, in minutes", blankish: [nil, '', 'NA']
  field :taxi_out_duration, Integer, position: 21, doc: "taxi out time in minutes", blankish: [nil, '', 'NA']
  #
  field :is_cancelled, :boolean_10, position: 22, doc: "was the flight cancelled?"
  field :cancellation_code, String, position: 23, doc: "Reason for cancellation (A = carrier, B = weather, C = NAS, D = security, Z = no cancellation)"
  field :is_diverted, :boolean_10, position: 24, doc: "Was the plane diverted?"
  field :carrier_delay, Integer, position: 25, doc: "in minutes"
  field :weather_delay, Integer, position: 26, doc: "in minutes"
  field :nas_delay, Integer, position: 27, doc: "in minutes"
  field :security_delay, Integer, position: 28, doc: "in minutes"
  field :late_aircraft_delay, Integer, position: 29, doc: "in minutes"

  # Calendar date of the flight, built from the year / month / day fields.
  #
  # @return [Time] midnight of the flight date (Time.new, i.e. the process's local zone)
  def flight_date
    Time.new(date_year, date_month, date_day)
  end

  # Uses the year / month / day, along with an "hhmm" string, to build an
  # integer epoch-seconds timestamp. The hhmm value is fed straight to
  # Time.utc; no timezone lookup happens here.
  #
  # @param val [String, Integer] time of day as hhmm (e.g. "1232")
  # @param fencepost [Integer, nil] epoch seconds the result must not precede;
  #   when the computed time is earlier than this, one day is added
  # @return [Integer] epoch seconds
  def inttime_from_hhmm(val, fencepost=nil)
    hour, minutes = [val.to_i / 100, val.to_i % 100]
    res = Time.utc(date_year, date_month, date_day, hour, minutes)
    # if before fencepost, we wrapped around in time
    res += (24 * 60 * 60) if fencepost && (res.to_i < fencepost)
    res.to_i
  end

  # Epoch-second versions of the four hhmm time-of-day fields. The actual
  # times return nil when their hhmm field is missing; arrivals use the
  # matching departure as the fencepost so an arrival that precedes its
  # departure is pushed to the next day.
  def act_dep_itime ; @act_dep_itime = inttime_from_hhmm(act_dep_tod) if act_dep_tod ; end
  def crs_dep_itime ; @crs_dep_itime = inttime_from_hhmm(crs_dep_tod) ; end
  def act_arr_itime ; @act_arr_itime = inttime_from_hhmm(act_arr_tod, act_dep_itime) if act_arr_tod ; end
  def crs_arr_itime ; @crs_arr_itime = inttime_from_hhmm(crs_arr_tod, crs_dep_itime) ; end

  # A tail number of "0" is treated as missing.
  def receive_tail_num(val) ; val = nil if val.to_s == "0" ; super(val) ; end

  # NOTE(review): a one-argument `arr_delay(val)` method used to be defined
  # here. It shadowed the field's reader, and its guard compared a String to
  # the Integer 0, so it never matched. It has been removed so that
  # `arr_delay` reads the field again; if zero delays really should be
  # blanked on input, add a `receive_arr_delay` hook instead -- confirm intent.

  # An empty cancellation code is stored as "Z" (no cancellation).
  def receive_cancellation_code(val) ; if val == "" then super("Z") else super(val) ; end ; end

  # Builds the cleaned-up AirlineFlight for this raw row: the date parts are
  # folded into a YYYYMMDD string, miles become kilometers, hhmm strings are
  # zero-padded to four digits, and the epoch-second times are added.
  #
  # @return [AirlineFlight]
  def to_airline_flight
    # drop the raw fields that are replaced by derived attributes below
    attrs = self.attributes.reject{|attr,val| [:date_year, :date_month, :date_day, :distance_mi].include?(attr) }
    attrs[:flight_datestr] = flight_date.strftime("%Y%m%d")
    attrs[:distance_km] = (distance_mi * 1.609_344).to_i

    attrs[:act_dep_tod] = "%04d" % act_dep_tod.to_i if act_dep_tod
    attrs[:crs_dep_tod] = "%04d" % crs_dep_tod.to_i if crs_dep_tod
    attrs[:act_arr_tod] = "%04d" % act_arr_tod.to_i if act_arr_tod
    attrs[:crs_arr_tod] = "%04d" % crs_arr_tod.to_i if crs_arr_tod

    attrs[:act_dep_itime] = act_dep_itime
    attrs[:crs_dep_itime] = crs_dep_itime
    attrs[:act_arr_itime] = act_arr_itime
    attrs[:crs_arr_itime] = crs_arr_itime

    AirlineFlight.receive(attrs)
  end
end
|
83
|
+
|
84
|
+
class AirlineFlight
|
85
|
+
include Gorillib::Model
|
86
|
+
|
87
|
+
# Identifier
|
88
|
+
field :flight_datestr, String, position: 0, doc: "Date, YYYYMMDD. Use flight_date method if you want a date"
|
89
|
+
field :unique_carrier, String, position: 1, doc: "Unique Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2).", validates: { length: { in: 0..5 } }
|
90
|
+
field :flight_num, Integer, position: 2, doc: "flight number"
|
91
|
+
# Flight
|
92
|
+
field :from_airport, String, position: 3, doc: "Origin IATA airport code", validates: { length: { in: 0..3 } }
|
93
|
+
field :into_airport, String, position: 4, doc: "Destination IATA airport code", validates: { length: { in: 0..3 } }
|
94
|
+
field :tail_num, String, position: 5, doc: "Plane tail number", validates: { length: { in: 0..8 } }
|
95
|
+
field :distance_km, Integer, position: 6, doc: "Flight distance, in kilometers"
|
96
|
+
field :day_of_week, Integer, position: 7, doc: "Day of week -- 1 (Monday) - 7 (Sunday)"
|
97
|
+
# Departure and Arrival Absolute Time
|
98
|
+
field :crs_dep_itime, IntTime, position: 8, doc: "scheduled departure time (utc epoch seconds)"
|
99
|
+
field :crs_arr_itime, IntTime, position: 9, doc: "scheduled arrival time (utc epoch seconds)"
|
100
|
+
field :act_dep_itime, IntTime, position: 10, doc: "actual departure time (utc epoch seconds)"
|
101
|
+
field :act_arr_itime, IntTime, position: 11, doc: "actual arrival time (utc epoch seconds)"
|
102
|
+
# Departure and Arrival Local Time of Day
|
103
|
+
field :crs_dep_tod, String, position: 12, doc: "time of day for scheduled departure (local, hhmm)"
|
104
|
+
field :crs_arr_tod, String, position: 13, doc: "time of day for scheduled arrival (local, hhmm). Not adjusted for wrap-around."
|
105
|
+
field :act_dep_tod, String, position: 14, doc: "time of day for actual departure (local, hhmm)"
|
106
|
+
field :act_arr_tod, String, position: 15, doc: "time of day for actual arrival (local, hhmm). Not adjusted for wrap-around."
|
107
|
+
# Duration
|
108
|
+
field :crs_duration, Integer, position: 16, doc: "CRS flight time, in minutes"
|
109
|
+
field :act_duration, Integer, position: 17, doc: "Actual flight time, in minutes"
|
110
|
+
field :air_duration, Integer, position: 18, doc: "Air time, in minutes"
|
111
|
+
field :taxi_in_duration, Integer, position: 19, doc: "taxi in time, in minutes"
|
112
|
+
field :taxi_out_duration, Integer, position: 20, doc: "taxi out time in minutes"
|
113
|
+
# Delay
|
114
|
+
field :is_diverted, :boolean_10, position: 21, doc: "Was the plane diverted? The actual_duration column remains NULL for all diverted flights."
|
115
|
+
field :is_cancelled, :boolean_10, position: 22, doc: "was the flight cancelled?"
|
116
|
+
field :cancellation_code, String, position: 23, doc: "Reason for cancellation (A = carrier, B = weather, C = NAS, D = security, Z = no cancellation)"
|
117
|
+
field :dep_delay, Integer, position: 24, doc: "Difference in minutes between scheduled and actual departure time. Early departures show negative numbers. "
|
118
|
+
field :arr_delay, Integer, position: 25, doc: "Difference in minutes between scheduled and actual arrival time. Early arrivals show negative numbers."
|
119
|
+
field :carrier_delay, Integer, position: 26, doc: "Carrier delay, in minutes"
|
120
|
+
field :weather_delay, Integer, position: 27, doc: "Weather delay, in minutes"
|
121
|
+
field :nas_delay, Integer, position: 28, doc: "National Air System delay, in minutes"
|
122
|
+
field :security_delay, Integer, position: 29, doc: "Security delay, in minutes"
|
123
|
+
field :late_aircraft_delay, Integer, position: 30, doc: "Late Aircraft delay, in minutes"
|
124
|
+
|
125
|
+
# Renders the record as one tab-separated line.
#
# Booleans are written as 1/0 and missing actual timestamps as a single space.
# The duration overrides and the 7-character truncation below are flagged by
# the original author as temporary testing aids.
#
# @return [String] the attribute values joined by tabs
def to_tsv
  row = attributes
  row[:is_cancelled] = (is_cancelled ? 1 : 0)
  row[:is_diverted]  = (is_diverted ? 1 : 0)
  [:act_dep_itime, :act_arr_itime].each { |key| row[key] ||= ' ' }

  # FIXME
  scheduled_minutes  = ((crs_arr_itime - crs_dep_itime) / 60.0).to_i
  row[:act_duration] = scheduled_minutes
  row[:air_duration] = scheduled_minutes - row[:crs_duration]

  # FIXME: for testing -- keep only the last 7 characters of long values
  row.each_pair do |key, val|
    text = val.to_s
    row[key] = text[-7..-1] if text.length > 7
  end

  row.values.join("\t")
end
|
139
|
+
|
140
|
+
# The flight date, built from flight_datestr by Gorillib's DateFactory on
# first use and cached in @flight_date afterwards.
def flight_date
  return @flight_date if @flight_date
  @flight_date = Gorillib::Factory::DateFactory.receive(flight_datestr)
end
|
143
|
+
|
144
|
+
# Checks that the record is sane.
#
# Every value is true when its check passes, or when the check is skipped
# because a value it needs is missing; it is false when the stored values
# disagree with each other.
#
# @return [Hash{Symbol => Boolean}] one entry per consistency check
def lint
  {
    # Elapsed time between the actual timestamps matches the recorded
    # duration. It has its own key because a hash literal keeps only the last
    # value given for a repeated key, and :act_duration is used below.
    # Skipped when act_duration is missing (per the is_diverted field doc it
    # stays NULL for diverted flights).
    act_itimes:         (!act_arr_itime) || (!act_duration) || (act_arr_itime - act_dep_itime == act_duration * 60),
    # Same comparison for the scheduled (CRS) timestamps and duration.
    crs_duration:       (!crs_arr_itime) || (crs_arr_itime - crs_dep_itime == crs_duration * 60),
    # A flight is cancelled exactly when its code is something other than "Z".
    cancelled_has_code: (is_cancelled == (cancellation_code != "Z")),
    cancellation_code:  (%w[A B C D Z].include?(cancellation_code)),
    # Actual duration equals air time plus both taxi times.
    act_duration:       (!act_duration) || (act_duration == (air_duration + taxi_in_duration + taxi_out_duration)),
    # Delays, in minutes, agree with the actual-minus-scheduled timestamps.
    dep_delay:          (!act_dep_itime) || (dep_delay == (act_dep_itime - crs_dep_itime)/60.0),
    arr_delay:          (!act_arr_itime) || (arr_delay == (act_arr_itime - crs_arr_itime)/60.0),
  }
end
|
156
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
# see also spec/examples/munging/airline_flights_spec.rb
|
3
|
+
|
4
|
+
# Dump the dataexpo airports as TSV, under a header of field names cut to
# their first 7 characters.
puts described_class.field_names.map{|fn| fn[0..6] }.join("\t")
RawDataexpoAirport.load_csv(de_airports_filename).each{|airport| puts airport.to_tsv }

# Print every raw airport whose lint result is present, under a header of the
# full field names.
puts described_class.field_names.join("\t") # .map{|fn| fn[0..6] }.join("\t")
described_class.load_csv(raw_airports_filename).each do |airport|
  # puts airport.to_tsv
  problems = airport.lint
  next unless problems.present?
  puts [airport.iata, airport.icao, problems.inspect, airport.to_tsv].join("\t")
end

# Load the airports from both files, then warn about each one whose lint
# result is present.
Airport.load(raw_airports_filename, de_airports_filename)
Airport::AIRPORTS.each do |_id, airport|
  # puts airport.to_tsv
  problems = airport.lint
  next unless problems.present?
  warn [airport.iata, airport.icao, airport.de_iata, "%-25s" % airport.name, problems.inspect].join("\t")
end
|
24
|
+
|
25
|
+
|
26
|
+
# Model.from_tuple(...)
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require_relative './models'
|
2
|
+
require 'gorillib/model/reconcilable'
|
3
|
+
|
4
|
+
class Airport
  include Gorillib::Model::Reconcilable

  # source of the record
  attr_accessor :_origin

  # Decides what to do when two records disagree on an attribute.
  #
  # * name, city and airport_ofid: returns :pass.
  # * latitude / longitude: returns true when the values differ by less than 3.
  # * altitude: returns true when the values differ by less than 5.
  # * anything else, or a difference outside the tolerance: defers to super.
  def conflicting_attribute!(attr, this_val, that_val)
    return :pass if [:name, :city, :airport_ofid].include?(attr)

    tolerance = { latitude: 3, longitude: 3, altitude: 5 }[attr]
    return true if tolerance && (this_val - that_val).abs < tolerance

    super
  end

  # The identifiers this record carries, keyed by :icao, :iata and :faa, with
  # the result passed through `compact`.
  def ids
    id_fields = [:icao, :iata, :faa]
    id_fields.hashify{|field| public_send(field) }.compact
  end
end
|
21
|
+
|
22
|
+
#
# Loads the Airport identifier tables scraped from Wikipedia
#
class RawAirportIdentifier < Airport
  include RawAirport
  include Gorillib::Model::LoadFromTsv

  class << self
    # Builds a record from one row of an identifier table. Blank values are
    # dropped via `compact_blank`; columns after `city` are ignored.
    def from_tuple(icao, iata, faa, name, city=nil, *_)
      attrs = { icao: icao, iata: iata, faa: faa, name: name, city: city }
      new(attrs.compact_blank)
    end

    # Loads the given TSV file, accepting rows of 4 to 6 fields, and hands
    # the block on to load_tsv.
    def load_airports(filename, &block)
      load_tsv(filename, num_fields: 4..6, &block)
    end
  end
end
|
37
|
+
|
38
|
+
class Airport
  #
  # Reconciler for Airports
  #
  # Holds every source record (an "opinion") that shares an identifier with
  # the other records it holds, so they can be merged into a single consensus
  # Airport. Records come from the dataexpo and openflights files and from the
  # three scraped Wikipedia identifier sets loaded in .load_all.
  #
  class IdReconciler
    include Gorillib::Model
    include Gorillib::Model::LoadFromCsv
    include Gorillib::Model::Reconcilable
    self.csv_options = { col_sep: "\t", num_fields: 3..6 }

    # Map the reconcilers to each ID they have anything to say about
    ID_MAP = { icao: {}, iata: {}, faa: {} }

    field :opinions, Array, default: Array.new, doc: "every record having an id in common with the other records in this field"

    # The distinct [attr, id] pairs found across all opinions.
    def ids
      opinions.flat_map{|op| op.ids.to_a }.uniq.compact
    end

    # Registers every record from each source, then reconciles each distinct
    # reconciler (visited per identifier kind, in id order) and collects the
    # consensus airports, readable afterwards through .airports.
    def self.load_all
      Log.info "Loading all Airports and reconciling"
      @airports = Array.new
      RawDataexpoAirport  .load_airports(:dataexpo_raw_airports   ){|airport| register(:dataexpo,     airport) }
      RawOpenflightAirport.load_airports(:openflights_raw_airports){|airport| register(:openflights,  airport) }
      RawAirportIdentifier.load_airports(:wikipedia_icao          ){|airport| register(:wp_icao,      airport) }
      RawAirportIdentifier.load_airports(:wikipedia_iata          ){|airport| register(:wp_iata,      airport) }
      RawAirportIdentifier.load_airports(:wikipedia_us_abroad     ){|airport| register(:wp_us_abroad, airport) }

      recs = ID_MAP.flat_map{|_attr, hsh| hsh.sort.map(&:last) }.uniq
      recs.each do |rec|
        consensus = rec.reconcile
        # lint = consensus.lint
        # puts "%-79s\t%s" % [lint, consensus.to_s[0..100]] if lint.present?
        @airports << consensus
      end
    end

    # The consensus airports built by the most recent .load_all (nil before).
    def self.airports
      @airports
    end

    # Reconciles the airport registered under each IATA code listed in
    # Airport::EXEMPLARS.
    def self.exemplars
      Airport::EXEMPLARS.map do |iata|
        ID_MAP[:iata][iata].reconcile
      end
    end

    # Merges all opinions into a fresh Airport and returns it, printing this
    # reconciler when any opinion could not be adopted cleanly.
    def reconcile
      consensus = Airport.new
      # Adopt every opinion first and judge afterwards: calling all? with the
      # adopting block would stop at the first conflict and leave the
      # remaining opinions out of the consensus.
      clean = opinions.map{|op| consensus.adopt(op) }.all?
      # puts "\t#{consensus.inspect}"
      puts "confl\t#{self.inspect}" if not clean
      consensus
    end

    # Prepends the given opinions to this reconciler's, dropping duplicates.
    #
    # Returns the merged array. (uniq! is avoided for the return value since
    # it yields nil whenever there was nothing to remove.)
    # NOTE(review): presumably invoked by Reconcilable#adopt for the opinions
    # field -- confirm against gorillib.
    def adopt_opinions(vals, _)
      self.opinions = (vals + self.opinions).uniq
    end

    # * find all existing reconcilers that share an ID with that record
    # * unify them into one reconciler
    # * store it back under all the IDs
    #
    # Suppose our dataset has 3 identifiers, which look like
    #
    #     a   S
    #     S       88
    #     a   Z
    #     b
    #         Q
    #     b   Q   77
    #
    # We will wind up with these two reconcilers:
    #
    #     <a S 88 opinions: [a,S, ],[S, ,88],[a,Z, ]>
    #     <b Q 77 opinions: [b, , ],[ ,Q, ],[b,Q,77]>
    #
    def self.register(origin, obj)
      obj._origin = origin
      # get the existing reconcilers
      existing = obj.ids.map{|attr, id| ID_MAP[attr][id] }.compact.uniq
      # a fresh reconciler for the new record absorbs all of the existing ones
      reconciler = self.new(opinions: [obj])
      existing.each{|that| reconciler.adopt(that) }
      # save the reconciler under each of the ids.
      reconciler.ids.each{|attr, id| ID_MAP[attr][id] = reconciler }
    end

    # Multi-line description: the ids, then one line per opinion showing its
    # origin and the record itself.
    def inspect
      str = "#<#{self.class.name} #{ids}"
      opinions.each do |op|
        str << "\n\t #{op._origin}\t#{op}"
      end
      str << ">"
    end
  end

end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# As of January 2012, the OpenFlights/Airline Route Mapper Route Database
|
4
|
+
# contains 59036 routes between 3209 airports on 531 airlines [spanning the
|
5
|
+
# globe](http://openflights.org/demo/openflights-routedb-2048.png). If you
|
6
|
+
# enjoy this data, please consider [visiting their page and
|
7
|
+
# donating](http://openflights.org/data.html)
|
8
|
+
#
|
9
|
+
# > Notes: Routes are directional: if an airline operates services from A to B
|
10
|
+
# > and from B to A, both A-B and B-A are listed separately. Routes where one
|
11
|
+
# > carrier operates both its own and codeshare flights are listed only once.
|
12
|
+
#
|
13
|
+
# Sample entries
|
14
|
+
#
|
15
|
+
# BA,1355,SIN,3316,LHR,507,,0,744 777
|
16
|
+
# BA,1355,SIN,3316,MEL,3339,Y,0,744
|
17
|
+
# TOM,5013,ACE,1055,BFS,465,,0,320
|
18
|
+
#
|
19
|
+
# One row of the OpenFlights route database.
class RawOpenflightRoute
  include Gorillib::Model

  field :iataicao,              String,   doc: "2-letter (IATA) or 3-letter (ICAO) code of the airline."
  field :airline_ofid,          Integer,  doc: "Unique OpenFlights identifier for airline (see Airline)."
  field :from_airport_iataicao, String,   doc: "3-letter (IATA) or 4-letter (ICAO) code of the source airport."
  field :from_airport_ofid,     Integer,  doc: "Unique OpenFlights identifier for source airport (see Airport)"
  field :into_airport_iataicao, String,   doc: "3-letter (IATA) or 4-letter (ICAO) code of the destination airport."
  field :into_airport_ofid,     Integer,  doc: "Unique OpenFlights identifier for destination airport (see Airport)"
  field :codeshare,             :boolean, doc: "true if this flight is a codeshare (that is, not operated by Airline, but another carrier); empty otherwise."
  field :stops,                 Integer,  doc: "Number of stops on this flight, or '0' for direct"
  field :equipment_list,        String,   doc: "3-letter codes for plane type(s) generally used on this flight, separated by spaces"

  # Translates the raw "Y" / "N" flag to true / false before handing it to
  # the default receiver; any other value is passed through untouched.
  def receive_codeshare(val)
    flag = { "Y" => true, "N" => false }.fetch(val, val)
    super(flag)
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require_relative('../../rake_helper')
|
2
|
+
require_relative('./models')
|
3
|
+
|
4
|
+
# Named paths for the airline_flights example: raw inputs live under af_data,
# parsed outputs under af_work. Several file names carry the :mini_slug
# setting as a suffix.
mini_slug = Settings[:mini_slug]

Pathname.register_paths(
  af_data:                  [:data, 'airline_flights'],
  af_work:                  [:work, 'airline_flights'],
  af_code:                  File.dirname(__FILE__),
  # raw inputs
  openflights_raw_airports: [:af_data, "openflights_airports-raw#{mini_slug}.csv"],
  openflights_raw_airlines: [:af_data, "openflights_airlines-raw.csv"],
  dataexpo_raw_airports:    [:af_data, "dataexpo_airports-raw#{mini_slug}.csv"],
  wikipedia_icao:           [:af_data, "wikipedia_icao.tsv"],
  wikipedia_iata:           [:af_data, "wikipedia_iata.tsv"],
  wikipedia_us_abroad:      [:af_data, "wikipedia_us_abroad.tsv"],
  # parsed outputs
  openflights_airports:     [:af_work, "openflights_airports-parsed#{mini_slug}.tsv"],
  openflights_airlines:     [:af_work, "openflights_airlines-parsed#{mini_slug}.tsv"],
  dataexpo_airports:        [:af_work, "dataexpo_airports-parsed#{mini_slug}.tsv"],
  airport_identifiers:      [:af_work, "airport_identifiers.tsv"],
  airport_identifiers_mini: [:af_work, "airport_identifiers-sample.tsv"],
  # helpers
  country_name_lookup:      [:work, 'geo', "country_name_lookup.tsv"],
)
|
24
|
+
|
25
|
+
# Task chain for the airline_flights example. Only the openflights airport
# parse is active; the other steps are kept below, commented out.
chain(:airline_flights) do
  code_files = FileList[Pathname.of(:af_code, '*.rb').to_s]

  chain(:parse) do

    # desc 'parse the dataexpo airports'
    # create_file(:dataexpo_airports, after: code_files) do |dest|
    #   RawDataexpoAirport.load_airports(:dataexpo_raw_airports) do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end

    desc 'parse the openflights airports'
    create_file(:openflights_airports, after: [code_files, :force]) do |dest|
      # the geo models are loaded only when this file is actually built
      require_relative('../geo/geo_models')
      Geo::CountryNameLookup.load
      RawOpenflightAirport.load_airports(:openflights_raw_airports) { |airport|
        dest << airport.to_tsv << "\n"
        # puts airport.country
      }
    end

    # task :reconcile_airports => [:dataexpo_airports, :openflights_airports] do
    #   require_relative 'reconcile_airports'
    #   Airport::IdReconciler.load_all
    # end
    #
    # desc 'run the identifier reconciler'
    # create_file(:airport_identifiers, after: code_files, invoke: 'airline_flights:parse:reconcile_airports') do |dest|
    #   Airport::IdReconciler.airports.each do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end
    #
    # desc 'run the identifier reconciler'
    # create_file(:airport_identifiers_mini, after: code_files, invoke: 'airline_flights:parse:reconcile_airports') do |dest|
    #   Airport::IdReconciler.exemplars.each do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end
    #
    # desc 'parse the openflights airlines'
    # create_file(:openflights_airlines, after: code_files) do |dest|
    #   RawOpenflightAirline.load_airlines(:openflights_raw_airlines) do |airline|
    #     dest << airline.to_tsv << "\n"
    #     puts airline.to_tsv
    #   end
    # end

  end
end
|
75
|
+
|
76
|
+
# The default task runs the airline_flights chain; the individual steps are
# listed, commented out, for reference.
task default: [
  'airline_flights',
  # 'airline_flights:parse:dataexpo_airports',
  # 'airline_flights:parse:openflights_airports',
  # 'airline_flights:parse:airport_identifiers',
  # 'airline_flights:parse:airport_identifiers_mini',
  # 'airline_flights:parse:openflights_airlines',
]
|