wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476) hide show
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,4 @@
1
# -*- coding: utf-8 -*-

# Matches any single character that is NOT in the accepted set: ASCII letters
# and digits, space, the punctuation / . , - ( ) ' and the accented letters
# listed in the class. Despite the constant's name, a match therefore signals
# a character that is *not* OK; a string made only of accepted characters
# yields no match.
#
# Frozen so the shared pattern object cannot be modified through the constant.
OK_CHARS_RE = /[^a-zA-Z0-9\ \/\.\,\-\(\)\'ÁÂÄÅÇÉÍÎÑÖØÜÞàáâãäåæçèéêëìíîïðñóôõöøúüýāăčėęěğİıŁłńōőřŞşŠšţťūźŽžơț]/.freeze
@@ -0,0 +1,156 @@
1
+ # Raw data:
2
+ # Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
3
+ # 2007,1,1,1,1232,1225,1341,1340,WN,2891,N351,69,75,54,1,7,SMF,ONT,389,4,11,0,,0,0,0,0,0,0
4
+
5
# One row of the raw airline on-time data. Fields are declared with positions
# 1..29, following the column order of the raw file.
#
# Times of day are kept as the "hhmm" strings found in the data; the *_itime
# helpers combine them with the flight date into integer epoch seconds, and
# #to_airline_flight builds the attribute hash handed to AirlineFlight.
class RawAirlineFlight
  include Gorillib::Model

  field :date_year,           Integer,     position:  1, doc: "Year (1987-2008)"
  field :date_month,          Integer,     position:  2, doc: "Month (1-12)"
  field :date_day,            Integer,     position:  3, doc: "Day of month (1-31)"
  field :day_of_week,         Integer,     position:  4, doc: "Day of week -- 1 (Monday) - 7 (Sunday)"
  #
  field :act_dep_tod,         String,      position:  5, doc: "time of day for actual departure (local, hhmm)", blankish: [nil, '', 'NA']
  field :crs_dep_tod,         String,      position:  6, doc: "time of day for scheduled departure (local, hhmm)"
  field :act_arr_tod,         String,      position:  7, doc: "time of day for actual arrival (local, hhmm). Not adjusted for wrap-around.", blankish: [nil, '', 'NA']
  field :crs_arr_tod,         String,      position:  8, doc: "time of day for scheduled arrival (local, hhmm). Not adjusted for wrap-around."
  #
  field :unique_carrier,      String,      position:  9, doc: "unique carrier code", validates: { length: { in: 0..5 } }
  field :flight_num,          Integer,     position: 10, doc: "flight number"
  field :tail_num,            String,      position: 11, doc: "plane tail number", validates: { length: { in: 0..8 } }
  #
  field :act_duration,        Integer,     position: 12, doc: "actual flight time, in minutes", blankish: [nil, '', 'NA']
  field :crs_duration,        Integer,     position: 13, doc: "CRS flight time, in minutes"
  field :air_duration,        Integer,     position: 14, doc: "Air time, in minutes", blankish: [nil, '', 'NA']
  field :arr_delay,           Integer,     position: 15, doc: "arrival delay, in minutes", blankish: [nil, '', 'NA']
  field :dep_delay,           Integer,     position: 16, doc: "departure delay, in minutes", blankish: [nil, '', 'NA']
  field :from_airport,        String,      position: 17, doc: "Origin IATA airport code", validates: { length: { in: 0..3 } }
  field :into_airport,        String,      position: 18, doc: "Destination IATA airport code", validates: { length: { in: 0..3 } }
  field :distance_mi,         Integer,     position: 19, doc: "Flight distance, in miles"
  field :taxi_in_duration,    Integer,     position: 20, doc: "taxi in time, in minutes", blankish: [nil, '', 'NA']
  field :taxi_out_duration,   Integer,     position: 21, doc: "taxi out time in minutes", blankish: [nil, '', 'NA']
  #
  field :is_cancelled,        :boolean_10, position: 22, doc: "was the flight cancelled?"
  field :cancellation_code,   String,      position: 23, doc: "Reason for cancellation (A = carrier, B = weather, C = NAS, D = security, Z = no cancellation)"
  field :is_diverted,         :boolean_10, position: 24, doc: "Was the plane diverted?"
  field :carrier_delay,       Integer,     position: 25, doc: "in minutes"
  field :weather_delay,       Integer,     position: 26, doc: "in minutes"
  field :nas_delay,           Integer,     position: 27, doc: "in minutes"
  field :security_delay,      Integer,     position: 28, doc: "in minutes"
  field :late_aircraft_delay, Integer,     position: 29, doc: "in minutes"

  # @return [Time] the flight's calendar day, built from the year / month /
  #   day fields with Time.new (so midnight in the process's local zone)
  def flight_date
    Time.new(date_year, date_month, date_day)
  end

  # Uses the year / month / day, along with an "hhmm" string, to build an
  # integer count of epoch seconds.
  #
  # NOTE(review): the hhmm values are documented on the fields as local times
  # but are passed to Time.utc unchanged, so the result is only a true UTC
  # instant once a timezone correction has been applied -- confirm where
  # that happens.
  #
  # @param val [#to_i] time of day as hhmm, e.g. "1232"
  # @param fencepost [Integer, nil] epoch seconds the result should not
  #   precede; when the computed time is earlier, one day is added
  # @return [Integer] epoch seconds
  def inttime_from_hhmm(val, fencepost=nil)
    hour, minutes = val.to_i.divmod(100)
    res = Time.utc(date_year, date_month, date_day, hour, minutes)
    # if before fencepost, we wrapped around in time
    res += (24 * 60 * 60) if fencepost && (res.to_i < fencepost)
    res.to_i
  end

  # Epoch-second versions of the four hhmm fields. The actual-time variants
  # return nil when their hhmm field is unset. Arrival times use the matching
  # departure time as the fencepost, so an arrival that reads earlier than
  # its departure is pushed to the next day. Each call recomputes the value;
  # the instance variable is assigned but never read back here.
  def act_dep_itime ; @act_dep_itime = inttime_from_hhmm(act_dep_tod) if act_dep_tod ; end
  def crs_dep_itime ; @crs_dep_itime = inttime_from_hhmm(crs_dep_tod) ; end
  def act_arr_itime ; @act_arr_itime = inttime_from_hhmm(act_arr_tod, act_dep_itime) if act_arr_tod ; end
  def crs_arr_itime ; @crs_arr_itime = inttime_from_hhmm(crs_arr_tod, crs_dep_itime) ; end

  # A tail number of "0" is stored as nil.
  def receive_tail_num(val) ; val = nil if val.to_s == "0" ; super(val) ; end

  # FIXME(review): this looks like it was meant to be a `receive_arr_delay`
  # hook mirroring `receive_tail_num` above. As written it redefines the
  # `arr_delay` reader to require an argument, and `val.to_s == 0` compares a
  # String with an Integer, which is never true. Left unchanged because the
  # intended handling of a zero delay cannot be determined from this file.
  def arr_delay(val) val = nil if val.to_s == 0 ; super(val) ; end

  # An empty cancellation code is stored as "Z" (documented on the field as
  # "no cancellation"); any other value is passed through untouched.
  def receive_cancellation_code(val) ; if val == "" then super("Z") else super(val) ; end ; end

  # Builds the AirlineFlight counterpart of this record: the date parts are
  # collapsed into a YYYYMMDD string, miles become kilometers, the hhmm
  # strings are zero-padded to four digits, and the epoch-second times are
  # added.
  #
  # @return the result of AirlineFlight.receive for the converted attributes
  def to_airline_flight
    # Field names as declared above; the date parts and the mileage are
    # replaced by :flight_datestr and :distance_km below.
    replaced = [:date_year, :date_month, :date_day, :distance_mi]
    attrs = self.attributes.reject{|attr, _val| replaced.include?(attr) }
    attrs[:flight_datestr] = flight_date.strftime("%Y%m%d")
    # Guarded like the time-of-day conversions: a missing distance is left
    # unset instead of raising on nil.
    attrs[:distance_km]    = (distance_mi * 1.609_344).to_i if distance_mi

    attrs[:act_dep_tod] = "%04d" % act_dep_tod.to_i if act_dep_tod
    attrs[:crs_dep_tod] = "%04d" % crs_dep_tod.to_i if crs_dep_tod
    attrs[:act_arr_tod] = "%04d" % act_arr_tod.to_i if act_arr_tod
    attrs[:crs_arr_tod] = "%04d" % crs_arr_tod.to_i if crs_arr_tod

    attrs[:act_dep_itime] = act_dep_itime
    attrs[:crs_dep_itime] = crs_dep_itime
    attrs[:act_arr_itime] = act_arr_itime
    attrs[:crs_arr_itime] = crs_arr_itime

    AirlineFlight.receive(attrs)
  end
end
83
+
84
# A single airline flight record: identifiers, route, absolute and local
# departure/arrival times, durations and delay breakdown.
class AirlineFlight
  include Gorillib::Model

  # Identifier
  field :flight_datestr, String, position: 0, doc: "Date, YYYYMMDD. Use flight_date method if you want a date"
  field :unique_carrier, String, position: 1, doc: "Unique Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2).", validates: { length: { in: 0..5 } }
  field :flight_num, Integer, position: 2, doc: "flight number"
  # Flight
  field :from_airport, String, position: 3, doc: "Origin IATA airport code", validates: { length: { in: 0..3 } }
  field :into_airport, String, position: 4, doc: "Destination IATA airport code", validates: { length: { in: 0..3 } }
  field :tail_num, String, position: 5, doc: "Plane tail number", validates: { length: { in: 0..8 } }
  field :distance_km, Integer, position: 6, doc: "Flight distance, in kilometers"
  field :day_of_week, Integer, position: 7, doc: "Day of week -- 1 (Monday) - 7 (Sunday)"
  # Departure and Arrival Absolute Time
  field :crs_dep_itime, IntTime, position: 8, doc: "scheduled departure time (utc epoch seconds)"
  field :crs_arr_itime, IntTime, position: 9, doc: "scheduled arrival time (utc epoch seconds)"
  field :act_dep_itime, IntTime, position: 10, doc: "actual departure time (utc epoch seconds)"
  field :act_arr_itime, IntTime, position: 11, doc: "actual arrival time (utc epoch seconds)"
  # Departure and Arrival Local Time of Day
  field :crs_dep_tod, String, position: 12, doc: "time of day for scheduled departure (local, hhmm)"
  field :crs_arr_tod, String, position: 13, doc: "time of day for scheduled arrival (local, hhmm). Not adjusted for wrap-around."
  field :act_dep_tod, String, position: 14, doc: "time of day for actual departure (local, hhmm)"
  field :act_arr_tod, String, position: 15, doc: "time of day for actual arrival (local, hhmm). Not adjusted for wrap-around."
  # Duration
  field :crs_duration, Integer, position: 16, doc: "CRS flight time, in minutes"
  field :act_duration, Integer, position: 17, doc: "Actual flight time, in minutes"
  field :air_duration, Integer, position: 18, doc: "Air time, in minutes"
  field :taxi_in_duration, Integer, position: 19, doc: "taxi in time, in minutes"
  field :taxi_out_duration, Integer, position: 20, doc: "taxi out time in minutes"
  # Delay
  field :is_diverted, :boolean_10, position: 21, doc: "Was the plane diverted? The actual_duration column remains NULL for all diverted flights."
  field :is_cancelled, :boolean_10, position: 22, doc: "was the flight cancelled?"
  field :cancellation_code, String, position: 23, doc: "Reason for cancellation (A = carrier, B = weather, C = NAS, D = security, Z = no cancellation)"
  field :dep_delay, Integer, position: 24, doc: "Difference in minutes between scheduled and actual departure time. Early departures show negative numbers. "
  field :arr_delay, Integer, position: 25, doc: "Difference in minutes between scheduled and actual arrival time. Early arrivals show negative numbers."
  field :carrier_delay, Integer, position: 26, doc: "Carrier delay, in minutes"
  field :weather_delay, Integer, position: 27, doc: "Weather delay, in minutes"
  field :nas_delay, Integer, position: 28, doc: "National Air System delay, in minutes"
  field :security_delay, Integer, position: 29, doc: "Security delay, in minutes"
  field :late_aircraft_delay, Integer, position: 30, doc: "Late Aircraft delay, in minutes"

  # Renders the record as one tab-separated line.
  #
  # Booleans are written as 1/0 and missing actual itimes as a single space.
  # The duration overrides and the 7-character truncation below are marked
  # FIXME by the original author and are kept exactly as they were.
  #
  # @return [String] the attribute values joined with tabs
  def to_tsv
    attrs = attributes
    attrs[:is_cancelled] = is_cancelled ? 1 : 0
    attrs[:is_diverted]  = is_diverted  ? 1 : 0
    attrs[:act_dep_itime] ||= ' '
    attrs[:act_arr_itime] ||= ' '

    # FIXME
    attrs[:act_duration] = ((crs_arr_itime - crs_dep_itime) / 60.0).to_i
    attrs[:air_duration] = attrs[:act_duration] - attrs[:crs_duration]
    attrs.each{|key, val| attrs[key] = val.to_s[-7..-1] if val.to_s.length > 7 } # FIXME: for testing

    attrs.values.join("\t")
  end

  # @return the flight date built (once) from flight_datestr by DateFactory
  def flight_date
    @flight_date ||= Gorillib::Factory::DateFactory.receive(flight_datestr)
  end

  # Checks that the record is sane.
  #
  # Each key names one consistency check; its value is the outcome of that
  # check. The itime-span check used to share the :act_duration key with the
  # sum-of-parts check; in a hash literal the later duplicate wins, so the
  # span check was silently dropped. It now has its own key, :act_itime_span,
  # and :act_duration keeps the value it effectively had before.
  #
  # @return [Hash{Symbol => Boolean}]
  def lint
    {
      act_itime_span:     (!act_arr_itime) || (act_arr_itime - act_dep_itime == act_duration * 60),
      crs_duration:       (!crs_arr_itime) || (crs_arr_itime - crs_dep_itime == crs_duration * 60),
      cancelled_has_code: (is_cancelled == (cancellation_code != "Z")),
      cancellation_code:  (%w[A B C D Z].include?(cancellation_code)),
      act_duration:       (!act_duration)  || (act_duration == (air_duration + taxi_in_duration + taxi_out_duration)),
      dep_delay:          (!act_dep_itime) || (dep_delay == (act_dep_itime - crs_dep_itime)/60.0),
      arr_delay:          (!act_arr_itime) || (arr_delay == (act_arr_itime - crs_arr_itime)/60.0),
    }
  end
end
@@ -0,0 +1,4 @@
1
+ require_relative './airline'
2
+ require_relative './airport'
3
+ require_relative './route'
4
+ require_relative './flight'
@@ -0,0 +1,26 @@
1
+
2
# see also spec/examples/munging/airline_flights_spec.rb
3
+
4
# Dump every dataexpo airport as TSV, under a header of field names cut to
# seven characters.
short_header = described_class.field_names.map { |name| name[0..6] }
puts short_header.join("\t")
raw_airports = RawDataexpoAirport.load_csv(de_airports_filename)
raw_airports.each { |airport| puts airport.to_tsv }

# Load the raw airports with the described class and print the ones whose
# lint result is present.
puts described_class.field_names.join("\t") # .map{|fn| fn[0..6] }.join("\t")
raw_airports = described_class.load_csv(raw_airports_filename)
raw_airports.each do |airport|
  # puts airport.to_tsv
  linted = airport.lint
  next unless linted.present?
  puts [airport.iata, airport.icao, linted.inspect, airport.to_tsv].join("\t")
end

# Load the combined airport table and warn about entries whose lint result is
# present.
Airport.load(raw_airports_filename, de_airports_filename)
Airport::AIRPORTS.each do |id, airport|
  #puts airport.to_tsv
  linted = airport.lint
  next unless linted.present?
  warn [airport.iata, airport.icao, airport.de_iata, "%-25s" % airport.name, linted.inspect].join("\t")
end
24
+
25
+
26
+ # Model.from_tuple(...)
@@ -0,0 +1,142 @@
1
+ require_relative './models'
2
+ require 'gorillib/model/reconcilable'
3
+
4
# Reconciliation behaviour for Airport records.
class Airport
  include Gorillib::Model::Reconcilable
  attr_accessor :_origin # source of the record

  # Decides what to do when two records disagree on an attribute.
  #
  # * name, city and airport_ofid conflicts return :pass
  # * latitude / longitude differing by less than 3, or altitude differing by
  #   less than 5, return true
  # * everything else is deferred to the inherited handler
  def conflicting_attribute!(attr, this_val, that_val)
    return :pass if [:name, :city, :airport_ofid].include?(attr)

    tolerance = { latitude: 3, longitude: 3, altitude: 5 }[attr]
    return true if tolerance && (this_val - that_val).abs < tolerance

    super
  end

  # The icao / iata / faa identifiers of this record, as a hash with empty
  # entries removed by compact.
  def ids
    id_attrs = [:icao, :iata, :faa]
    id_attrs.hashify { |name| public_send(name) }.compact
  end
end
21
+
22
+ #
23
+ # Loads the Airport identifier tables scraped from Wikipedia
24
+ #
25
class RawAirportIdentifier < Airport
  include RawAirport
  include Gorillib::Model::LoadFromTsv

  class << self
    # Builds a record from one row: icao, iata, faa, name and an optional
    # city. Extra columns are ignored and blank values are dropped.
    def from_tuple(icao, iata, faa, name, city=nil, *_)
      attrs = { icao: icao, iata: iata, faa: faa, name: name, city: city }
      new(attrs.compact_blank)
    end

    # Loads the rows of +filename+ via load_tsv, accepting 4 to 6 fields per
    # row, and forwards the given block.
    def load_airports(filename, &block)
      load_tsv(filename, num_fields: 4..6, &block)
    end
  end
end
37
+
38
class Airport
  #
  # Reconciler for Airports
  #
  # Collects, per airport, every record from the dataexpo, openflights and
  # scraped Wikipedia identifier sets that shares an identifier, so that the
  # records can be merged into one consensus Airport.
  #
  class IdReconciler
    include Gorillib::Model
    include Gorillib::Model::LoadFromCsv
    include Gorillib::Model::Reconcilable
    self.csv_options = { col_sep: "\t", num_fields: 3..6 }

    # Map the reconcilers to each ID they have anything to say about
    ID_MAP = { icao: {}, iata: {}, faa: {} }

    field :opinions, Array, default: Array.new, doc: "every record having an id in common with the other records in this field"

    class << self
      # Registers every record from each source, then reconciles each distinct
      # reconciler into a consensus airport stored in @airports.
      def load_all
        Log.info "Loading all Airports and reconciling"
        @airports = []
        RawDataexpoAirport.load_airports(:dataexpo_raw_airports)      { |rec| register(:dataexpo,     rec) }
        RawOpenflightAirport.load_airports(:openflights_raw_airports) { |rec| register(:openflights,  rec) }
        RawAirportIdentifier.load_airports(:wikipedia_icao)           { |rec| register(:wp_icao,      rec) }
        RawAirportIdentifier.load_airports(:wikipedia_iata)           { |rec| register(:wp_iata,      rec) }
        RawAirportIdentifier.load_airports(:wikipedia_us_abroad)      { |rec| register(:wp_us_abroad, rec) }

        reconcilers = ID_MAP.map { |_attr, by_id| by_id.sort.map(&:last) }.flatten.uniq
        reconcilers.each do |reconciler|
          # lint = consensus.lint
          # puts "%-79s\t%s" % [lint, consensus.to_s[0..100]] if lint.present?
          @airports << reconciler.reconcile
        end
      end

      # The consensus airports built by the last load_all (nil before that).
      def airports
        @airports
      end

      # Consensus airports for each IATA code listed in Airport::EXEMPLARS.
      def exemplars
        Airport::EXEMPLARS.map { |iata| ID_MAP[:iata][iata].reconcile }
      end

      # * find all existing reconcilers that share an ID with that record
      # * unify them into one reconciler
      # * store it back under all the IDs
      #
      # Suppose our dataset has 3 identifiers, which look like
      #
      #     a S
      #       S 88
      #     a Z
      #     b
      #       Q
      #     b Q 77
      #
      # We will wind up with these two reconcilers:
      #
      #     <a S 88 opinions: [a,S, ],[S, ,88],[a,Z, ]>
      #     <b Q 77 opinions: [b, , ],[ ,Q, ],[b,Q,77]>
      #
      def register(origin, obj)
        obj._origin = origin
        # reconcilers already filed under any of this record's ids
        known = obj.ids.map { |attr, id| ID_MAP[attr][id] }.compact.uniq
        # a fresh reconciler for the new record absorbs all of them
        reconciler = new(opinions: [obj])
        known.each { |other| reconciler.adopt(other) }
        # save the reconciler under each of the ids.
        reconciler.ids.each { |attr, id| ID_MAP[attr][id] = reconciler }
      end
    end

    # Every distinct id pair held by any opinion.
    def ids
      pairs = opinions.flat_map { |opinion| opinion.ids.to_a }
      pairs.uniq.compact
    end

    # Folds the opinions into a new Airport and returns it, printing a
    # "confl" line when an adopt call returns a falsy value.
    #
    # NOTE(review): all? stops at the first falsy adopt, so later opinions are
    # not adopted after a conflict -- confirm that this is intended.
    def reconcile
      consensus = Airport.new
      clean = opinions.all? { |opinion| consensus.adopt(opinion) }
      # puts "\t#{consensus.inspect}"
      puts "confl\t#{self.inspect}" unless clean
      consensus
    end

    # Puts the adopted opinions in front of the current ones and removes
    # duplicates.
    def adopt_opinions(vals, _)
      self.opinions = vals + self.opinions
      self.opinions.uniq!
    end

    # Multi-line summary: the ids, then one line per opinion with its origin.
    def inspect
      head = "#<#{self.class.name} #{ids}"
      body = opinions.map { |opinion| "\n\t #{opinion._origin}\t#{opinion}" }.join
      "#{head}#{body}>"
    end
  end

end
@@ -0,0 +1,35 @@
1
+
2
+
3
+ # As of January 2012, the OpenFlights/Airline Route Mapper Route Database
4
+ # contains 59036 routes between 3209 airports on 531 airlines [spanning the
5
+ # globe](http://openflights.org/demo/openflights-routedb-2048.png). If you
6
+ # enjoy this data, please consider [visiting their page and
7
+ # donating](http://openflights.org/data.html)
8
+ #
9
+ # > Notes: Routes are directional: if an airline operates services from A to B
10
+ # > and from B to A, both A-B and B-A are listed separately. Routes where one
11
+ # > carrier operates both its own and codeshare flights are listed only once.
12
+ #
13
+ # Sample entries
14
+ #
15
+ # BA,1355,SIN,3316,LHR,507,,0,744 777
16
+ # BA,1355,SIN,3316,MEL,3339,Y,0,744
17
+ # TOM,5013,ACE,1055,BFS,465,,0,320
18
+ #
19
class RawOpenflightRoute
  include Gorillib::Model

  field :iataicao,              String,   doc: "2-letter (IATA) or 3-letter (ICAO) code of the airline."
  field :airline_ofid,          Integer,  doc: "Unique OpenFlights identifier for airline (see Airline)."
  field :from_airport_iataicao, String,   doc: "3-letter (IATA) or 4-letter (ICAO) code of the source airport."
  field :from_airport_ofid,     Integer,  doc: "Unique OpenFlights identifier for source airport (see Airport)"
  field :into_airport_iataicao, String,   doc: "3-letter (IATA) or 4-letter (ICAO) code of the destination airport."
  field :into_airport_ofid,     Integer,  doc: "Unique OpenFlights identifier for destination airport (see Airport)"
  field :codeshare,             :boolean, doc: "true if this flight is a codeshare (that is, not operated by Airline, but another carrier); empty otherwise."
  field :stops,                 Integer,  doc: "Number of stops on this flight, or '0' for direct"
  field :equipment_list,        String,   doc: "3-letter codes for plane type(s) generally used on this flight, separated by spaces"

  # Receives the codeshare flag: "Y" becomes true, "N" becomes false, and any
  # other value is handed to the inherited receiver unchanged.
  def receive_codeshare(val)
    flag =
      if    "Y" == val then true
      elsif "N" == val then false
      else  val
      end
    super(flag)
  end
end
@@ -0,0 +1,83 @@
1
+ require_relative('../../rake_helper')
2
+ require_relative('./models')
3
+
4
# Named paths used by the airline_flights tasks. File names that interpolate
# Settings[:mini_slug] share the one lookup below.
mini_slug = Settings[:mini_slug]

Pathname.register_paths(
  af_data:                  [:data, 'airline_flights'],
  af_work:                  [:work, 'airline_flights'],
  af_code:                  File.dirname(__FILE__),
  # raw inputs
  openflights_raw_airports: [:af_data, "openflights_airports-raw#{mini_slug}.csv"],
  openflights_raw_airlines: [:af_data, "openflights_airlines-raw.csv"],
  dataexpo_raw_airports:    [:af_data, "dataexpo_airports-raw#{mini_slug}.csv"],
  wikipedia_icao:           [:af_data, "wikipedia_icao.tsv"],
  wikipedia_iata:           [:af_data, "wikipedia_iata.tsv"],
  wikipedia_us_abroad:      [:af_data, "wikipedia_us_abroad.tsv"],
  # parsed outputs
  openflights_airports:     [:af_work, "openflights_airports-parsed#{mini_slug}.tsv"],
  openflights_airlines:     [:af_work, "openflights_airlines-parsed#{mini_slug}.tsv"],
  dataexpo_airports:        [:af_work, "dataexpo_airports-parsed#{mini_slug}.tsv"],
  airport_identifiers:      [:af_work, "airport_identifiers.tsv"],
  airport_identifiers_mini: [:af_work, "airport_identifiers-sample.tsv"],
  # helpers
  country_name_lookup:      [:work, 'geo', "country_name_lookup.tsv"],
)
24
+
25
# Task chain for the airline_flights example. Only the openflights airport
# parse is active; the other steps are kept commented out.
chain(:airline_flights) do
  code_files = FileList[Pathname.of(:af_code, '*.rb').to_s]
  chain :parse do

    # desc 'parse the dataexpo airports'
    # create_file(:dataexpo_airports, after: code_files) do |dest|
    #   RawDataexpoAirport.load_airports(:dataexpo_raw_airports) do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end

    desc 'parse the openflights airports'
    create_file(:openflights_airports, after: [code_files, :force]) do |dest|
      # loaded here, inside the task body, rather than at the top of the file
      require_relative('../geo/geo_models')
      Geo::CountryNameLookup.load
      RawOpenflightAirport.load_airports(:openflights_raw_airports) { |airport|
        dest << airport.to_tsv << "\n"
        # puts airport.country
      }
    end

    # task :reconcile_airports => [:dataexpo_airports, :openflights_airports] do
    #   require_relative 'reconcile_airports'
    #   Airport::IdReconciler.load_all
    # end
    #
    # desc 'run the identifier reconciler'
    # create_file(:airport_identifiers, after: code_files, invoke: 'airline_flights:parse:reconcile_airports') do |dest|
    #   Airport::IdReconciler.airports.each do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end
    #
    # desc 'run the identifier reconciler'
    # create_file(:airport_identifiers_mini, after: code_files, invoke: 'airline_flights:parse:reconcile_airports') do |dest|
    #   Airport::IdReconciler.exemplars.each do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end
    #
    # desc 'parse the openflights airlines'
    # create_file(:openflights_airlines, after: code_files) do |dest|
    #   RawOpenflightAirline.load_airlines(:openflights_raw_airlines) do |airline|
    #     dest << airline.to_tsv << "\n"
    #     puts airline.to_tsv
    #   end
    # end

  end
end

# The default task runs the whole airline_flights chain.
task(default: [
  'airline_flights',
  # 'airline_flights:parse:dataexpo_airports',
  # 'airline_flights:parse:openflights_airports',
  # 'airline_flights:parse:airport_identifiers',
  # 'airline_flights:parse:airport_identifiers_mini',
  # 'airline_flights:parse:openflights_airlines',
])