wukong 3.0.0.pre → 3.0.0.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (476)
  1. data/.gitignore +46 -33
  2. data/.gitmodules +3 -0
  3. data/.rspec +1 -1
  4. data/.travis.yml +8 -1
  5. data/.yardopts +0 -13
  6. data/Guardfile +4 -6
  7. data/{LICENSE.textile → LICENSE.md} +43 -55
  8. data/README-old.md +422 -0
  9. data/README.md +279 -418
  10. data/Rakefile +21 -5
  11. data/TODO.md +6 -6
  12. data/bin/wu-clean-encoding +31 -0
  13. data/bin/wu-lign +2 -2
  14. data/bin/wu-local +69 -0
  15. data/bin/wu-server +70 -0
  16. data/examples/Gemfile +38 -0
  17. data/examples/README.md +9 -0
  18. data/examples/dataflow/apache_log_line.rb +64 -25
  19. data/examples/dataflow/fibonacci_series.rb +101 -0
  20. data/examples/dataflow/parse_apache_logs.rb +37 -7
  21. data/examples/{dataflow.rb → dataflow/scraper_macro_flow.rb} +0 -0
  22. data/examples/dataflow/simple.rb +4 -4
  23. data/examples/geo.rb +4 -0
  24. data/examples/geo/geo_grids.numbers +0 -0
  25. data/examples/geo/geolocated.rb +331 -0
  26. data/examples/geo/quadtile.rb +69 -0
  27. data/examples/geo/spec/geolocated_spec.rb +247 -0
  28. data/examples/geo/tile_fetcher.rb +77 -0
  29. data/examples/graph/minimum_spanning_tree.rb +61 -61
  30. data/examples/jabberwocky.txt +36 -0
  31. data/examples/models/wikipedia.rb +20 -0
  32. data/examples/munging/Gemfile +8 -0
  33. data/examples/munging/airline_flights/airline.rb +57 -0
  34. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  35. data/{lib/wukong/settings.rb → examples/munging/airline_flights/airplane.rb} +0 -0
  36. data/examples/munging/airline_flights/airport.rb +211 -0
  37. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  38. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  39. data/examples/munging/airline_flights/flight.rb +156 -0
  40. data/examples/munging/airline_flights/models.rb +4 -0
  41. data/examples/munging/airline_flights/parse.rb +26 -0
  42. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  43. data/examples/munging/airline_flights/route.rb +35 -0
  44. data/examples/munging/airline_flights/tasks.rake +83 -0
  45. data/examples/munging/airline_flights/timezone_fixup.rb +62 -0
  46. data/examples/munging/airline_flights/topcities.rb +167 -0
  47. data/examples/munging/airports/40_wbans.txt +40 -0
  48. data/examples/munging/airports/filter_weather_reports.rb +37 -0
  49. data/examples/munging/airports/join.pig +31 -0
  50. data/examples/munging/airports/to_tsv.rb +33 -0
  51. data/examples/munging/airports/usa_wbans.pig +19 -0
  52. data/examples/munging/airports/usa_wbans.txt +2157 -0
  53. data/examples/munging/airports/wbans.pig +19 -0
  54. data/examples/munging/airports/wbans.txt +2310 -0
  55. data/examples/munging/geo/geo_json.rb +54 -0
  56. data/examples/munging/geo/geo_models.rb +69 -0
  57. data/examples/munging/geo/geonames_models.rb +78 -0
  58. data/examples/munging/geo/iso_codes.rb +172 -0
  59. data/examples/munging/geo/reconcile_countries.rb +124 -0
  60. data/examples/munging/geo/tasks.rake +71 -0
  61. data/examples/munging/rake_helper.rb +62 -0
  62. data/examples/munging/weather/.gitignore +1 -0
  63. data/examples/munging/weather/Gemfile +4 -0
  64. data/examples/munging/weather/Rakefile +28 -0
  65. data/examples/munging/weather/extract_ish.rb +13 -0
  66. data/examples/munging/weather/models/weather.rb +119 -0
  67. data/examples/munging/weather/utils/noaa_downloader.rb +46 -0
  68. data/examples/munging/wikipedia/README.md +34 -0
  69. data/examples/munging/wikipedia/Rakefile +193 -0
  70. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  71. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  72. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  73. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  74. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  75. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  76. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  77. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +4 -0
  78. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  79. data/examples/munging/wikipedia/dbpedia/extract_links.rb +193 -0
  80. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  81. data/examples/munging/wikipedia/n1_subuniverse/n1_nodes.pig +18 -0
  82. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb +21 -0
  83. data/examples/munging/wikipedia/page_metadata/extract_page_metadata.rb.old +27 -0
  84. data/examples/munging/wikipedia/pagelinks/augment_pagelinks.pig +29 -0
  85. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb +14 -0
  86. data/examples/munging/wikipedia/pagelinks/extract_pagelinks.rb.old +25 -0
  87. data/examples/munging/wikipedia/pagelinks/undirect_pagelinks.pig +29 -0
  88. data/examples/munging/wikipedia/pageviews/augment_pageviews.pig +32 -0
  89. data/examples/munging/wikipedia/pageviews/extract_pageviews.rb +85 -0
  90. data/examples/munging/wikipedia/pig_style_guide.md +25 -0
  91. data/examples/munging/wikipedia/redirects/redirects_page_metadata.pig +19 -0
  92. data/examples/munging/wikipedia/subuniverse/sub_articles.pig +23 -0
  93. data/examples/munging/wikipedia/subuniverse/sub_page_metadata.pig +24 -0
  94. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_from.pig +22 -0
  95. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_into.pig +22 -0
  96. data/examples/munging/wikipedia/subuniverse/sub_pagelinks_within.pig +26 -0
  97. data/examples/munging/wikipedia/subuniverse/sub_pageviews.pig +29 -0
  98. data/examples/munging/wikipedia/subuniverse/sub_undirected_pagelinks_within.pig +24 -0
  99. data/examples/munging/wikipedia/utils/get_namespaces.rb +86 -0
  100. data/examples/munging/wikipedia/utils/munging_utils.rb +68 -0
  101. data/examples/munging/wikipedia/utils/namespaces.json +1 -0
  102. data/examples/rake_helper.rb +85 -0
  103. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  104. data/examples/server_logs/logline.rb +95 -0
  105. data/examples/server_logs/models.rb +66 -0
  106. data/examples/server_logs/page_counts.pig +48 -0
  107. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  108. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  109. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  110. data/{old/examples/server_logs/breadcrumbs.rb → examples/server_logs/server_logs-03-breadcrumbs-full.rb} +26 -30
  111. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  112. data/examples/string_reverser.rb +26 -0
  113. data/examples/text/pig_latin.rb +2 -2
  114. data/examples/text/regional_flavor/README.md +14 -0
  115. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  116. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  117. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  118. data/examples/word_count/accumulator.rb +26 -0
  119. data/examples/word_count/tokenizer.rb +13 -0
  120. data/examples/word_count/word_count.rb +6 -0
  121. data/examples/workflow/cherry_pie.dot +97 -0
  122. data/examples/workflow/cherry_pie.png +0 -0
  123. data/examples/workflow/cherry_pie.rb +61 -26
  124. data/lib/hanuman.rb +34 -7
  125. data/lib/hanuman/graph.rb +55 -31
  126. data/lib/hanuman/graphvizzer.rb +199 -178
  127. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  128. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  129. data/lib/hanuman/link.rb +35 -0
  130. data/lib/hanuman/registry.rb +46 -0
  131. data/lib/hanuman/stage.rb +76 -32
  132. data/lib/wukong.rb +23 -24
  133. data/lib/wukong/boot.rb +87 -0
  134. data/lib/wukong/configuration.rb +8 -0
  135. data/lib/wukong/dataflow.rb +45 -78
  136. data/lib/wukong/driver.rb +99 -0
  137. data/lib/wukong/emitter.rb +22 -0
  138. data/lib/wukong/model/faker.rb +24 -24
  139. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  140. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  141. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  142. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  143. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  144. data/lib/wukong/processor.rb +60 -114
  145. data/lib/wukong/spec_helpers.rb +81 -0
  146. data/lib/wukong/spec_helpers/integration_driver.rb +144 -0
  147. data/lib/wukong/spec_helpers/integration_driver_matchers.rb +219 -0
  148. data/lib/wukong/spec_helpers/processor_helpers.rb +95 -0
  149. data/lib/wukong/spec_helpers/processor_methods.rb +108 -0
  150. data/lib/wukong/spec_helpers/shared_examples.rb +15 -0
  151. data/lib/wukong/spec_helpers/spec_driver.rb +28 -0
  152. data/lib/wukong/spec_helpers/spec_driver_matchers.rb +195 -0
  153. data/lib/wukong/version.rb +2 -1
  154. data/lib/wukong/widget/filters.rb +311 -0
  155. data/lib/wukong/widget/processors.rb +156 -0
  156. data/lib/wukong/widget/reducers.rb +7 -0
  157. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  158. data/lib/wukong/widget/reducers/bin.rb +318 -0
  159. data/lib/wukong/widget/reducers/count.rb +61 -0
  160. data/lib/wukong/widget/reducers/group.rb +85 -0
  161. data/lib/wukong/widget/reducers/group_concat.rb +70 -0
  162. data/lib/wukong/widget/reducers/moments.rb +72 -0
  163. data/lib/wukong/widget/reducers/sort.rb +130 -0
  164. data/lib/wukong/widget/serializers.rb +287 -0
  165. data/lib/wukong/widget/sink.rb +10 -52
  166. data/lib/wukong/widget/source.rb +7 -113
  167. data/lib/wukong/widget/utils.rb +46 -0
  168. data/lib/wukong/widgets.rb +6 -0
  169. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  170. data/spec/examples/dataflow/parsing_spec.rb +12 -11
  171. data/spec/examples/dataflow/simple_spec.rb +32 -6
  172. data/spec/examples/dataflow/telegram_spec.rb +36 -36
  173. data/spec/examples/graph/minimum_spanning_tree_spec.rb +30 -31
  174. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  175. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  176. data/spec/examples/text/pig_latin_spec.rb +13 -16
  177. data/spec/examples/workflow/cherry_pie_spec.rb +34 -4
  178. data/spec/hanuman/graph_spec.rb +27 -2
  179. data/spec/hanuman/hanuman_spec.rb +10 -0
  180. data/spec/hanuman/registry_spec.rb +123 -0
  181. data/spec/hanuman/stage_spec.rb +61 -7
  182. data/spec/spec_helper.rb +29 -19
  183. data/spec/support/hanuman_test_helpers.rb +14 -12
  184. data/spec/support/shared_context_for_reducers.rb +37 -0
  185. data/spec/support/shared_examples_for_builders.rb +101 -0
  186. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  187. data/spec/support/wukong_test_helpers.rb +37 -11
  188. data/spec/wukong/dataflow_spec.rb +77 -55
  189. data/spec/wukong/local_runner_spec.rb +24 -24
  190. data/spec/wukong/model/faker_spec.rb +132 -131
  191. data/spec/wukong/runner_spec.rb +8 -8
  192. data/spec/wukong/widget/filters_spec.rb +61 -0
  193. data/spec/wukong/widget/processors_spec.rb +126 -0
  194. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  195. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  196. data/spec/wukong/widget/reducers/group_spec.rb +20 -0
  197. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  198. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  199. data/spec/wukong/widget/serializers_spec.rb +92 -0
  200. data/spec/wukong/widget/sink_spec.rb +15 -15
  201. data/spec/wukong/widget/source_spec.rb +65 -41
  202. data/spec/wukong/wukong_spec.rb +10 -0
  203. data/wukong.gemspec +17 -10
  204. metadata +359 -335
  205. data/.document +0 -5
  206. data/VERSION +0 -1
  207. data/bin/hdp-bin +0 -44
  208. data/bin/hdp-bzip +0 -23
  209. data/bin/hdp-cat +0 -3
  210. data/bin/hdp-catd +0 -3
  211. data/bin/hdp-cp +0 -3
  212. data/bin/hdp-du +0 -86
  213. data/bin/hdp-get +0 -3
  214. data/bin/hdp-kill +0 -3
  215. data/bin/hdp-kill-task +0 -3
  216. data/bin/hdp-ls +0 -11
  217. data/bin/hdp-mkdir +0 -2
  218. data/bin/hdp-mkdirp +0 -12
  219. data/bin/hdp-mv +0 -3
  220. data/bin/hdp-parts_to_keys.rb +0 -77
  221. data/bin/hdp-ps +0 -3
  222. data/bin/hdp-put +0 -3
  223. data/bin/hdp-rm +0 -32
  224. data/bin/hdp-sort +0 -40
  225. data/bin/hdp-stream +0 -40
  226. data/bin/hdp-stream-flat +0 -22
  227. data/bin/hdp-stream2 +0 -39
  228. data/bin/hdp-sync +0 -17
  229. data/bin/hdp-wc +0 -67
  230. data/bin/wu-flow +0 -10
  231. data/bin/wu-map +0 -17
  232. data/bin/wu-red +0 -17
  233. data/bin/wukong +0 -17
  234. data/data/CREDITS.md +0 -355
  235. data/data/graph/airfares.tsv +0 -2174
  236. data/data/text/gift_of_the_magi.txt +0 -225
  237. data/data/text/jabberwocky.txt +0 -36
  238. data/data/text/rectification_of_names.txt +0 -33
  239. data/data/twitter/a_atsigns_b.tsv +0 -64
  240. data/data/twitter/a_follows_b.tsv +0 -53
  241. data/data/twitter/tweet.tsv +0 -167
  242. data/data/twitter/twitter_user.tsv +0 -55
  243. data/data/wikipedia/dbpedia-sentences.tsv +0 -1000
  244. data/docpages/INSTALL.textile +0 -92
  245. data/docpages/LICENSE.textile +0 -107
  246. data/docpages/README-elastic_map_reduce.textile +0 -377
  247. data/docpages/README-performance.textile +0 -90
  248. data/docpages/README-wulign.textile +0 -65
  249. data/docpages/UsingWukong-part1-get_ready.textile +0 -17
  250. data/docpages/UsingWukong-part2-ThinkingBigData.textile +0 -75
  251. data/docpages/UsingWukong-part3-parsing.textile +0 -138
  252. data/docpages/_config.yml +0 -39
  253. data/docpages/avro/avro_notes.textile +0 -56
  254. data/docpages/avro/performance.textile +0 -36
  255. data/docpages/avro/tethering.textile +0 -19
  256. data/docpages/bigdata-tips.textile +0 -143
  257. data/docpages/code/api_response_example.txt +0 -20
  258. data/docpages/code/parser_skeleton.rb +0 -38
  259. data/docpages/diagrams/MapReduceDiagram.graffle +0 -0
  260. data/docpages/favicon.ico +0 -0
  261. data/docpages/gem.css +0 -16
  262. data/docpages/hadoop-tips.textile +0 -83
  263. data/docpages/index.textile +0 -92
  264. data/docpages/intro.textile +0 -8
  265. data/docpages/moreinfo.textile +0 -174
  266. data/docpages/news.html +0 -24
  267. data/docpages/pig/PigLatinExpressionsList.txt +0 -122
  268. data/docpages/pig/PigLatinReferenceManual.txt +0 -1640
  269. data/docpages/pig/commandline_params.txt +0 -26
  270. data/docpages/pig/cookbook.html +0 -481
  271. data/docpages/pig/images/hadoop-logo.jpg +0 -0
  272. data/docpages/pig/images/instruction_arrow.png +0 -0
  273. data/docpages/pig/images/pig-logo.gif +0 -0
  274. data/docpages/pig/piglatin_ref1.html +0 -1103
  275. data/docpages/pig/piglatin_ref2.html +0 -14340
  276. data/docpages/pig/setup.html +0 -505
  277. data/docpages/pig/skin/basic.css +0 -166
  278. data/docpages/pig/skin/breadcrumbs.js +0 -237
  279. data/docpages/pig/skin/fontsize.js +0 -166
  280. data/docpages/pig/skin/getBlank.js +0 -40
  281. data/docpages/pig/skin/getMenu.js +0 -45
  282. data/docpages/pig/skin/images/chapter.gif +0 -0
  283. data/docpages/pig/skin/images/chapter_open.gif +0 -0
  284. data/docpages/pig/skin/images/current.gif +0 -0
  285. data/docpages/pig/skin/images/external-link.gif +0 -0
  286. data/docpages/pig/skin/images/header_white_line.gif +0 -0
  287. data/docpages/pig/skin/images/page.gif +0 -0
  288. data/docpages/pig/skin/images/pdfdoc.gif +0 -0
  289. data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
  290. data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
  291. data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  292. data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
  293. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
  294. data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  295. data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
  296. data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
  297. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
  298. data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
  299. data/docpages/pig/skin/print.css +0 -54
  300. data/docpages/pig/skin/profile.css +0 -181
  301. data/docpages/pig/skin/screen.css +0 -587
  302. data/docpages/pig/tutorial.html +0 -1059
  303. data/docpages/pig/udf.html +0 -1509
  304. data/docpages/tutorial.textile +0 -283
  305. data/docpages/usage.textile +0 -195
  306. data/docpages/wutils.textile +0 -263
  307. data/examples/dataflow/complex.rb +0 -11
  308. data/examples/dataflow/donuts.rb +0 -13
  309. data/examples/tiny_count/jabberwocky_output.tsv +0 -92
  310. data/examples/word_count.rb +0 -48
  311. data/examples/workflow/fiddle.rb +0 -24
  312. data/lib/away/escapement.rb +0 -129
  313. data/lib/away/exe.rb +0 -11
  314. data/lib/away/experimental.rb +0 -5
  315. data/lib/away/from_file.rb +0 -52
  316. data/lib/away/job.rb +0 -56
  317. data/lib/away/job/rake_compat.rb +0 -17
  318. data/lib/away/registry.rb +0 -79
  319. data/lib/away/runner.rb +0 -276
  320. data/lib/away/runner/execute.rb +0 -121
  321. data/lib/away/script.rb +0 -161
  322. data/lib/away/script/hadoop_command.rb +0 -240
  323. data/lib/away/source/file_list_source.rb +0 -15
  324. data/lib/away/source/looper.rb +0 -18
  325. data/lib/away/task.rb +0 -219
  326. data/lib/hanuman/action.rb +0 -21
  327. data/lib/hanuman/chain.rb +0 -4
  328. data/lib/hanuman/graphviz.rb +0 -74
  329. data/lib/hanuman/resource.rb +0 -6
  330. data/lib/hanuman/slot.rb +0 -87
  331. data/lib/hanuman/slottable.rb +0 -220
  332. data/lib/wukong/bad_record.rb +0 -15
  333. data/lib/wukong/event.rb +0 -44
  334. data/lib/wukong/local_runner.rb +0 -55
  335. data/lib/wukong/mapred.rb +0 -3
  336. data/lib/wukong/universe.rb +0 -48
  337. data/lib/wukong/widget/filter.rb +0 -81
  338. data/lib/wukong/widget/gibberish.rb +0 -123
  339. data/lib/wukong/widget/monitor.rb +0 -26
  340. data/lib/wukong/widget/reducer.rb +0 -66
  341. data/lib/wukong/widget/stringifier.rb +0 -50
  342. data/lib/wukong/workflow.rb +0 -22
  343. data/lib/wukong/workflow/command.rb +0 -42
  344. data/old/config/emr-example.yaml +0 -48
  345. data/old/examples/README.txt +0 -17
  346. data/old/examples/contrib/jeans/README.markdown +0 -165
  347. data/old/examples/contrib/jeans/data/normalized_sizes +0 -3
  348. data/old/examples/contrib/jeans/data/orders.tsv +0 -1302
  349. data/old/examples/contrib/jeans/data/sizes +0 -3
  350. data/old/examples/contrib/jeans/normalize.rb +0 -20
  351. data/old/examples/contrib/jeans/sizes.rb +0 -55
  352. data/old/examples/corpus/bnc_word_freq.rb +0 -44
  353. data/old/examples/corpus/bucket_counter.rb +0 -47
  354. data/old/examples/corpus/dbpedia_abstract_to_sentences.rb +0 -86
  355. data/old/examples/corpus/sentence_bigrams.rb +0 -53
  356. data/old/examples/corpus/sentence_coocurrence.rb +0 -66
  357. data/old/examples/corpus/stopwords.rb +0 -138
  358. data/old/examples/corpus/words_to_bigrams.rb +0 -53
  359. data/old/examples/emr/README.textile +0 -110
  360. data/old/examples/emr/dot_wukong_dir/credentials.json +0 -7
  361. data/old/examples/emr/dot_wukong_dir/emr.yaml +0 -69
  362. data/old/examples/emr/dot_wukong_dir/emr_bootstrap.sh +0 -33
  363. data/old/examples/emr/elastic_mapreduce_example.rb +0 -28
  364. data/old/examples/network_graph/adjacency_list.rb +0 -74
  365. data/old/examples/network_graph/breadth_first_search.rb +0 -72
  366. data/old/examples/network_graph/gen_2paths.rb +0 -68
  367. data/old/examples/network_graph/gen_multi_edge.rb +0 -112
  368. data/old/examples/network_graph/gen_symmetric_links.rb +0 -64
  369. data/old/examples/pagerank/README.textile +0 -6
  370. data/old/examples/pagerank/gen_initial_pagerank_graph.pig +0 -57
  371. data/old/examples/pagerank/pagerank.rb +0 -72
  372. data/old/examples/pagerank/pagerank_initialize.rb +0 -42
  373. data/old/examples/pagerank/run_pagerank.sh +0 -21
  374. data/old/examples/sample_records.rb +0 -33
  375. data/old/examples/server_logs/apache_log_parser.rb +0 -15
  376. data/old/examples/server_logs/nook.rb +0 -48
  377. data/old/examples/server_logs/nook/faraday_dummy_adapter.rb +0 -94
  378. data/old/examples/server_logs/user_agent.rb +0 -40
  379. data/old/examples/simple_word_count.rb +0 -82
  380. data/old/examples/size.rb +0 -61
  381. data/old/examples/stats/avg_value_frequency.rb +0 -86
  382. data/old/examples/stats/binning_percentile_estimator.rb +0 -140
  383. data/old/examples/stats/data/avg_value_frequency.tsv +0 -3
  384. data/old/examples/stats/rank_and_bin.rb +0 -173
  385. data/old/examples/stupidly_simple_filter.rb +0 -40
  386. data/old/examples/word_count.rb +0 -75
  387. data/old/graph/graphviz_builder.rb +0 -580
  388. data/old/graph_easy/Attributes.pm +0 -4181
  389. data/old/graph_easy/Graphviz.pm +0 -2232
  390. data/old/wukong.rb +0 -18
  391. data/old/wukong/and_pig.rb +0 -38
  392. data/old/wukong/bad_record.rb +0 -18
  393. data/old/wukong/datatypes.rb +0 -24
  394. data/old/wukong/datatypes/enum.rb +0 -127
  395. data/old/wukong/datatypes/fake_types.rb +0 -17
  396. data/old/wukong/decorator.rb +0 -28
  397. data/old/wukong/encoding/asciize.rb +0 -108
  398. data/old/wukong/extensions.rb +0 -16
  399. data/old/wukong/extensions/array.rb +0 -18
  400. data/old/wukong/extensions/blank.rb +0 -93
  401. data/old/wukong/extensions/class.rb +0 -189
  402. data/old/wukong/extensions/date_time.rb +0 -53
  403. data/old/wukong/extensions/emittable.rb +0 -69
  404. data/old/wukong/extensions/enumerable.rb +0 -79
  405. data/old/wukong/extensions/hash.rb +0 -167
  406. data/old/wukong/extensions/hash_keys.rb +0 -16
  407. data/old/wukong/extensions/hash_like.rb +0 -150
  408. data/old/wukong/extensions/hashlike_class.rb +0 -47
  409. data/old/wukong/extensions/module.rb +0 -2
  410. data/old/wukong/extensions/pathname.rb +0 -27
  411. data/old/wukong/extensions/string.rb +0 -65
  412. data/old/wukong/extensions/struct.rb +0 -17
  413. data/old/wukong/extensions/symbol.rb +0 -11
  414. data/old/wukong/filename_pattern.rb +0 -74
  415. data/old/wukong/helper.rb +0 -7
  416. data/old/wukong/helper/stopwords.rb +0 -195
  417. data/old/wukong/helper/tokenize.rb +0 -35
  418. data/old/wukong/logger.rb +0 -38
  419. data/old/wukong/periodic_monitor.rb +0 -72
  420. data/old/wukong/schema.rb +0 -269
  421. data/old/wukong/script.rb +0 -286
  422. data/old/wukong/script/avro_command.rb +0 -5
  423. data/old/wukong/script/cassandra_loader_script.rb +0 -40
  424. data/old/wukong/script/emr_command.rb +0 -168
  425. data/old/wukong/script/hadoop_command.rb +0 -237
  426. data/old/wukong/script/local_command.rb +0 -41
  427. data/old/wukong/store.rb +0 -10
  428. data/old/wukong/store/base.rb +0 -27
  429. data/old/wukong/store/cassandra.rb +0 -10
  430. data/old/wukong/store/cassandra/streaming.rb +0 -75
  431. data/old/wukong/store/cassandra/struct_loader.rb +0 -21
  432. data/old/wukong/store/cassandra_model.rb +0 -91
  433. data/old/wukong/store/chh_chunked_flat_file_store.rb +0 -37
  434. data/old/wukong/store/chunked_flat_file_store.rb +0 -48
  435. data/old/wukong/store/conditional_store.rb +0 -57
  436. data/old/wukong/store/factory.rb +0 -8
  437. data/old/wukong/store/flat_file_store.rb +0 -89
  438. data/old/wukong/store/key_store.rb +0 -51
  439. data/old/wukong/store/null_store.rb +0 -15
  440. data/old/wukong/store/read_thru_store.rb +0 -22
  441. data/old/wukong/store/tokyo_tdb_key_store.rb +0 -33
  442. data/old/wukong/store/tyrant_rdb_key_store.rb +0 -57
  443. data/old/wukong/store/tyrant_tdb_key_store.rb +0 -20
  444. data/old/wukong/streamer.rb +0 -30
  445. data/old/wukong/streamer/accumulating_reducer.rb +0 -83
  446. data/old/wukong/streamer/base.rb +0 -126
  447. data/old/wukong/streamer/counting_reducer.rb +0 -25
  448. data/old/wukong/streamer/filter.rb +0 -20
  449. data/old/wukong/streamer/instance_streamer.rb +0 -15
  450. data/old/wukong/streamer/json_streamer.rb +0 -21
  451. data/old/wukong/streamer/line_streamer.rb +0 -12
  452. data/old/wukong/streamer/list_reducer.rb +0 -31
  453. data/old/wukong/streamer/rank_and_bin_reducer.rb +0 -145
  454. data/old/wukong/streamer/record_streamer.rb +0 -14
  455. data/old/wukong/streamer/reducer.rb +0 -11
  456. data/old/wukong/streamer/set_reducer.rb +0 -14
  457. data/old/wukong/streamer/struct_streamer.rb +0 -48
  458. data/old/wukong/streamer/summing_reducer.rb +0 -29
  459. data/old/wukong/streamer/uniq_by_last_reducer.rb +0 -51
  460. data/old/wukong/typed_struct.rb +0 -12
  461. data/spec/away/encoding_spec.rb +0 -32
  462. data/spec/away/exe_spec.rb +0 -20
  463. data/spec/away/flow_spec.rb +0 -82
  464. data/spec/away/graph_spec.rb +0 -6
  465. data/spec/away/job_spec.rb +0 -15
  466. data/spec/away/rake_compat_spec.rb +0 -9
  467. data/spec/away/script_spec.rb +0 -81
  468. data/spec/hanuman/graphviz_spec.rb +0 -29
  469. data/spec/hanuman/slot_spec.rb +0 -2
  470. data/spec/support/examples_helper.rb +0 -10
  471. data/spec/support/streamer_test_helpers.rb +0 -6
  472. data/spec/support/wukong_widget_helpers.rb +0 -66
  473. data/spec/wukong/processor_spec.rb +0 -109
  474. data/spec/wukong/widget/filter_spec.rb +0 -99
  475. data/spec/wukong/widget/stringifier_spec.rb +0 -51
  476. data/spec/wukong/workflow/command_spec.rb +0 -5
@@ -0,0 +1,36 @@
1
+ "Jabberwocky"
2
+
3
+ 'Twas brillig, and the slithy toves
4
+ Did gyre and gimble in the wabe;
5
+ All mimsy were the borogoves,
6
+ And the mome raths outgrabe.
7
+
8
+ "Beware the Jabberwock, my son!
9
+ The jaws that bite, the claws that catch!
10
+ Beware the Jubjub bird, and shun
11
+ The frumious Bandersnatch!"
12
+
13
+ He took his vorpal sword in hand:
14
+ Long time the manxome foe he sought—
15
+ So rested he by the Tumtum tree,
16
+ And stood awhile in thought.
17
+
18
+ And as in uffish thought he stood,
19
+ The Jabberwock, with eyes of flame,
20
+ Came whiffling through the tulgey wood,
21
+ And burbled as it came!
22
+
23
+ One, two! One, two! and through and through
24
+ The vorpal blade went snicker-snack!
25
+ He left it dead, and with its head
26
+ He went galumphing back.
27
+
28
+ "And hast thou slain the Jabberwock?
29
+ Come to my arms, my beamish boy!
30
+ O frabjous day! Callooh! Callay!"
31
+ He chortled in his joy.
32
+
33
+ 'Twas brillig, and the slithy toves
34
+ Did gyre and gimble in the wabe;
35
+ All mimsy were the borogoves,
36
+ And the mome raths outgrabe.
@@ -0,0 +1,20 @@
1
+
2
module Wu
  module Data
    module Wikipedia

      # A single Wikipedia article record, declared as a Gorillib model.
      #
      # Declares nine fields: title, namespace, id, restrictions, revision_id,
      # timestamp, sha1, redirect and xml_text. `namespace` and `id` are
      # Integers; every other field is a String.
      #
      # NOTE(review): the field names look like they mirror the <page> element
      # of a MediaWiki XML dump -- confirm against the code that builds these.
      class Article
        include Gorillib::Model
        field :title,        String
        field :namespace,    Integer
        field :id,           Integer
        field :restrictions, String
        field :revision_id,  String
        field :timestamp,    String
        field :sha1,         String
        field :redirect,     String
        field :xml_text,     String
      end

    end
  end
end
@@ -0,0 +1,8 @@
1
source 'https://rubygems.org'

# gorillib and wukong are loaded from local checkouts rather than from the
# gem source. The locations default to the paths originally hard-coded here;
# set GORILLIB_PATH / WUKONG_PATH in the environment to point at your own
# checkouts instead.
gem 'gorillib', :path => ENV.fetch('GORILLIB_PATH', '/home/dlaw/dev/gorillib')
gem 'ruby-progressbar'
gem 'crack'
gem 'rake'
gem 'wukong',   :path => ENV.fetch('WUKONG_PATH', '/home/dlaw/dev/wukong_og')
gem 'json'
@@ -0,0 +1,57 @@
1
# An airline record, declared as a Gorillib model.
#
# The ICAO code, IATA code, OpenFlights id and callsign are all flagged as
# identifiers. RawOpenflightAirline#to_airline builds instances of this class
# by passing its compact attributes to Airline.receive.
class Airline
  include Gorillib::Model
  # The ICAO code is documented as three letters, so its declared length is 3
  # (the IATA code below is the two-letter one).
  field :icao_id,      String,   doc: "3-letter ICAO code, if available", identifier: true, length: 3
  field :iata_id,      String,   doc: "2-letter IATA code, if available", identifier: true, length: 2
  field :airline_ofid, Integer,  doc: "Unique OpenFlights identifier for this airline.", identifier: true
  field :active,       :boolean, doc: 'true if the airline is or has until recently been operational, false if it is defunct. (This is only a rough indication and should not be taken as 100% accurate)'
  field :country,      String,   doc: "Country or territory where airline is incorporated"
  field :name,         String,   doc: "Airline name."
  field :callsign,     String,   doc: "Airline callsign", identifier: true
  field :alias,        String,   doc: "Alias of the airline. For example, 'All Nippon Airways' is commonly known as 'ANA'"
end
12
+
13
#
# One row of the OpenFlights airlines CSV, before clean-up.
#
# As of January 2012, the OpenFlights Airlines Database contains 5888
# airlines. If you enjoy this data, please consider [visiting their page and
# donating](http://openflights.org/data.html)
#
# > Notes: Airlines with null codes/callsigns/countries generally represent
# > user-added airlines. Since the data is intended primarily for current
# > flights, defunct IATA codes are generally not included. For example,
# > "Sabena" is not listed with a SN IATA code, since "SN" is presently used by
# > its successor Brussels Airlines.
#
# Sample entries
#
#     324,"All Nippon Airways","ANA All Nippon Airways","NH","ANA","ALL NIPPON","Japan","Y"
#     412,"Aerolineas Argentinas",\N,"AR","ARG","ARGENTINA","Argentina","Y"
#     413,"Arrowhead Airways",\N,"","ARH","ARROWHEAD","United States","N"
#
# Fields are declared in the same order as the CSV columns shown above.
#
class RawOpenflightAirline
  include Gorillib::Model
  include Gorillib::Model::LoadFromCsv

  # Raw values passed as the `blankish:` option of every field below.
  BLANKISH_STRINGS = ["", nil, "NULL", '\\N', "NONE", "NA", "Null", "..."]

  field :airline_ofid, Integer,  blankish: BLANKISH_STRINGS, doc: "Unique OpenFlights identifier for this airline.", identifier: true
  field :name,         String,   blankish: BLANKISH_STRINGS, doc: "Airline name."
  field :alias,        String,   blankish: BLANKISH_STRINGS, doc: "Alias of the airline. For example, 'All Nippon Airways' is commonly known as 'ANA'"
  field :iata_id,      String,   blankish: BLANKISH_STRINGS, doc: "2-letter IATA code, if available", identifier: true, length: 2
  # The ICAO code is three letters (see the sample entries: "ANA", "ARG",
  # "ARH"), so its declared length is 3.
  field :icao_id,      String,   blankish: BLANKISH_STRINGS, doc: "3-letter ICAO code, if available", identifier: true, length: 3
  field :callsign,     String,   blankish: BLANKISH_STRINGS, doc: "Airline callsign"
  field :country,      String,   blankish: BLANKISH_STRINGS, doc: "Country or territory where airline is incorporated"
  field :active,       :boolean, blankish: BLANKISH_STRINGS, doc: 'true if the airline is or has until recently been operational, false if it is defunct. (This is only a rough indication and should not be taken as 100% accurate)'

  # Accept the IATA code only when it consists entirely of word characters;
  # for any other value `super` is not called and the method returns nil.
  def receive_iata_id(val)
    super if val =~ /\A\w+\z/
  end

  # Accept the ICAO code only when it consists entirely of word characters;
  # for any other value `super` is not called and the method returns nil.
  def receive_icao_id(val)
    super if val =~ /\A\w+\z/
  end

  # Translate the raw "Y" / "N" flag into true / false before handing it on;
  # any other value is passed to `super` unchanged.
  def receive_active(val)
    flag =
      case val.to_s
      when "Y" then true
      when "N" then false
      else val
      end
    super(flag)
  end

  # @return [Airline] an Airline received from this record's compact attributes
  def to_airline
    Airline.receive(compact_attributes)
  end

  # Load the CSV at `filename` and yield each row to the caller's block as an
  # Airline (converted with #to_airline). A block is required.
  def self.load_airlines(filename)
    load_csv(filename) { |raw_airline| yield(raw_airline.to_airline) }
  end
end
@@ -0,0 +1,83 @@
1
+ require_relative('../../rake_helper')
2
+ require_relative('./models')
3
+
4
# Named paths used by the airline_flights tasks. Raw inputs live under
# :af_data, parsed outputs under :af_work; files whose name embeds the
# :mini_slug setting vary with that setting.
mini_slug = Settings[:mini_slug]

Pathname.register_paths(
  af_data:                  [:data, 'airline_flights'],
  af_work:                  [:work, 'airline_flights'],
  af_code:                  File.dirname(__FILE__),
  # raw inputs
  openflights_raw_airports: [:af_data, "openflights_airports-raw#{mini_slug}.csv"],
  openflights_raw_airlines: [:af_data, "openflights_airlines-raw.csv"],
  dataexpo_raw_airports:    [:af_data, "dataexpo_airports-raw#{mini_slug}.csv"],
  wikipedia_icao:           [:af_data, "wikipedia_icao.tsv"],
  wikipedia_iata:           [:af_data, "wikipedia_iata.tsv"],
  wikipedia_us_abroad:      [:af_data, "wikipedia_us_abroad.tsv"],
  # parsed outputs
  openflights_airports:     [:af_work, "openflights_airports-parsed#{mini_slug}.tsv"],
  openflights_airlines:     [:af_work, "openflights_airlines-parsed#{mini_slug}.tsv"],
  dataexpo_airports:        [:af_work, "dataexpo_airports-parsed#{mini_slug}.tsv"],
  airport_identifiers:      [:af_work, "airport_identifiers.tsv"],
  airport_identifiers_mini: [:af_work, "airport_identifiers-sample.tsv"],
  # helpers
  country_name_lookup:      [:work, 'geo', "country_name_lookup.tsv"],
)
24
+
25
# Task chain for the airline_flights example. Only the OpenFlights airport
# parse is currently enabled; the other steps are kept below, commented out.
chain :airline_flights do
  code_glob  = Pathname.of(:af_code, '*.rb').to_s
  code_files = FileList[code_glob]

  chain(:parse) do

    # desc 'parse the dataexpo airports'
    # create_file(:dataexpo_airports, after: code_files) do |dest|
    #   RawDataexpoAirport.load_airports(:dataexpo_raw_airports) do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end

    desc 'parse the openflights airports'
    create_file(:openflights_airports, after: [code_files, :force]) do |dest|
      # Loaded here rather than at the top of the file, so the geo models are
      # only pulled in when this task actually runs.
      require_relative('../geo/geo_models')
      Geo::CountryNameLookup.load
      RawOpenflightAirport.load_airports(:openflights_raw_airports) do |airport|
        dest << airport.to_tsv
        dest << "\n"
        # puts airport.country
      end
    end

    # task :reconcile_airports => [:dataexpo_airports, :openflights_airports] do
    #   require_relative 'reconcile_airports'
    #   Airport::IdReconciler.load_all
    # end
    #
    # desc 'run the identifier reconciler'
    # create_file(:airport_identifiers, after: code_files, invoke: 'airline_flights:parse:reconcile_airports') do |dest|
    #   Airport::IdReconciler.airports.each do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end
    #
    # desc 'run the identifier reconciler'
    # create_file(:airport_identifiers_mini, after: code_files, invoke: 'airline_flights:parse:reconcile_airports') do |dest|
    #   Airport::IdReconciler.exemplars.each do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end
    #
    # desc 'parse the openflights airlines'
    # create_file(:openflights_airlines, after: code_files) do |dest|
    #   RawOpenflightAirline.load_airlines(:openflights_raw_airlines) do |airline|
    #     dest << airline.to_tsv << "\n"
    #     puts airline.to_tsv
    #   end
    # end

  end
end
75
+
76
# The default task runs the whole airline_flights chain. The individual parse
# steps are listed (disabled) for reference.
task default: [
  'airline_flights',
  # 'airline_flights:parse:dataexpo_airports',
  # 'airline_flights:parse:openflights_airports',
  # 'airline_flights:parse:airport_identifiers',
  # 'airline_flights:parse:airport_identifiers_mini',
  # 'airline_flights:parse:openflights_airlines',
]
@@ -0,0 +1,211 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ ### @export "airport_model"
4
# An airport, identified by any of its ICAO, IATA, FAA or OpenFlights ids.
#
# The numeric validation ranges are inclusive at both ends: the previous
# exclusive ranges rejected legitimate boundary values, and the UTC-offset
# range stopped short of +12 even though offsets up to +14 are in use.
class Airport
  include Gorillib::Model

  field :icao,         String, doc: "4-letter ICAO code, or blank if not assigned.", length: 4, identifier: true, blankish: ["", nil]
  field :iata,         String, doc: "3-letter IATA code, or blank if not assigned.", length: 3, identifier: true, blankish: ["", nil]
  field :faa,          String, doc: "3-letter FAA code, or blank if not assigned.",  length: 3, identifier: true, blankish: ["", nil]
  # Real-world offsets span UTC-12 through UTC+14.
  field :utc_offset,   Float,  doc: "Hours offset from UTC. Fractional hours are expressed as decimals, eg. India is 5.5.", validates: { inclusion: (-12..14) }
  field :dst_rule,     String, doc: "Daylight savings time rule. One of E (Europe), A (US/Canada), S (South America), O (Australia), Z (New Zealand), N (None) or U (Unknown). See the readme for more.", validates: { inclusion: %w[E A S O Z N U] }
  field :longitude,    Float,  doc: "Decimal degrees, usually to six significant digits. Negative is West, positive is East.", validates: { inclusion: (-180.0..180.0) }
  field :latitude,     Float,  doc: "Decimal degrees, usually to six significant digits. Negative is South, positive is North.", validates: { inclusion: (-90.0..90.0) }
  field :altitude,     Float,  doc: "Elevation in meters."
  field :name,         String, doc: "Name of airport."
  field :country,      String, doc: "Country or territory where airport is located.", length: 2
  field :state,        String, doc: "State in which the airport is located", length: 2
  field :city,         String, doc: "Main city served by airport. This is the logical city it serves; so, for example SFO gets 'San Francisco', not 'San Bruno'"
  field :airport_ofid, String, doc: "OpenFlights identifier for this airport.", identifier: true
end
21
+ ### @export "nil"
22
# Helper methods on Airport: time conversion, data linting and display.
class Airport
  # Identifiers of fifty sample airports.
  EXEMPLARS = %w[
    ANC ATL AUS BDL BNA BOI BOS BWI CLE CLT
    CMH DCA DEN DFW DTW EWR FLL HNL IAD IAH
    IND JAX JFK LAS LAX LGA MCI MCO MDW MIA
    MSP MSY OAK ORD PDX PHL PHX PIT PVD RDU
    SAN SEA SFO SJC SJU SLC SMF STL TPA YYZ
  ]

  # Shift `tm` by this airport's UTC offset, plus one hour when
  # TimezoneFixup reports daylight savings for `tm`.
  #
  # `utc_offset` is stored in hours while the DST step is expressed in seconds
  # (60*60), so the offset is converted to seconds before it is added;
  # previously the raw hour count was added as if it were seconds.
  #
  # NOTE(review): `get_utc` is not a core Time method (the core name is
  # `getutc`) -- presumably `tm` is a project type or an extension supplies
  # it; confirm. The sign of the shift is unchanged from the original.
  #
  # @param tm an object responding to `get_utc`
  # @return the shifted time
  def utc_time_for(tm)
    utc_time = tm.get_utc + (utc_offset * 60 * 60)
    utc_time += (60*60) if TimezoneFixup.dst?(tm)
    utc_time
  end

  # Placeholder strings that stand for "no value" in the raw data.
  BLANKISH_STRINGS = ["", nil, "NULL", '\\N', "NONE", "NA", "Null", "..."]
  # Despite its name this matches a character that is NOT in the accepted set
  # (the class is negated), so a match means "contains an unexpected character".
  OK_CHARS_RE = /[^a-zA-Z0-9\:\ \/\.\,\-\(\)\'ÁÂÄÅÇÉÍÎÑÓÖØÚÜÞàáâãäåæçèéêëìíîïðñóôõöøúüýĀāăĆćČčēėęěğīİıŁłńņňŌōőřŞşŠšţťūůųźŽžơț]/

  # Collect data-quality complaints about this record.
  #
  # @return [Hash] complaint => offending value(s); the :spaces and :funny
  #   entries hold [attribute, value] pairs. The result is passed through
  #   `compact_blank` before being returned.
  def lint
    errors = {}
    errors["ICAO is wrong length"] = icao if icao.present? && icao.length != 4
    # A K-prefixed ICAO is expected to be exactly "K" + the FAA id.
    if icao && faa && (icao =~ /\AK.../) && (icao != "K#{faa}")
      errors["ICAO != K+FAA yet ICAO is a K..."] = [icao, faa]
    end
    # errors["ICAO present for piddlyshit airport"] = icao if icao.present? && ((faa.to_s.length == 4) || (faa.to_s =~ /\d/))
    errors[:spaces] = []
    errors[:funny]  = []
    attributes.each do |attr, val|
      next if val.blank?
      errors["#{attr} looks blankish"] = val if BLANKISH_STRINGS.include?(val)
      next unless val.is_a?(String)
      errors[:spaces] << [attr, val] if val.strip != val
      errors[:funny]  << [attr, val] if val =~ OK_CHARS_RE
    end
    errors.compact_blank
  end

  # Tab-separated one-line summary wrapped in "#<Airport ...>".
  # Note that `country` appears twice in the output; that is kept as-is so the
  # column layout does not change.
  def to_s
    cells = [
      icao, iata, faa,
      (latitude && "%4.1f" % latitude), (longitude && "%5.1f" % longitude), state, country,
      "%-30s" % name, country, city,
    ]
    "#<Airport #{cells.join("\t")}>"
  end

  # Truthy when the ICAO code starts with one of the prefixes listed in the
  # pattern (K, P[ABFGHJKMOPW], T[IJ], NSAS/NSFQ/NSTU); nil otherwise,
  # including when there is no ICAO code.
  def faa_controlled?
    icao =~ /\A(?:K|P[ABFGHJKMOPW]|T[IJ]|NS(AS|FQ|TU))/
  end
end
72
+ ### @export "airport_load"
73
# TSV loading for Airport.
class Airport
  include Gorillib::Model::LoadFromTsv
  # Rows may carry anywhere from 10 to 20 fields.
  tsv_options.merge!(num_fields: 10..20)

  # Read `filename` with load_tsv and hand each record to the caller's block.
  #
  # @param filename the TSV to read, passed straight to load_tsv
  # @yield [Airport] each loaded record
  def self.load_airports(filename)
    load_tsv(filename) do |loaded|
      yield(loaded)
    end
  end

end
81
+ ### @export "nil"
82
+
83
+ #
84
+ # As of January 2012, the OpenFlights Airports Database contains 6977 airports
85
+ # [spanning the globe](http://openflights.org/demo/openflights-apdb-2048.png).
86
+ # If you enjoy this data, please consider [visiting their page and
87
+ # donating](http://openflights.org/data.html)
88
+ #
89
+ # > Note: Rules for daylight savings time change from year to year and from
90
+ # > country to country. The current data is an approximation for 2009, built on
91
+ # > a country level. Most airports in DST-less regions in countries that
92
+ # > generally observe DST (eg. AL, HI in the USA, NT, QL in Australia, parts of
93
+ # > Canada) are marked incorrectly.
94
+ #
95
+ # Sample entries
96
+ #
97
+ # 507,"Heathrow","London","United Kingdom","LHR","EGLL",51.4775,-0.461389,83,0,"E"
98
+ # 26,"Kugaaruk","Pelly Bay","Canada","YBB","CYBB",68.534444,-89.808056,56,-6,"A"
99
+ # 3127,"Pokhara","Pokhara","Nepal","PKR","VNPK",28.200881,83.982056,2712,5.75,"N"
100
+ #
101
+
102
+ ### @export "raw_openflight_airport"
103
+
104
# Clean-up shared by the raw airport record classes: each receive_* method
# wraps the including class's own receiver (via super) and normalizes the value.
module RawAirport
  # Country/territory names mapped to the lowercase code stored on the record.
  COUNTRIES = { 'Puerto Rico' => 'us', 'Canada' => 'ca', 'USA' => 'us', 'United States' => 'us',
    'Northern Mariana Islands' => 'us', 'N Mariana Islands' => 'us',
    'Federated States of Micronesia' => 'fm',
    'Thailand' => 'th', 'Palau' => 'pw',
    'American Samoa' => 'as', 'Wake Island' => 'us', 'Virgin Islands' => 'vi', 'Guam' => 'gu'
  }
  # Placeholder strings that stand for "no value" in the raw data.
  BLANKISH_STRINGS = ["", nil, "NULL", '\\N', "NONE", "NA", "Null", "..."]
  # Matches a character outside the accepted set (the class is negated).
  OK_CHARS_RE = /[^a-zA-Z0-9\:\ \/\.\,\-\(\)\'ÁÂÄÅÇÉÍÎÑÓÖØÚÜÞàáâãäåæçèéêëìíîïðñóôõöøúüýĀāăĆćČčēėęěğīİıŁłńņňŌōőřŞşŠšţťūůųźŽžơț]/

  # Store the city, then strip surrounding whitespace and remove backslashes
  # from the stored string in place.
  def receive_city(val)
    received = super
    return received unless received
    received.strip!
    received.gsub!(/\\+/, '')
    received
  end

  # Store the country, swapping a name listed in COUNTRIES for its code.
  def receive_country(val)
    mapped = COUNTRIES[val]
    super(mapped || val)
  end

  # Store the name, then tidy the stored string in place: trim it, remove
  # backslashes and [military]/[private] tags, abbreviate "International" and
  # "Intercontinental", expand "Airpt", and drop a trailing " Airport".
  def receive_name(val)
    received = super
    return received unless received
    received.strip!
    # The substitutions are order-dependent ("Airpt" must be expanded before
    # the trailing " Airport" is removed).
    [
      [/\\+/,                          ''],
      [/\s*\[(military|private)\]/,    ''],
      [/\b(Int\'l|International)\b/,   'Intl'],
      [/\b(Intercontinental)\b/,       'Intcntl'],
      [/\b(Airpt)\b/,                  'Airport'],
      [/ Airport$/,                    ''],
    ].each do |pattern, replacement|
      received.gsub!(pattern, replacement)
    end
    received
  end
end
136
+
137
+ #
138
# One row of the OpenFlights airports CSV, before conversion to an Airport.
#
# The raw file has a single combined IATA/FAA column; #iata and #faa split it
# apart using the ICAO code and country as hints.
class RawOpenflightAirport
  include Gorillib::Model
  include Gorillib::Model::LoadFromCsv
  include RawAirport
  #
  field :airport_ofid, String, doc: "Unique OpenFlights identifier for this airport."
  field :name,         String, doc: "Name of airport. May or may not contain the City name."
  field :city,         String, blankish: BLANKISH_STRINGS, doc: "Main city served by airport. May be spelled differently from Name."
  field :country,      String, doc: "Country or territory where airport is located."
  field :iata_faa,     String, blankish: BLANKISH_STRINGS, doc: "3-letter FAA code, for airports located in the USA. For all other airports, 3-letter IATA code, or blank if not assigned."
  field :icao,         String, blankish: BLANKISH_STRINGS, doc: "4-letter ICAO code; Blank if not assigned."
  field :latitude,     Float,  doc: "Decimal degrees, usually to six significant digits. Negative is South, positive is North."
  field :longitude,    Float,  doc: "Decimal degrees, usually to six significant digits. Negative is West, positive is East."
  field :altitude_ft,  Float,  blankish: ['', nil, 0, '0'], doc: "In feet."
  field :utc_offset,   Float,  doc: "Hours offset from UTC. Fractional hours are expressed as decimals, eg. India is 5.5."
  field :dst_rule,     String, doc: "Daylight savings time rule. One of E (Europe), A (US/Canada), S (South America), O (Australia), Z (New Zealand), N (None) or U (Unknown). See the readme for more."

  # Values of the combined column that are never reported as an IATA code.
  UNRELIABLE_OPENFLIGHTS_IATA_VALUES = /^(7AK|AGA|AUQ|BDJ|BGW|BME|BPM|BXH|BZY|CAT|CEE|CEJ|CFS|CGU|CIO|CLV|CNN|DEE|DIB|DNM|DUH|DUR|FKI|GES|GSM|HKV|HOJ|HYD|IEO|IFN|IKA|IZA|JCU|JGS|KMW|KNC|LGQ|LUM|MCU|MCY|MDO|MOH|MON|MPH|MVF|NAY|NMA|NOE|NQY|OTU|OUI|PBV|PCA|PCB|PGK|PHO|PIF|PKN|PKY|PMK|PTG|PZO|QAS|QKT|QVY|RCM|RJL|RTG|SBG|SDZ|SFG|SIC|SIQ|SJI|SRI|STP|STU|SWQ|TJQ|TJS|TMC|TYA|UKC|VIY|VQS|VTS|WDH|WKM|WPR|WPU|ZQF)$/

  # Truthy when the combined column should be read as an FAA id: the ICAO
  # code starts with "K", or there is no ICAO code and the country is 'us'.
  def id_is_faa?
    k_prefix_at = (icao =~ /^(?:K)/)
    return k_prefix_at if k_prefix_at
    icao.blank? && country == 'us'
  end

  # The combined column read as an IATA code; nil when the value is listed as
  # unreliable or when it is really an FAA id.
  def iata
    return nil if iata_faa =~ UNRELIABLE_OPENFLIGHTS_IATA_VALUES
    id_is_faa? ? nil : iata_faa
  end

  # The combined column read as an FAA id; nil when it is not one.
  def faa
    return nil unless id_is_faa?
    iata_faa
  end

  # Altitude converted from feet to meters, rounded to one decimal place;
  # passes a missing altitude_ft through unchanged.
  def altitude
    feet = altitude_ft
    return feet unless feet
    (0.3048 * feet).round(1)
  end

  # Replace a country name with the id found by Geo::CountryNameLookup.
  # Names with no match are printed (via `p`) and stored as given.
  def receive_country(val)
    found = Geo::CountryNameLookup.for_alt_name(val, nil)
    p val unless found
    resolved = found ? found.country_id : val
    super(resolved)
  end

  # @return [Airport] an Airport built from this row, with altitude in meters
  #   and the combined id column split into :iata / :faa
  def to_airport
    attrs = self.compact_attributes.except(:altitude_ft)
    attrs[:altitude] = altitude
    iata_code = iata
    attrs[:iata] = iata_code unless iata_code.to_s =~ UNRELIABLE_OPENFLIGHTS_IATA_VALUES
    attrs[:faa] = faa
    Airport.receive(attrs)
  end

  # Read `filename` with load_csv and yield each row converted to an Airport.
  #
  # @param filename the CSV to read, passed straight to load_csv
  # @yield [Airport] one converted record per raw row
  def self.load_airports(filename)
    load_csv(filename) do |raw_airport|
      yield(raw_airport.to_airport)
    end
  end
end
185
+
186
+ ### @export "raw_dataexpo_airport"
187
# One row of the Data Expo airports CSV, before conversion to an Airport.
# The file's header row is popped (`pop_headers: true`) when loading.
class RawDataexpoAirport
  include Gorillib::Model
  include Gorillib::Model::LoadFromCsv
  include RawAirport
  self.csv_options = self.csv_options.merge(pop_headers: true)

  field :faa,       String, doc: "the international airport abbreviation code"
  field :name,      String, doc: "Airport name"
  field :city,      String, blankish: ["NA"], doc: "city in which the airport is located"
  field :state,     String, blankish: ["NA"], doc: "state in which the airport is located"
  field :country,   String, doc: "country in which airport is located"
  field :latitude,  Float,  doc: "latitude of the airport"
  field :longitude, Float,  doc: "longitude of the airport"

  # Build an Airport from this row, deriving the ICAO code as "K" + FAA id
  # for 'us' airports outside the listed states/territories.
  #
  # The FAA id must be exactly three capital letters. The pattern used to be
  # unanchored, so any id merely containing three capitals in a row qualified
  # and could yield an ICAO code longer than four characters.
  #
  # @return [Airport]
  def to_airport
    attrs = self.compact_attributes
    attrs[:icao] = "K#{faa}" if faa =~ /\A[A-Z]{3}\z/ && !['PR', 'AK', 'CQ', 'HI', 'AS', 'GU', 'VI'].include?(state) && (country == 'us')
    Airport.receive(attrs)
  end

  # Read `filename` with load_csv and yield each row converted to an Airport.
  #
  # @param filename the CSV to read, passed straight to load_csv
  # @yield [Airport] one converted record per raw row
  def self.load_airports(filename)
    load_csv(filename){|raw_airport| yield(raw_airport.to_airport) }
  end
end
211
+ ### @export "nil"
@@ -0,0 +1,129 @@
1
# Identifier reconciliation for Airport: loads id mappings from the Wikipedia
# ICAO and IATA tables and reports records whose identifiers disagree.
#
# Relies on an ID_MAPPINGS constant (a hash of hashes keyed by :icao / :iata /
# :faa, then by identifier) and on `load_csv`, neither of which is defined in
# this file.
# NOTE(review): the inputs are .tsv files yet are read with load_csv -- confirm
# that is the intended loader.
class Airport

  # [Hash] all options passed to the field not recognized by one of its own current fields
  attr_reader :_extra_attributes

  # # Airports whose IATA and FAA codes differ; all are in the US, so their ICAO is "K"+the FAA id
  # FAA_ICAO_FIXUP = {
  #   "GRM" => "CKC", "CLD" => "CRQ", "SDX" => "SEZ", "AZA" => "IWA", "SCE" => "UNV", "BLD" => "BVU",
  #   "LKE" => "W55", "HSH" => "HND", "BKG" => "BBG", "UST" => "SGJ", "LYU" => "ELO", "WFK" => "FVE",
  #   "FRD" => "FHR", "ESD" => "ORS", "RKH" => "UZA", "NZC" => "VQQ", "SCF" => "SDL", "JCI" => "IXD",
  #   "AVW" => "AVQ", "UTM" => "UTA", "ONP" => "NOP", }
  #
  # [:iata, :icao, :latitude, :longitude, :country, :city, :name].each do |attr|
  #   define_method("of_#{attr}"){ @_extra_attributes[:"of_#{attr}"] }
  #   define_method("de_#{attr}"){ @_extra_attributes[:"de_#{attr}"] }
  # end
  #
  # def lint_differences
  #   errors = {}
  #   return errors unless de_name.present? && of_name.present?
  #   [
  #     [:iata, of_iata, de_iata], [:icao, of_icao, de_icao], [:country, of_country, de_country],
  #     [:city, of_city, de_city],
  #     [:name, of_name, de_name],
  #   ].each{|attr, of, de| next unless of && de ; errors[attr] = [of, de] if of != de }
  #
  #   if (of_latitude && of_longitude && de_latitude && de_longitude)
  #     lat_diff = (of_latitude  - de_latitude ).abs
  #     lng_diff = (of_longitude - de_longitude).abs
  #     unless (lat_diff < 0.015) && (lng_diff < 0.015)
  #       msg = [of_latitude, de_latitude, of_longitude, de_longitude, lat_diff, lng_diff].map{|val| "%9.4f" % val }.join(" ")
  #       errors["distance"] = ([msg, of_city, de_city, of_name, de_name])
  #     end
  #   end
  #
  #   errors
  # end
  #
  # AIRPORTS = Hash.new # unless defined?(AIRPORTS)
  # def self.load(of_filename, de_filename)
  #   RawOpenflightAirport.load_csv(of_filename) do |raw_airport|
  #     airport = raw_airport.to_airport
  #     AIRPORTS[airport.iata_icao] = airport
  #   end
  #   RawDataexpoAirport.load_csv(de_filename) do |raw_airport|
  #     airport = (AIRPORTS[raw_airport.iata_icao] ||= self.new)
  #     if airport.de_name
  #       warn "duplicate data for #{[iata, de_iata, icao, de_icao]}: #{raw_airport.to_tsv} #{airport.to_tsv}"
  #     end
  #     airport.receive!(raw_airport.airport_attrs)
  #   end
  #   AIRPORTS
  # end

  # Load the Wikipedia ICAO table and then the IATA table from `dirname`,
  # registering every record in ID_MAPPINGS.
  #
  # The original definition was missing its closing `end`, which left the
  # whole file unparseable; the two identical per-file loops now share one
  # helper.
  #
  # @param dirname [String] directory holding wikipedia_icao.tsv and wikipedia_iata.tsv
  def self.load(dirname)
    load_csv(File.join(dirname, 'wikipedia_icao.tsv')) { |id_mapping| register_id_mapping(id_mapping) }
    load_csv(File.join(dirname, 'wikipedia_iata.tsv')) { |id_mapping| register_id_mapping(id_mapping) }
  end

  # Index one record under each of its identifiers. The first record seen for
  # an identifier wins; a later record sharing that identifier is only compared
  # against it, and any disagreement is printed as a tab-separated line.
  def self.register_id_mapping(id_mapping)
    [:icao, :iata, :faa].each do |attr|
      val = id_mapping.read_attribute(attr) or next
      next if (val == '.') || (val == '_')   # placeholder ids are not indexed
      if (that = ID_MAPPINGS[attr][val])
        lint = that.disagreements(id_mapping)
        next if lint.empty?
        puts [attr, val, "%-25s" % lint.inspect, id_mapping, that, "%-60s" % id_mapping.name, "%-25s" % that.name].join("\t")
      else
        ID_MAPPINGS[attr][val] = id_mapping
      end
    end
  end
  private_class_method :register_id_mapping

  # def adopt_field(that, attr)
  #   this_val = self.read_attribute(attr)
  #   that_val = that.read_attribute(attr)
  #   if name =~ /Bogus|Austin/i
  #     puts [attr, this_val, that_val, attribute_set?(attr), that.attribute_set?(attr), to_tsv, that.to_tsv].join("\t")
  #   end
  #   if this_val && that_val
  #     if (this_val != that_val) then warn [attr, this_val, that_val, name].join("\t") ; end
  #   elsif that_val
  #     write_attribute(that_val)
  #   end
  # end

  # The first three attribute values, tab-separated.
  def to_s
    attributes.values[0..2].join("\t")
  end

  # Compare this record's identifiers with another's.
  #
  # An identifier is only compared when both records have it and the other
  # record's value is not a '.' or '_' placeholder.
  #
  # @param that a record responding to read_attribute
  # @return [Hash] attr => [this value, that value] for each differing identifier
  def disagreements(that)
    [:icao, :iata, :faa].each_with_object({}) do |attr, errors|
      this_val = self.read_attribute(attr) or next
      that_val = that.read_attribute(attr) or next
      next if that_val == '.' || that_val == '_'
      errors[attr] = [this_val, that_val] if this_val != that_val
    end
  end

  # NOTE(review): `ids` is never used; `icao`, `iata` and `faa` are resolved
  # on the class itself here, which this file does not define. Presumably the
  # values were meant to come from `ids` -- confirm its shape before changing.
  def self.dump_ids(ids)
    "%s\t%s\t%s" % [icao, iata, faa]
  end

  # One padded string per identifier kind, listing every indexed id together
  # with the icao|iata|faa of the record it maps to, ordered by id.
  #
  # Reads ID_MAPPINGS, the same table .load fills (this method previously
  # referenced ID_MAP, a name used nowhere else in the file).
  #
  # @return [Array<String>] three strings, for :icao, :iata and :faa
  def self.dump_mapping
    [:icao, :iata, :faa].map do |attr|
      "%-50s" % ID_MAPPINGS[attr].to_a.sort.map{|id, val| "#{id}:#{val.icao||' '}|#{val.iata||' '}|#{val.faa||' '}"}.join(";")
    end
  end

  # Print one " | "-separated diagnostic line: the kind, the ids involved,
  # up to three existing matches, any extra args, and the current mapping.
  def self.dump_info(kind, ids, reconciler, existing, *args)
    # Pad with blank cells so there are always exactly three "existing" columns.
    ex_str = [existing.map{|el| dump_ids(el.ids) }, "\t\t","\t\t","\t\t"].flatten[0..2]
    puts [kind, dump_ids(ids), dump_ids(reconciler.ids), ex_str, *args, dump_mapping.join("//") ].flatten.join("\t| ")
  end
end