ul-wukong 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +60 -0
  3. data/.gitmodules +6 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +19 -0
  6. data/.yardopts +6 -0
  7. data/CHANGELOG.md +7 -0
  8. data/Gemfile +17 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE.md +95 -0
  11. data/NOTES-travis.md +31 -0
  12. data/README-old.md +422 -0
  13. data/README.md +1308 -0
  14. data/Rakefile +28 -0
  15. data/TODO.md +99 -0
  16. data/bin/cutc +30 -0
  17. data/bin/cuttab +5 -0
  18. data/bin/greptrue +6 -0
  19. data/bin/md5sort +20 -0
  20. data/bin/setcat +11 -0
  21. data/bin/tabchar +5 -0
  22. data/bin/uniq-ord +59 -0
  23. data/bin/uniqc +3 -0
  24. data/bin/wu +34 -0
  25. data/bin/wu-clean-encoding +31 -0
  26. data/bin/wu-date +13 -0
  27. data/bin/wu-datetime +13 -0
  28. data/bin/wu-hist +3 -0
  29. data/bin/wu-lign +186 -0
  30. data/bin/wu-local +4 -0
  31. data/bin/wu-plus +9 -0
  32. data/bin/wu-source +5 -0
  33. data/bin/wu-sum +31 -0
  34. data/diagrams/wu_local.dot +39 -0
  35. data/diagrams/wu_local.dot.png +0 -0
  36. data/examples/Gemfile +38 -0
  37. data/examples/README.md +9 -0
  38. data/examples/basic/string_reverser.rb +23 -0
  39. data/examples/basic/tiny_count.rb +8 -0
  40. data/examples/basic/word_count/accumulator.rb +26 -0
  41. data/examples/basic/word_count/tokenizer.rb +13 -0
  42. data/examples/basic/word_count/word_count.rb +6 -0
  43. data/examples/dataflow/scraper_macro_flow.rb +28 -0
  44. data/examples/deploy_pack/Gemfile +6 -0
  45. data/examples/deploy_pack/README.md +6 -0
  46. data/examples/deploy_pack/a/b/c/.gitkeep +0 -0
  47. data/examples/deploy_pack/app/processors/string_reverser.rb +5 -0
  48. data/examples/deploy_pack/config/environment.rb +1 -0
  49. data/examples/dsl/dataflow/fibonacci_series.rb +101 -0
  50. data/examples/dsl/dataflow/scraper_macro_flow.rb +28 -0
  51. data/examples/dsl/dataflow/simple.rb +12 -0
  52. data/examples/dsl/dataflow/telegram.rb +45 -0
  53. data/examples/dsl/workflow/cherry_pie.dot +97 -0
  54. data/examples/dsl/workflow/cherry_pie.md +104 -0
  55. data/examples/dsl/workflow/cherry_pie.png +0 -0
  56. data/examples/dsl/workflow/cherry_pie.rb +101 -0
  57. data/examples/empty/.gitkeep +0 -0
  58. data/examples/examples_helper.rb +9 -0
  59. data/examples/geo.rb +4 -0
  60. data/examples/geo/geo_grids.numbers +0 -0
  61. data/examples/geo/geolocated.rb +331 -0
  62. data/examples/geo/quadtile.rb +69 -0
  63. data/examples/geo/spec/geolocated_spec.rb +247 -0
  64. data/examples/geo/tile_fetcher.rb +77 -0
  65. data/examples/graph/implied_geolocation/README.md +63 -0
  66. data/examples/graph/minimum_spanning_tree/airfares_graphviz.rb +73 -0
  67. data/examples/improver/tweet_summary.rb +73 -0
  68. data/examples/loadable.rb +2 -0
  69. data/examples/munging/airline_flights/airline_flights.rake +83 -0
  70. data/examples/munging/airline_flights/airplane.rb +0 -0
  71. data/examples/munging/airline_flights/airport_id_unification.rb +129 -0
  72. data/examples/munging/airline_flights/airport_ok_chars.rb +4 -0
  73. data/examples/munging/airline_flights/indexable.rb +75 -0
  74. data/examples/munging/airline_flights/indexable_spec.rb +90 -0
  75. data/examples/munging/airline_flights/reconcile_airports.rb +142 -0
  76. data/examples/munging/airline_flights/tasks.rake +83 -0
  77. data/examples/munging/airline_flights/topcities.rb +167 -0
  78. data/examples/munging/geo/geo_json.rb +54 -0
  79. data/examples/munging/geo/geo_models.rb +69 -0
  80. data/examples/munging/geo/geonames_models.rb +107 -0
  81. data/examples/munging/geo/iso_codes.rb +172 -0
  82. data/examples/munging/geo/reconcile_countries.rb +124 -0
  83. data/examples/munging/geo/tasks.rake +71 -0
  84. data/examples/munging/wikipedia/articles/extract_articles-parsed.rb +79 -0
  85. data/examples/munging/wikipedia/articles/extract_articles-templated.rb +136 -0
  86. data/examples/munging/wikipedia/articles/textualize_articles.rb +54 -0
  87. data/examples/munging/wikipedia/articles/verify_structure.rb +43 -0
  88. data/examples/munging/wikipedia/articles/wp2txt-LICENSE.txt +22 -0
  89. data/examples/munging/wikipedia/articles/wp2txt_article.rb +259 -0
  90. data/examples/munging/wikipedia/articles/wp2txt_utils.rb +452 -0
  91. data/examples/munging/wikipedia/dbpedia/dbpedia_common.rb +5 -0
  92. data/examples/munging/wikipedia/dbpedia/dbpedia_extract_geocoordinates.rb +78 -0
  93. data/examples/munging/wikipedia/dbpedia/extract_links-cruft.rb +66 -0
  94. data/examples/munging/wikipedia/dbpedia/extract_links.rb +260 -0
  95. data/examples/munging/wikipedia/dbpedia/sameas_extractor.rb +20 -0
  96. data/examples/rake_helper.rb +97 -0
  97. data/examples/ruby_project/Gemfile +6 -0
  98. data/examples/ruby_project/README.md +6 -0
  99. data/examples/ruby_project/a/b/c/.gitkeep +0 -0
  100. data/examples/server_logs/geo_ip_mapping/munge_geolite.rb +82 -0
  101. data/examples/server_logs/logline.rb +95 -0
  102. data/examples/server_logs/models.rb +66 -0
  103. data/examples/server_logs/page_counts.pig +48 -0
  104. data/examples/server_logs/server_logs-01-parse-script.rb +13 -0
  105. data/examples/server_logs/server_logs-02-histograms-full.rb +33 -0
  106. data/examples/server_logs/server_logs-02-histograms-mapper.rb +14 -0
  107. data/examples/server_logs/server_logs-03-breadcrumbs-full.rb +71 -0
  108. data/examples/server_logs/server_logs-04-page_page_edges-full.rb +40 -0
  109. data/examples/serverlogs/geo_ip_mapping/munge_geolite.rb +82 -0
  110. data/examples/serverlogs/models/logline.rb +102 -0
  111. data/examples/serverlogs/parser/apache_parser_widget.rb +46 -0
  112. data/examples/serverlogs/visit_paths/common.rb +4 -0
  113. data/examples/serverlogs/visit_paths/page_counts.pig +48 -0
  114. data/examples/serverlogs/visit_paths/serverlogs-01-parse-script.rb +11 -0
  115. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-full.rb +31 -0
  116. data/examples/serverlogs/visit_paths/serverlogs-02-histograms-mapper.rb +12 -0
  117. data/examples/serverlogs/visit_paths/serverlogs-03-breadcrumbs-full.rb +67 -0
  118. data/examples/serverlogs/visit_paths/serverlogs-04-page_page_edges-full.rb +38 -0
  119. data/examples/splitter.rb +94 -0
  120. data/examples/string_reverser.rb +7 -0
  121. data/examples/text/pig_latin/pig_latinizer.rb +35 -0
  122. data/examples/text/pig_latin/pig_latinizer_widget.rb +16 -0
  123. data/examples/text/regional_flavor/README.md +14 -0
  124. data/examples/text/regional_flavor/article_wordbags.pig +39 -0
  125. data/examples/text/regional_flavor/j01-article_wordbags.rb +4 -0
  126. data/examples/text/regional_flavor/simple_pig_script.pig +27 -0
  127. data/examples/twitter.rb +5 -0
  128. data/lib/hanuman.rb +36 -0
  129. data/lib/hanuman/graph.rb +97 -0
  130. data/lib/hanuman/graphvizzer.rb +206 -0
  131. data/lib/hanuman/graphvizzer/gv_models.rb +161 -0
  132. data/lib/hanuman/graphvizzer/gv_presenter.rb +97 -0
  133. data/lib/hanuman/link.rb +35 -0
  134. data/lib/hanuman/registry.rb +46 -0
  135. data/lib/hanuman/stage.rb +128 -0
  136. data/lib/hanuman/tree.rb +67 -0
  137. data/lib/wu/geo.rb +4 -0
  138. data/lib/wu/geo/geo_grids.numbers +0 -0
  139. data/lib/wu/geo/geolocated.rb +331 -0
  140. data/lib/wu/geo/quadtile.rb +69 -0
  141. data/lib/wu/graph/union_find.rb +62 -0
  142. data/lib/wu/model/reconcilable.rb +63 -0
  143. data/lib/wu/munging.rb +71 -0
  144. data/lib/wu/social/models/twitter.rb +31 -0
  145. data/lib/wu/wikipedia/models.rb +20 -0
  146. data/lib/wukong.rb +54 -0
  147. data/lib/wukong/dataflow.rb +43 -0
  148. data/lib/wukong/doc_helpers.rb +14 -0
  149. data/lib/wukong/doc_helpers/dataflow_handler.rb +29 -0
  150. data/lib/wukong/doc_helpers/field_handler.rb +91 -0
  151. data/lib/wukong/doc_helpers/processor_handler.rb +29 -0
  152. data/lib/wukong/driver.rb +214 -0
  153. data/lib/wukong/driver/event_machine_driver.rb +15 -0
  154. data/lib/wukong/driver/wiring.rb +68 -0
  155. data/lib/wukong/local.rb +42 -0
  156. data/lib/wukong/local/runner.rb +96 -0
  157. data/lib/wukong/local/stdio_driver.rb +104 -0
  158. data/lib/wukong/logger.rb +102 -0
  159. data/lib/wukong/model/faker.rb +136 -0
  160. data/lib/wukong/model/flatpack_parser/flat.rb +60 -0
  161. data/lib/wukong/model/flatpack_parser/flatpack.rb +4 -0
  162. data/lib/wukong/model/flatpack_parser/lang.rb +46 -0
  163. data/lib/wukong/model/flatpack_parser/parser.rb +55 -0
  164. data/lib/wukong/model/flatpack_parser/tokens.rb +130 -0
  165. data/lib/wukong/plugin.rb +48 -0
  166. data/lib/wukong/processor.rb +110 -0
  167. data/lib/wukong/rake_helper.rb +6 -0
  168. data/lib/wukong/runner.rb +169 -0
  169. data/lib/wukong/runner/boot_sequence.rb +123 -0
  170. data/lib/wukong/runner/code_loader.rb +52 -0
  171. data/lib/wukong/runner/command_runner.rb +44 -0
  172. data/lib/wukong/runner/deploy_pack_loader.rb +75 -0
  173. data/lib/wukong/runner/help_message.rb +42 -0
  174. data/lib/wukong/source.rb +33 -0
  175. data/lib/wukong/source/source_driver.rb +74 -0
  176. data/lib/wukong/source/source_runner.rb +38 -0
  177. data/lib/wukong/spec_helpers.rb +74 -0
  178. data/lib/wukong/spec_helpers/integration_tests.rb +150 -0
  179. data/lib/wukong/spec_helpers/integration_tests/integration_test_matchers.rb +207 -0
  180. data/lib/wukong/spec_helpers/integration_tests/integration_test_runner.rb +97 -0
  181. data/lib/wukong/spec_helpers/shared_examples.rb +22 -0
  182. data/lib/wukong/spec_helpers/unit_tests.rb +135 -0
  183. data/lib/wukong/spec_helpers/unit_tests/unit_test_driver.rb +132 -0
  184. data/lib/wukong/spec_helpers/unit_tests/unit_test_matchers.rb +169 -0
  185. data/lib/wukong/spec_helpers/unit_tests/unit_test_runner.rb +60 -0
  186. data/lib/wukong/version.rb +3 -0
  187. data/lib/wukong/widget/echo.rb +55 -0
  188. data/lib/wukong/widget/extract.rb +122 -0
  189. data/lib/wukong/widget/filters.rb +452 -0
  190. data/lib/wukong/widget/logger.rb +56 -0
  191. data/lib/wukong/widget/operators.rb +82 -0
  192. data/lib/wukong/widget/reducers.rb +10 -0
  193. data/lib/wukong/widget/reducers/accumulator.rb +73 -0
  194. data/lib/wukong/widget/reducers/bin.rb +368 -0
  195. data/lib/wukong/widget/reducers/count.rb +73 -0
  196. data/lib/wukong/widget/reducers/group.rb +128 -0
  197. data/lib/wukong/widget/reducers/group_concat.rb +98 -0
  198. data/lib/wukong/widget/reducers/improver.rb +71 -0
  199. data/lib/wukong/widget/reducers/join_xml.rb +37 -0
  200. data/lib/wukong/widget/reducers/moments.rb +72 -0
  201. data/lib/wukong/widget/reducers/sort.rb +180 -0
  202. data/lib/wukong/widget/reducers/uniq.rb +91 -0
  203. data/lib/wukong/widget/serializers.rb +317 -0
  204. data/lib/wukong/widget/utils.rb +46 -0
  205. data/lib/wukong/widgets.rb +7 -0
  206. data/spec/examples/dataflow/fibonacci_series_spec.rb +18 -0
  207. data/spec/examples/dataflow/parse_apache_logs_spec.rb +8 -0
  208. data/spec/examples/dataflow/parsing_spec.rb +14 -0
  209. data/spec/examples/dataflow/simple_spec.rb +34 -0
  210. data/spec/examples/dataflow/telegram_spec.rb +43 -0
  211. data/spec/examples/graph/minimum_spanning_tree_spec.rb +34 -0
  212. data/spec/examples/munging/airline_flights/identifiers_spec.rb +16 -0
  213. data/spec/examples/munging/airline_flights_spec.rb +202 -0
  214. data/spec/examples/text/pig_latin_spec.rb +18 -0
  215. data/spec/examples/workflow/cherry_pie_spec.rb +36 -0
  216. data/spec/hanuman/graph_spec.rb +119 -0
  217. data/spec/hanuman/hanuman_spec.rb +10 -0
  218. data/spec/hanuman/registry_spec.rb +123 -0
  219. data/spec/hanuman/stage_spec.rb +81 -0
  220. data/spec/hanuman/tree_spec.rb +119 -0
  221. data/spec/spec.opts +1 -0
  222. data/spec/spec_helper.rb +43 -0
  223. data/spec/support/example_test_helpers.rb +95 -0
  224. data/spec/support/hanuman_test_helpers.rb +92 -0
  225. data/spec/support/integration_helper.rb +38 -0
  226. data/spec/support/model_test_helpers.rb +115 -0
  227. data/spec/support/shared_context_for_graphs.rb +57 -0
  228. data/spec/support/shared_context_for_reducers.rb +37 -0
  229. data/spec/support/shared_examples_for_builders.rb +94 -0
  230. data/spec/support/shared_examples_for_shortcuts.rb +57 -0
  231. data/spec/wu/model/reconcilable_spec.rb +152 -0
  232. data/spec/wukong/dataflow_spec.rb +87 -0
  233. data/spec/wukong/driver_spec.rb +154 -0
  234. data/spec/wukong/local/runner_spec.rb +29 -0
  235. data/spec/wukong/local/stdio_driver_spec.rb +73 -0
  236. data/spec/wukong/local_spec.rb +6 -0
  237. data/spec/wukong/logger_spec.rb +49 -0
  238. data/spec/wukong/model/faker_spec.rb +132 -0
  239. data/spec/wukong/processor_spec.rb +21 -0
  240. data/spec/wukong/runner_spec.rb +132 -0
  241. data/spec/wukong/source_spec.rb +6 -0
  242. data/spec/wukong/widget/extract_spec.rb +101 -0
  243. data/spec/wukong/widget/filters_spec.rb +79 -0
  244. data/spec/wukong/widget/logger_spec.rb +23 -0
  245. data/spec/wukong/widget/operators_spec.rb +25 -0
  246. data/spec/wukong/widget/reducers/bin_spec.rb +92 -0
  247. data/spec/wukong/widget/reducers/count_spec.rb +11 -0
  248. data/spec/wukong/widget/reducers/group_spec.rb +21 -0
  249. data/spec/wukong/widget/reducers/join_xml_spec.rb +25 -0
  250. data/spec/wukong/widget/reducers/moments_spec.rb +36 -0
  251. data/spec/wukong/widget/reducers/sort_spec.rb +26 -0
  252. data/spec/wukong/widget/reducers/uniq_spec.rb +14 -0
  253. data/spec/wukong/widget/serializers_spec.rb +114 -0
  254. data/spec/wukong/widget/sink_spec.rb +19 -0
  255. data/spec/wukong/widget/source_spec.rb +65 -0
  256. data/spec/wukong/wu-local_spec.rb +109 -0
  257. data/spec/wukong/wu-source_spec.rb +32 -0
  258. data/spec/wukong/wu_spec.rb +14 -0
  259. data/spec/wukong/wukong_spec.rb +10 -0
  260. data/wukong.gemspec +35 -0
  261. metadata +465 -0
@@ -0,0 +1,90 @@
1
+ require 'spec_helper'
2
+ require 'gorillib/model'
3
+ require 'gorillib/pathname'
4
+ #
5
+ require 'gorillib/model/serialization'
6
+ require 'gorillib/model/serialization/tsv'
7
+ require 'gorillib/array/hashify'
8
+ #
9
+ require 'wu/model/indexable'
10
+
11
# Exercises Gorillib::Model::Indexable through a throwaway CountryCode model:
# loading and memoizing .values, defining an index with .index_on, and the
# Hash#fetch-style lookup offered by the generated .for_<field> method.
describe Gorillib::Model::Indexable, :model_spec, :only do
  let(:mock_array){ mock('array') }

  # Redefines Gorillib::Test::CountryCode from scratch whenever it is first
  # requested in an example, so indexes and values set up by one example are
  # not visible to the next.
  let(:country_code_class) do
    module Gorillib::Test
      remove_const(:CountryCode) if defined?(CountryCode)

      class CountryCode
        include Gorillib::Model
        include Gorillib::Model::Indexable
        field :alpha_2_code, String, position: 0
        field :name,         String, position: 1
        def self.load
          self.values << new('dj', 'Djibouti')
          self.values << new('us', 'United States of America')
          values
        end
      end

    end
    Gorillib::Test::CountryCode
  end

  let(:djibouti){ country_code_class.new('dj', 'Djibouti') }
  let(:usa     ){ country_code_class.new('us', 'United States of America') }

  context 'test setup' do
    subject{ country_code_class.load }
    it{ should == [djibouti, usa] }
  end

  context '.values' do
    # before{ country_code_class.send(:remove_instance_variable, '@values') }
    it 'gets its values from .load' do
      country_code_class.should_receive(:load).once.and_return mock_array
      country_code_class.values.should equal(mock_array)
    end
    it 'memoizes once it is called' do
      country_code_class.should_receive(:load).once.and_return mock_array
      country_code_class.values.should equal(mock_array)
      country_code_class.values.should equal(mock_array)
    end
  end

  context '.index_on' do
    it 'defines a .for_foo method' do
      country_code_class.should_not respond_to(:for_name)
      country_code_class.index_on(:name)
      country_code_class.should respond_to(:for_name)
      country_code_class.protected_methods.should include(:name_index)
    end
  end

  context '.for_foo' do
    before{ country_code_class.index_on :name }
    context 'behaves like Hash#fetch:' do
      # This context covers the hit path; it was previously titled
      # 'when key is not present', duplicating the miss-path context below.
      context 'when key is present' do
        it 'retrieves a value if in the index' do
          country_code_class.for_name('Djibouti').should == djibouti
        end
      end
      context 'when key is not present' do
        it 'and no default it raises KeyError' do
          expect{ country_code_class.for_name('Yo Mama') }.to raise_error(KeyError, 'key not found: "Yo Mama"')
        end
        it 'returns default value if given' do
          yo_mama = country_code_class.for_name('Yo Mama', 'wears combat boots')
          yo_mama.should == 'wears combat boots'
        end
        it 'calls block if given' do
          she = nil
          so_fat = country_code_class.for_name('Yo Mama'){ she = 'sits around the house' ; 'when she sits' }
          so_fat.should == 'when she sits'
          she.should    == 'sits around the house'
        end
      end
    end
  end

end
@@ -0,0 +1,142 @@
1
+ require_relative './models'
2
+ require 'gorillib/model/reconcilable'
3
+
4
class Airport
  include Gorillib::Model::Reconcilable
  attr_accessor :_origin # source of the record

  # Called when two records disagree on the value of +attr+.
  #
  # * :name, :city and :airport_ofid disagreements answer :pass
  # * :latitude / :longitude answer true when the values differ by less than 3
  # * :altitude answers true when the values differ by less than 5
  # * everything else is deferred to the inherited handler
  def conflicting_attribute!(attr, this_val, that_val)
    return :pass if [:name, :city, :airport_ofid].include?(attr)

    tolerance =
      case attr
      when :latitude, :longitude then 3
      when :altitude             then 5
      end
    return true if tolerance && (this_val - that_val).abs < tolerance

    # zsuper: forwards attr, this_val and that_val unchanged
    super
  end

  # The icao, iata and faa values of this record, with nil entries removed.
  # NOTE(review): assumes Array#hashify yields an {attr => value} Hash -- confirm
  def ids
    id_attrs = [:icao, :iata, :faa]
    id_attrs.hashify{|id_attr| public_send(id_attr) }.compact
  end
end
21
+
22
+ #
23
+ # Loads the Airport identifier tables scraped from Wikipedia
24
+ #
25
class RawAirportIdentifier < Airport
  include RawAirport
  include Gorillib::Model::LoadFromTsv

  # Builds a record from one row of the identifier table: icao, iata, faa and
  # name, then an optional city. Any further columns are ignored, and the
  # attributes are passed through compact_blank before construction.
  def self.from_tuple(icao, iata, faa, name, city=nil, *_)
    attrs = { icao: icao, iata: iata, faa: faa, name: name, city: city }
    new(attrs.compact_blank)
  end

  # Reads +filename+ with load_tsv, accepting rows of 4 to 6 fields, and
  # forwards the given block to it.
  def self.load_airports(filename, &block)
    load_tsv(filename, num_fields: 4..6, &block)
  end
end
37
+
38
class Airport
  #
  # Reconciler for Airports
  #
  # Every raw airport record (dataexpo, openflights and the identifier tables
  # scraped from Wikipedia) is registered in turn. Records that share any
  # icao, iata or faa id end up as opinions of the same reconciler, and each
  # reconciler is finally folded into a single consensus Airport.
  #
  class IdReconciler
    include Gorillib::Model
    include Gorillib::Model::LoadFromCsv
    include Gorillib::Model::Reconcilable
    self.csv_options = { col_sep: "\t", num_fields: 3..6 }

    # Map the reconcilers to each ID they have anything to say about
    ID_MAP = { icao: {}, iata: {}, faa: {} }

    field :opinions, Array, default: Array.new, doc: "every record having an id in common with the other records in this field"

    # The distinct ids claimed by any of this reconciler's opinions, as
    # returned by each opinion's own #ids (converted with to_a).
    def ids
      opinions.flat_map{|op| op.ids.to_a }.uniq.compact
    end

    # Registers every raw airport source, then reconciles each distinct
    # reconciler found in ID_MAP and collects the results in @airports.
    def self.load_all
      Log.info "Loading all Airports and reconciling"
      @airports = Array.new
      RawDataexpoAirport  .load_airports(:dataexpo_raw_airports   ){|airport| register(:dataexpo,     airport) }
      RawOpenflightAirport.load_airports(:openflights_raw_airports){|airport| register(:openflights,  airport) }
      RawAirportIdentifier.load_airports(:wikipedia_icao          ){|airport| register(:wp_icao,      airport) }
      RawAirportIdentifier.load_airports(:wikipedia_iata          ){|airport| register(:wp_iata,      airport) }
      RawAirportIdentifier.load_airports(:wikipedia_us_abroad     ){|airport| register(:wp_us_abroad, airport) }

      # a reconciler is stored once per id it covers, hence the uniq
      recs = ID_MAP.map{|attr, hsh| hsh.sort.map(&:last) }.flatten.uniq
      recs.each do |rec|
        consensus = rec.reconcile
        # lint = consensus.lint
        # puts "%-79s\t%s" % [lint, consensus.to_s[0..100]] if lint.present?
        @airports << consensus
      end
    end

    # The consensus airports built by the last .load_all (nil before that).
    def self.airports
      @airports
    end

    # Consensus airports for each iata code listed in Airport::EXEMPLARS.
    def self.exemplars
      Airport::EXEMPLARS.map do |iata|
        ID_MAP[:iata][iata].reconcile
      end
    end

    # Folds all opinions into a fresh Airport and returns it, printing a
    # "confl" line when any adoption reported a conflict.
    def reconcile
      consensus = Airport.new
      # Adopt every opinion before judging the outcome: calling `all?` with
      # the adopting block directly stops at the first falsy result, which
      # would leave the remaining opinions out of the consensus.
      clean = opinions.map{|op| consensus.adopt(op) }.all?
      # puts "\t#{consensus.inspect}"
      puts "confl\t#{self.inspect}" if not clean
      consensus
    end

    # Merges another reconciler's opinions into this one, dropping duplicates.
    # NOTE(review): presumably invoked by Reconcilable#adopt for the
    # :opinions field -- confirm against that module.
    def adopt_opinions(vals, _)
      self.opinions = vals + self.opinions
      self.opinions.uniq!
    end

    # * find all existing reconcilers that share an ID with that record
    # * unify them into one reconciler
    # * store it back under all the IDs
    #
    # Suppose our dataset has 3 identifiers, which look like
    #
    #     a S
    #     S 88
    #     a Z
    #     b
    #     Q
    #     b Q 77
    #
    # We will wind up with these two reconcilers:
    #
    #     <a S 88 opinions: [a,S, ],[S, ,88],[a,Z, ]>
    #     <b Q 77 opinions: [b, , ],[ ,Q, ],[b,Q,77]>
    #
    def self.register(origin, obj)
      obj._origin = origin
      # get the existing reconcilers
      existing = obj.ids.map{|attr, id| ID_MAP[attr][id] }.compact.uniq
      # put a reconciler for the new object at the front and take it straight
      # back out: the new reconciler is always the one that survives
      existing.unshift(self.new(opinions: [obj]))
      reconciler = existing.shift
      # unite the pre-existing reconcilers into it
      existing.each{|that| reconciler.adopt(that) }
      # save the reconciler under each of the ids.
      reconciler.ids.each{|attr, id| ID_MAP[attr][id] = reconciler }
    end

    # Multi-line description: the class name and ids, then one line per
    # opinion showing its origin and the opinion itself.
    def inspect
      str = "#<#{self.class.name} #{ids}"
      opinions.each do |op|
        str << "\n\t  #{op._origin}\t#{op}"
      end
      str << ">"
    end
  end

end
@@ -0,0 +1,83 @@
1
+ require_relative('../../rake_helper')
2
+ require_relative('./models')
3
+
4
# Named locations used by the airline_flights tasks. Raw inputs live under
# :af_data and parsed outputs under :af_work; Settings[:mini_slug] is
# interpolated into some of the file names.
Pathname.register_paths(
  af_data:                  [:data, 'airline_flights'],
  af_work:                  [:work, 'airline_flights'],
  af_code:                  File.dirname(__FILE__),
  # raw inputs
  openflights_raw_airports: [:af_data, "openflights_airports-raw#{Settings[:mini_slug]}.csv"],
  openflights_raw_airlines: [:af_data, 'openflights_airlines-raw.csv'],
  dataexpo_raw_airports:    [:af_data, "dataexpo_airports-raw#{Settings[:mini_slug]}.csv"],
  wikipedia_icao:           [:af_data, 'wikipedia_icao.tsv'],
  wikipedia_iata:           [:af_data, 'wikipedia_iata.tsv'],
  wikipedia_us_abroad:      [:af_data, 'wikipedia_us_abroad.tsv'],
  # parsed outputs
  openflights_airports:     [:af_work, "openflights_airports-parsed#{Settings[:mini_slug]}.tsv"],
  openflights_airlines:     [:af_work, "openflights_airlines-parsed#{Settings[:mini_slug]}.tsv"],
  dataexpo_airports:        [:af_work, "dataexpo_airports-parsed#{Settings[:mini_slug]}.tsv"],
  airport_identifiers:      [:af_work, 'airport_identifiers.tsv'],
  airport_identifiers_mini: [:af_work, 'airport_identifiers-sample.tsv'],
  # helpers
  country_name_lookup:      [:work, 'geo', 'country_name_lookup.tsv'],
)
24
+
25
# Task chain for the airline_flights example. Only the openflights airport
# parse is currently active; the other steps are kept below, commented out.
chain :airline_flights do
  code_files = FileList[Pathname.of(:af_code, '*.rb').to_s]
  chain :parse do

    # desc 'parse the dataexpo airports'
    # create_file(:dataexpo_airports, after: code_files) do |dest|
    #   RawDataexpoAirport.load_airports(:dataexpo_raw_airports) do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end

    desc 'parse the openflights airports'
    create_file(:openflights_airports, after: [code_files, :force]) do |dest|
      # loaded here rather than at the top of the file, so the geo models are
      # only pulled in when this step actually runs
      require_relative '../geo/geo_models'
      Geo::CountryNameLookup.load
      RawOpenflightAirport.load_airports(:openflights_raw_airports) do |parsed|
        dest << parsed.to_tsv << "\n"
        # puts airport.country
      end
    end

    # task :reconcile_airports => [:dataexpo_airports, :openflights_airports] do
    #   require_relative 'reconcile_airports'
    #   Airport::IdReconciler.load_all
    # end
    #
    # desc 'run the identifier reconciler'
    # create_file(:airport_identifiers, after: code_files, invoke: 'airline_flights:parse:reconcile_airports') do |dest|
    #   Airport::IdReconciler.airports.each do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end
    #
    # desc 'run the identifier reconciler'
    # create_file(:airport_identifiers_mini, after: code_files, invoke: 'airline_flights:parse:reconcile_airports') do |dest|
    #   Airport::IdReconciler.exemplars.each do |airport|
    #     dest << airport.to_tsv << "\n"
    #   end
    # end
    #
    # desc 'parse the openflights airlines'
    # create_file(:openflights_airlines, after: code_files) do |dest|
    #   RawOpenflightAirline.load_airlines(:openflights_raw_airlines) do |airline|
    #     dest << airline.to_tsv << "\n"
    #     puts airline.to_tsv
    #   end
    # end

  end
end
75
+
76
# The default task depends on the 'airline_flights' task; the individual
# parse steps are listed for reference but disabled.
task default: [
  'airline_flights',
  # 'airline_flights:parse:dataexpo_airports',
  # 'airline_flights:parse:openflights_airports',
  # 'airline_flights:parse:airport_identifiers',
  # 'airline_flights:parse:airport_identifiers_mini',
  # 'airline_flights:parse:openflights_airlines',
]
@@ -0,0 +1,167 @@
1
+ #!/usr/bin/env ruby
2
+ require('rake')
3
+ require_relative('../../rake_helper')
4
+ require_relative './models'
5
+
6
# Named locations used by this script; :airport_identifiers is the TSV read
# by the matching pass below.
Pathname.register_paths(
  af_data:             [:data, 'airline_flights'],
  af_work:             [:work, 'airline_flights'],
  af_code:             File.dirname(__FILE__),
  airport_identifiers: [:af_work, 'airport_identifiers.tsv'],
)
12
+
13
+ AIRPORTS_TO_MATCH = [
14
+ [ 'Tokyo', 1, "HND", ],
15
+ [ 'Guangzhou', 2, "CAN", ],
16
+ [ 'Seoul', 3, "ICN", ],
17
+ [ 'Shanghai', 4, "PVG", ],
18
+ [ 'Mexico.*City', 5, "MEX", ],
19
+ [ 'Delhi', 6, "DEL", ],
20
+ [ 'New.*York', 7, "JFK", ],
21
+ [ 'S.*o.*Paulo', 8, "GRU", ],
22
+ [ 'Mumbai|Bombay', 9, "BOM", ],
23
+ [ 'Manila', 10, "MNL", ],
24
+ [ 'Jakarta', 11, "CGK", ],
25
+ [ 'Los.*Angeles', 12, "LAX", ],
26
+ [ 'Karachi', 13, "KHI", ],
27
+ [ 'Osaka', 14, "KIX", ],
28
+ [ 'Beijing', 15, "PEK", ],
29
+ [ 'Moscow', 16, "SVO", ],
30
+ [ 'Cairo', 17, "CAI", ],
31
+ [ 'Kolkata|Calcutta', 18, "CCU", ],
32
+ [ 'Buenos.*Aires', 19, "EZE", ],
33
+ [ 'Dhaka', 20, "DAC", ],
34
+ [ 'Bangkok', 21, "BKK", ],
35
+ [ 'Tehran|Abyek', 22, "IKA", ],
36
+ [ 'Istanbul', 23, "IST", ],
37
+ [ 'Janeiro', 24, "GIG", ],
38
+ [ 'London', 25, "LHR", ],
39
+ [ 'Lagos', 26, "LOS", ],
40
+ [ 'Paris', 27, "CDG", ],
41
+ [ 'Chicago', 28, "ORD", ],
42
+ [ 'Kinshasa', 29, "FIH", ],
43
+ [ 'Lima', 30, "LIM", ],
44
+ [ 'Wuhan', 31, "WUH", ],
45
+ [ 'Bangalore', 32, "BLR", ],
46
+ [ 'Bogot.*', 33, "BOG", ],
47
+ [ 'Taipei', 34, "TSA", ],
48
+ [ 'Washington|Arling', 35, "DCA", ],
49
+ [ 'Johannesburg', 36, "JNB", ],
50
+ [ 'Saigon|Ho.Chi.M', 37, "SGN", ],
51
+ [ 'San.*Francisco', 38, "SFO", ],
52
+ [ 'Boston', 39, "BOS", ],
53
+ [ 'Hong.*Kong', 40, "HKG", ],
54
+ [ 'Baghdad', 41, "SDA", ],
55
+ [ 'Madrid', 42, "MAD", ],
56
+ [ 'Singapore', 43, "SIN", ],
57
+ [ 'Kuala.*Lumpur', 44, "KUL", ],
58
+ [ 'Chongqing|Chung.*', 45, "CKG", ],
59
+ [ 'Santiago', 46, "SCL", ],
60
+ [ 'Toronto', 47, "YYZ", ],
61
+ [ 'Riyadh', 48, "RUH", ],
62
+ [ 'Atlanta', 49, "ATL", ],
63
+ [ 'Miami', 50, "MIA", ],
64
+ [ 'Detroit', 51, "DTW", ],
65
+ [ 'St..*Petersburg', 52, "LED", ],
66
+ [ 'Khartoum', 53, "KRT", ],
67
+ [ 'Sydney', 54, "SYD", ],
68
+ [ 'Milan', 55, "MXP", ],
69
+ [ 'Abidjan', 56, "ABJ", ],
70
+ [ 'Barcelona', 57, "BCN", ],
71
+ [ 'Nairobi', 58, "NBO", ],
72
+ [ 'Caracas', 59, "CCS", ],
73
+ [ 'Monterrey', 60, "MTY", ],
74
+ [ 'Phoenix', 61, "PHX", ],
75
+ [ 'Berlin', 62, "TXL", ],
76
+ [ 'Melbourne', 63, "MEL", ],
77
+ [ 'Casablanca', 64, "CMN", ],
78
+ [ 'Montreal', 65, "YUL", ],
79
+ [ 'Salvador', 66, "SSA", ],
80
+ [ 'Rome', 67, "FCO", ],
81
+ [ 'Kiev', 68, "KBP", ],
82
+ [ 'Ad+is.*Ab.ba', 69, "ADD", ],
83
+ [ 'Denver', 70, "DEN", ],
84
+ [ 'St.*Louis', 71, "STL", ],
85
+ [ 'Dakar', 72, "DKR", ],
86
+ [ 'San.*Juan', 73, "SJU", ],
87
+ [ 'Vancouver', 74, "YVR", ],
88
+ [ 'Tel.*Aviv', 75, "TLV", ],
89
+ [ 'Tunis', 76, "TUN", ],
90
+ [ 'Portland', 77, "PDX", ],
91
+ [ 'Manaus', 78, "MAO", ],
92
+ [ 'Calgary', 79, "YYC", ],
93
+ [ 'Halifax', 80, "YHZ", ],
94
+ [ 'Prague', 81, "PRG", ],
95
+ [ 'Copenhagen', 82, "CPH", ],
96
+ [ 'Djibouti', 83, "JIB", ],
97
+ [ 'Quito', 84, "UIO", ],
98
+ [ 'Helsinki', 85, "HEL", ],
99
+ [ 'Papeete|Tahiti', 86, "PPT", ],
100
+ [ 'Frankfurt', 87, "FRA", ],
101
+ [ 'Reykjavik', 88, "RKV", ],
102
+ [ 'Riga', 89, "RIX", ],
103
+ [ 'Antananarivo', 90, "TNR", ],
104
+ [ 'Amsterdam', 91, "AMS", ],
105
+ [ 'Bucharest', 92, "OTP", ],
106
+ [ 'Novosibirsk', 93, "OVB", ],
107
+ [ 'Kigali', 94, "KGL", ],
108
+ [ 'Dushanbe', 95, "DYU", ],
109
+ [ 'Dubai', 96, "DXB", ],
110
+ [ 'Bermuda', 97, "BDA", ],
111
+ [ 'Anchorage', 98, "ANC", ],
112
+ [ 'Austin', 99, "AUS", ],
113
+ [ 'Honolulu', 100, "HNL", ],
114
+ [ 'Apia', 101, "FGI", ],
115
+ [ 'Vienna', 102, "VIE", ],
116
+ [ 'Brussels', 103, "BRU", ],
117
+ [ 'Munich', 104, "MUC", ],
118
+ [ 'Dublin', 105, "DUB", ],
119
+ [ 'Doha', 106, "DOH", ],
120
+ [ 'Taipei', 107, "TPE", ],
121
+ [ 'Yakutsk', 108, "YKS", ],
122
+ [ 'Z.rich', 109, "ZRH", ],
123
+ [ 'Manchester', 110, "MAN", ],
124
+ [ 'Houston', 111, "IAH", ],
125
+ [ 'Charlotte', 112, "CLT", ],
126
+ [ 'Dallas', 113, "DFW", ],
127
+ [ 'Las.*Vegas', 114, "LAS", ],
128
+ [ 'Antalya', 115, "AYT", ],
129
+ [ 'Auckland', 116, "AKL", ],
130
+ ]
131
+
132
# Lookup tables filled in below:
#   MATCHED_AIRPORTS  idx        => airport loaded from the identifiers file
#   MATCH_ON_IATA     iata code  => entry built from AIRPORTS_TO_MATCH
#   MATCH_ON_CITY     city regex => entry, for rows that carry no iata code
MATCHED_AIRPORTS = {}
MATCH_ON_IATA    = {}
MATCH_ON_CITY    = {}
match_on_city_names = []

# Index every wanted city: by iata code when it has one, by city regexp otherwise.
AIRPORTS_TO_MATCH.each do |city_pattern, rank, iata_code|
  entry = { iata: iata_code, re: Regexp.new(city_pattern, 'i'), name: city_pattern, idx: rank }
  if iata_code.present?
    MATCH_ON_IATA[iata_code] = entry
    next
  end
  match_on_city_names << city_pattern
  MATCH_ON_CITY[entry[:re]] = entry
end
# only referenced by the disabled city-matching branch below
match_on_city_re = Regexp.new(match_on_city_names.join('|'))

# Walk the identifiers file and remember the airport found for each wanted
# iata code, warning when its city does not match the expected pattern.
Airport.load_tsv(:airport_identifiers) do |airport|
  airport.name = airport.name[0..30]
  if (entry = MATCH_ON_IATA[airport.iata])
    unless entry[:re] =~ airport.city
      warn (entry.values + [airport.to_tsv]).join("\t")
    end
    MATCHED_AIRPORTS[entry[:idx]] = airport
  # elsif (airport.city =~ match_on_city_re)
  #   MATCH_ON_CITY.each do |re, hsh|
  #     if (airport.city =~ re)
  #       puts [airport.to_tsv, hsh[:name], hsh[:idx]].join("\t")
  #     end
  #   end
  end
end

# Report one line per wanted city, in table order; unmatched cities get a
# run of empty columns in place of the airport's TSV.
AIRPORTS_TO_MATCH.each do |city_pattern, rank, _iata_code|
  # next if MATCHED_AIRPORTS[idx]
  found = MATCHED_AIRPORTS[rank]
  airport_str = found ? found.to_tsv : "\t\t\t\t\t\t\t\t\t\t\t\t"
  puts [airport_str, city_pattern, "", rank].join("\t")
end