pumi 0.19.0 → 0.20.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +8 -0
- data/bin/parse_data +2 -5
- data/data/communes.yml +9904 -0
- data/data/districts.yml +1576 -0
- data/data/provinces.yml +225 -0
- data/lib/pumi/bot/wikipedia/article.rb +13 -0
- data/lib/pumi/bot/wikipedia/communes_in_cambodia_article.rb +157 -0
- data/lib/pumi/bot/wikipedia/districts_in_cambodia_article.rb +122 -0
- data/lib/pumi/bot/wikipedia/templates/commune_list.wikitext.erb +46 -0
- data/lib/pumi/bot/wikipedia/templates/district_list.wikitext.erb +27 -0
- data/lib/pumi/bot/wikipedia.rb +10 -0
- data/lib/pumi/bot.rb +6 -0
- data/lib/pumi/data_source/geocoder.rb +251 -0
- data/lib/pumi/data_source/iso31662.rb +29 -0
- data/lib/pumi/data_source/wikipedia.rb +19 -524
- data/lib/pumi/data_source.rb +2 -0
- data/lib/pumi/geodata.rb +3 -0
- data/lib/pumi/location.rb +2 -0
- data/lib/pumi/parser.rb +7 -0
- data/lib/pumi/version.rb +1 -1
- data/lib/pumi/wikipedia/client.rb +68 -0
- data/lib/pumi/wikipedia/response.rb +15 -0
- data/lib/pumi/wikipedia.rb +7 -0
- data/lib/pumi.rb +1 -1
- data/pumi.gemspec +4 -1
- metadata +58 -4
- data/lib/pumi/scraper/result.rb +0 -5
@@ -0,0 +1,46 @@
|
|
1
|
+
<div id="communes-list">
<% provinces.each do |province| %><%# One section per province; inside it, one sub-section and one commune table per district. %>
<% province_title = URI.parse(province.links[:wikipedia]).path.split("/")[-1] %>
==[[<%= province_title %>|<%= province.full_name_en %>]]==
<div id=province-communes-<%= province.id %>>
<%= province.name_en %> contains <%= province.communes_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>

<% province.districts.each do |district| %>
<% if district.links[:wikipedia] %><%# Link the heading only when the district has a Wikipedia URL. %>
<% district_title = URI.parse(district.links[:wikipedia]).path.split("/")[-1] %>
===[[<%= district_title %>|<%= district.full_name_en %>]]===
<% else %>
===<%= district.full_name_en %>===
<% end %>

<div id=district-communes-<%= district.id %>>
<%= district.name_en %> contains <%= district.communes_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/district.castle?ds=<%= district.id %> |title=<%= district.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>

{| class="wikitable sortable"
|-
! #
! Name
! Khmer
! Administrative Unit
! Geocode
|-

<% district.communes.each_with_index do |commune, position| %><%# position is zero-based; the table numbers rows from 1. %>
| <%= position.succ %>
<% if commune.links[:wikipedia] %>
<% commune_title = URI.parse(commune.links[:wikipedia]).path.split("/")[-1].tr("_", " ") %>
| [[<%= commune_title %>|<%= commune.name_en %>]]
<% else %>
| <%= commune.name_en %>
<% end %>
| <%= commune.name_km %>
| <%= format("%s (%s %s)", commune.administrative_unit.name_en, commune.administrative_unit.name_km, commune.administrative_unit.name_latin) %>
| <%= commune.id %>
|-
<% end %>
|}
</div>
<% end %>
</div>
<% end %>
</div>
|
@@ -0,0 +1,27 @@
|
|
1
|
+
<div id=province-districts-<%= province.id %>>
<%= province.name_en %> contains <%= districts_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>

{| class="wikitable sortable"
|-
! #
! Name
! Khmer
! Administrative Unit
! Geocode
|-

<% districts.each_with_index do |district, index| %><%# Renders one table row per district of the given province; expects the locals province, districts and districts_summary. %>
| <%= index + 1 %>
<% if district.links[:wikipedia] %><%# Link the name only when the district has a Wikipedia URL. %>
<% district_page = URI.parse(district.links[:wikipedia]).path.split("/").last.gsub("_", " ") %>
| [[<%= district_page %>|<%= district.name_en %>]]
<% else %>
| <%= district.name_en %>
<% end %>
| <%= district.name_km %>
| <%= "#{district.administrative_unit.name_en} (#{district.administrative_unit.name_km} #{district.administrative_unit.name_latin})" %>
| <%= district.id %>
|-
<% end %>
|}
</div>
|
data/lib/pumi/bot.rb
ADDED
@@ -0,0 +1,251 @@
|
|
1
|
+
require "geocoder"
|
2
|
+
|
3
|
+
module Pumi
|
4
|
+
module DataSource
|
5
|
+
class Geocoder
|
6
|
+
# Geocoding outcome for one location: the location's code plus the latitude,
# longitude and bounding box chosen for it.
Result = Struct.new(:code, :lat, :long, :bounding_box, keyword_init: true)
|
7
|
+
|
8
|
+
class AbstractGeocoder
|
9
|
+
# Provider-independent view of one raw geocoder hit. The provider classes
# fill these fields from their own response formats so the filters can
# compare hits uniformly.
Result = Struct.new(
  :lat, :long, :bounding_box, :country_code,
  :types, :iso3166_2, :district_name_en,
  :name,
  keyword_init: true
)

# Pairs a correctly spelled name with the misspelled text that should be
# searched for in its place.
Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)

# Known misspellings consulted when building search terms. Frozen so the
# shared constant cannot be mutated at runtime.
MISSPELLINGS = [].freeze
|
19
|
+
|
20
|
+
# Base adapter around a single geocoder lookup. Subclasses supply
# build_result to turn a raw result's data into a Result.
class AbstractProvider
  attr_reader :geocoder, :name

  # @param geocoder [#search] object that performs the lookups
  # @param name lookup identifier passed to the geocoder as +lookup:+
  def initialize(geocoder:, name:)
    @name = name
    @geocoder = geocoder
  end

  # Searches for +term+ with this provider's lookup and converts every hit
  # through build_result.
  def search(term)
    raw_results = geocoder.search(term, lookup: name)
    raw_results.map { |raw_result| build_result(raw_result.data) }
  end
end
|
34
|
+
|
35
|
+
class Google < AbstractProvider
|
36
|
+
private
|
37
|
+
|
38
|
+
# Converts one raw Google geocoding result hash into a Result.
#
# The province is looked up by the long name of the
# "administrative_area_level_1" component to derive the ISO 3166-2 code.
# Components that are absent yield nil fields rather than an error.
#
# @param data [Hash] raw result data with string keys
# @return [Result]
def build_result(data)
  province_name_en = find_address_component(
    data,
    "administrative_area_level_1"
  )&.fetch("long_name")
  province = Pumi::Province.where(full_name_en: province_name_en).first

  # A result without a "country" component gets a nil country code instead
  # of raising NoMethodError, matching how the other components are read.
  country_code = find_address_component(data, "country")&.fetch("short_name")&.upcase
  district_name_en = find_address_component(
    data,
    "administrative_area_level_2"
  )&.fetch("long_name")

  Result.new(
    name: data.dig("address_components", 0, "long_name"),
    lat: data.dig("geometry", "location", "lat"),
    long: data.dig("geometry", "location", "lng"),
    bounding_box: [
      data.dig("geometry", "bounds", "northeast", "lat"),
      data.dig("geometry", "bounds", "northeast", "lng"),
      data.dig("geometry", "bounds", "southwest", "lat"),
      data.dig("geometry", "bounds", "southwest", "lng")
    ],
    country_code: country_code,
    district_name_en: district_name_en,
    types: data["types"],
    iso3166_2: province&.iso3166_2
  )
end
|
63
|
+
|
64
|
+
# Returns the first entry of data["address_components"] whose "types" list
# contains +type+, or nil when none does. Raises KeyError if either key is
# missing.
def find_address_component(data, type)
  components = data.fetch("address_components")
  components.find { |component| component.fetch("types").include?(type) }
end
|
69
|
+
end
|
70
|
+
|
71
|
+
# Provider that maps raw Nominatim result data onto Result.
class Nominatim < AbstractProvider
  private

  # Builds a Result from one raw result hash (string keys). The name is
  # always nil; "type" is wrapped into an array.
  def build_result(data)
    country_code = data.dig("address", "country_code")

    Result.new(
      lat: data["lat"],
      long: data["lon"],
      bounding_box: data["boundingbox"],
      country_code: country_code&.upcase,
      iso3166_2: data.dig("address", "ISO3166-2-lvl4"),
      district_name_en: data.dig("address", "county"),
      types: Array(data["type"]),
      name: nil
    )
  end
end
|
87
|
+
|
88
|
+
# Maps each supported lookup name to the provider class that wraps it; the
# key is also the name handed to the provider when it is instantiated.
PROVIDERS = {
|
89
|
+
nominatim: Nominatim,
|
90
|
+
google: Google
|
91
|
+
}.freeze
|
92
|
+
|
93
|
+
attr_reader :providers, :options
|
94
|
+
|
95
|
+
# Configures the geocoder with the Google API key read from
# ENV["GOOGLE_API_KEY"] and instantiates one provider per requested name.
#
# @param geocoder object responding to +configure+ (and used by providers to search)
# @param providers [Symbol, Array<Symbol>] keys of PROVIDERS; an unknown key raises KeyError
# @param options remaining keyword arguments, kept as-is in +options+
def initialize(geocoder: ::Geocoder, providers: PROVIDERS.keys, **options)
  @options = options

  geocoder.configure(google: { api_key: ENV["GOOGLE_API_KEY"] })

  @providers = Array(providers).map do |provider_name|
    provider_class = PROVIDERS.fetch(provider_name)
    provider_class.new(geocoder: geocoder, name: provider_name)
  end
end
|
108
|
+
|
109
|
+
# Geocodes every location and returns the built results as an array.
#
# Locations that already have geodata are skipped unless
# options[:regeocode] is set. Locations for which no geocoder result is
# found are appended to ungeocoded_locations and produce no entry.
def geocode_all
  locations.each_with_object([]) do |location, results|
    next if !options[:regeocode] && !location.geodata.nil?

    match = geocode(location)

    if match.nil?
      ungeocoded_locations << location
      next
    end

    results << build_result(code: location.id, geocoder_result: match)
  end
end
|
123
|
+
|
124
|
+
private
|
125
|
+
|
126
|
+
# Tries each provider in order, and each search term for the location in
# order, returning the first result accepted by filter; nil when nothing
# matches.
def geocode(location)
  providers.each do |provider|
    search_terms = Array(build_search_term(location))

    search_terms.each do |search_term|
      match = filter(location, provider.search(search_term))
      return match unless match.nil?
    end
  end

  nil
end
|
138
|
+
|
139
|
+
# Wraps the coordinates of the chosen geocoder result, together with the
# location code, in a Geocoder::Result.
def build_result(code:, geocoder_result:)
  coordinates = %i[lat long bounding_box].to_h do |attribute|
    [attribute, geocoder_result.public_send(attribute)]
  end

  Geocoder::Result.new(code: code, **coordinates)
end
|
147
|
+
|
148
|
+
# Returns the Khmer search terms for a location (full name first, then
# short name). A term equal to a known correct spelling is swapped for its
# recorded misspelling.
def build_search_term(location)
  [location.full_name_km, location.name_km].map do |term|
    known_misspelling = MISSPELLINGS.find { |entry| entry.correct_text == term }
    known_misspelling&.incorrect_text || term
  end
end
|
153
|
+
|
154
|
+
# Memoized list collecting the locations no geocoder result was found for.
def ungeocoded_locations
  return @ungeocoded_locations if @ungeocoded_locations

  @ungeocoded_locations = []
end
|
157
|
+
end
|
158
|
+
|
159
|
+
# Geocodes all provinces, searching by ISO 3166-2 code.
class CambodianProvinces < AbstractGeocoder
  private

  def locations
    return @locations if @locations

    @locations = Pumi::Province.all
  end

  # The province's ISO 3166-2 code is the only search term.
  def build_search_term(province)
    province.iso3166_2
  end

  # Picks the first result with the province's ISO 3166-2 code whose types
  # include "administrative".
  def filter(province, geocoder_results)
    geocoder_results.find { |candidate| province_match?(province, candidate) }
  end

  def province_match?(province, candidate)
    return false unless candidate.iso3166_2 == province.iso3166_2

    candidate.types.include?("administrative")
  end
end
|
176
|
+
|
177
|
+
# Geocodes all districts.
class CambodianDistricts < AbstractGeocoder
  private

  def locations
    return @locations if @locations

    @locations = Pumi::District.all
  end

  # Picks the first result in Cambodia ("KH") with the ISO 3166-2 code of the
  # district's province and a district-like set of types.
  def filter(district, geocoder_results)
    geocoder_results.find { |candidate| district_match?(district, candidate) }
  end

  def district_match?(district, candidate)
    return false unless candidate.country_code == "KH"
    return false unless candidate.iso3166_2 == district.province.iso3166_2

    district_types?(candidate.types)
  end

  # True when any accepted type is present, or when the types are exactly
  # "locality" and "political" (in any order).
  def district_types?(types)
    accepted = %w[administrative_area_level_2 town city administrative].any? do |type|
      types.include?(type)
    end

    accepted || (%w[locality political].sort == types.sort)
  end
end
|
195
|
+
|
196
|
+
# Geocodes all communes.
class CambodianCommunes < AbstractGeocoder
  private

  def locations
    return @locations if @locations

    @locations = Pumi::Commune.all
  end

  # Picks the first result in Cambodia ("KH") that belongs to the commune's
  # province or district and has a commune-like type.
  def filter(commune, geocoder_results)
    geocoder_results.find { |candidate| commune_match?(commune, candidate) }
  end

  def commune_match?(commune, candidate)
    return false unless candidate.country_code == "KH"
    return false unless same_province?(commune, candidate) || same_district?(commune, candidate)

    %w[administrative_area_level_3 village suburb neighbourhood].any? do |type|
      candidate.types.include?(type)
    end
  end

  def same_province?(commune, candidate)
    candidate.iso3166_2 == commune.province.iso3166_2
  end

  # Case-insensitive substring match of the district's English name within
  # the result's district name (a nil district name is treated as "").
  def same_district?(commune, candidate)
    candidate.district_name_en.to_s.downcase.include?(commune.district.name_en.downcase)
  end
end
|
213
|
+
|
214
|
+
attr_reader :data_file
attr_reader :geocoder

# @param data_file object whose +read+ / +write+ are used to load and save the records
# @param geocoder object whose +geocode_all+ supplies the geocoded results
def initialize(data_file:, geocoder:)
  @geocoder = geocoder
  @data_file = data_file
end
|
220
|
+
|
221
|
+
# Merges geocoded coordinates into the records and writes them out.
#
# For every record whose code has a geocoded result, sets "lat", "long"
# and "bounding_box" under the record's "geodata" hash (created when
# absent; other keys in it are kept). Records without a result are left
# untouched.
#
# @param output_dir [String] directory handed to write_data!
# @return whatever write_data! returns
def load_data!(output_dir: "data")
  # Built on first use so that nothing is geocoded when there are no
  # records; indexing once replaces a linear scan of all results per record.
  results_by_code = nil

  data.each do |code, attributes|
    results_by_code ||= geocoded_results.each_with_object({}) do |result, index|
      index[result.code] ||= result # keep the first result per code
    end

    geocoded_result = results_by_code[code]
    next if geocoded_result.nil?

    geodata = (attributes["geodata"] ||= {})
    geodata["lat"] = geocoded_result.lat
    geodata["long"] = geocoded_result.long
    geodata["bounding_box"] = geocoded_result.bounding_box
  end

  write_data!(output_dir)
end
|
235
|
+
|
236
|
+
private
|
237
|
+
|
238
|
+
# Memoized contents of the data file.
def data
  return @data if @data

  @data = data_file.read
end
|
241
|
+
|
242
|
+
# Writes the (possibly updated) records through the data file into the
# given directory.
def write_data!(data_directory)
  data_file.write(data, data_directory: data_directory)
end
|
245
|
+
|
246
|
+
# Memoized results of geocoding all locations with the injected geocoder.
def geocoded_results
  return @geocoded_results if @geocoded_results

  @geocoded_results = geocoder.geocode_all
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Pumi
|
2
|
+
module DataSource
|
3
|
+
# Data source that stamps every record of the data file with an
# "iso3166_2" value derived from the record's code.
class ISO31662
  attr_reader :data_file

  # @param data_file object responding to +read+ and +write+; defaults to the provinces data file
  def initialize(data_file: Pumi::DataFile.new(:provinces))
    @data_file = data_file
  end

  # Sets "iso3166_2" on each record to "KH-" followed by the integer value
  # of its code (so "01" becomes "KH-1"), then writes the records to
  # +output_dir+.
  def load_data!(output_dir: "data")
    data.each do |province_code, province_attributes|
      province_attributes["iso3166_2"] = format("KH-%d", province_code.to_i)
    end

    write_data!(output_dir)
  end

  private

  # Memoized contents of the data file.
  def data
    return @data if @data

    @data = data_file.read
  end

  def write_data!(data_directory)
    data_file.write(data, data_directory: data_directory)
  end
end
|
28
|
+
end
|
29
|
+
end
|