pumi 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +8 -0
- data/bin/parse_data +0 -5
- data/data/communes.yml +9904 -0
- data/data/districts.yml +1576 -0
- data/data/provinces.yml +225 -0
- data/lib/pumi/bot/wikipedia/article.rb +13 -0
- data/lib/pumi/bot/wikipedia/communes_in_cambodia_article.rb +157 -0
- data/lib/pumi/bot/wikipedia/districts_in_cambodia_article.rb +122 -0
- data/lib/pumi/bot/wikipedia/templates/commune_list.wikitext.erb +46 -0
- data/lib/pumi/bot/wikipedia/templates/district_list.wikitext.erb +27 -0
- data/lib/pumi/bot/wikipedia.rb +10 -0
- data/lib/pumi/bot.rb +6 -0
- data/lib/pumi/data_source/geocoder.rb +251 -0
- data/lib/pumi/data_source/iso31662.rb +29 -0
- data/lib/pumi/data_source/wikipedia.rb +19 -524
- data/lib/pumi/data_source.rb +2 -0
- data/lib/pumi/geodata.rb +3 -0
- data/lib/pumi/location.rb +2 -0
- data/lib/pumi/parser.rb +7 -0
- data/lib/pumi/version.rb +1 -1
- data/lib/pumi/wikipedia/client.rb +68 -0
- data/lib/pumi/wikipedia/response.rb +15 -0
- data/lib/pumi/wikipedia.rb +7 -0
- data/lib/pumi.rb +3 -0
- data/pumi.gemspec +4 -1
- metadata +58 -4
- data/lib/pumi/scraper/result.rb +0 -5
@@ -69,13 +69,13 @@ module Pumi
|
|
69
69
|
end
|
70
70
|
|
71
71
|
def find_url(province)
|
72
|
-
td = province_table_rows.
|
72
|
+
td = province_table_rows.at_xpath("child::td[contains(., '#{province.name_km}')]")
|
73
73
|
if td.nil?
|
74
74
|
raise WebScraper::ElementNotFoundError,
|
75
75
|
"No cell containing '#{province.name_km}' was found in a table on #{URL}"
|
76
76
|
end
|
77
77
|
|
78
|
-
link = td.
|
78
|
+
link = td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
|
79
79
|
URI.join(URL, link[:href]).to_s
|
80
80
|
end
|
81
81
|
|
@@ -83,8 +83,8 @@ module Pumi
|
|
83
83
|
@province_table_rows ||= begin
|
84
84
|
sample_province = Province.all.first
|
85
85
|
|
86
|
-
sample_row = scraper.page.
|
87
|
-
if sample_row.
|
86
|
+
sample_row = scraper.page.at_xpath("//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]")
|
87
|
+
if sample_row.at_xpath("//a[text()[contains(., '#{sample_province.name_en}')]]").nil?
|
88
88
|
raise WebScraper::ElementNotFoundError,
|
89
89
|
"No link containing '#{sample_province.name_en}' was found in a table on #{URL}"
|
90
90
|
end
|
@@ -113,547 +113,42 @@ module Pumi
|
|
113
113
|
end
|
114
114
|
|
115
115
|
def find_url(district)
|
116
|
-
|
117
|
-
list_items = scraper.page.xpath("//ol/li[text()[contains(., '#{identifier}')]]")
|
116
|
+
geocode = scraper.page.at_xpath("//td[text()[contains(., '#{district.id}')]]")
|
118
117
|
|
119
|
-
return if
|
118
|
+
return if geocode.nil?
|
120
119
|
|
121
|
-
|
122
|
-
raise WebScraper::ElementNotFoundError,
|
123
|
-
"More than one element was found with the identifier '#{identifier}' on #{URL}"
|
124
|
-
end
|
120
|
+
link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
|
125
121
|
|
126
|
-
|
127
|
-
return unless link
|
122
|
+
return if link.nil?
|
128
123
|
|
129
124
|
URI.join(URL, link[:href]).to_s
|
130
125
|
end
|
131
126
|
end
|
132
127
|
|
133
128
|
class CambodianCommunesScraper
|
134
|
-
class CommuneNotFoundError < StandardError; end
|
135
|
-
class DuplicateCommuneError < StandardError; end
|
136
|
-
|
137
129
|
URL = "https://en.wikipedia.org/wiki/List_of_communes_in_Cambodia".freeze
|
138
|
-
Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
|
139
|
-
InvalidCommuneLink = Struct.new(:district_code, :name, keyword_init: true)
|
140
|
-
|
141
|
-
MISSING_LOCATIONS = [
|
142
|
-
"Taing Kouk District",
|
143
|
-
"Bokor Municipality",
|
144
|
-
"Ta Lou Senchey District",
|
145
|
-
"Kaoh Rung Municipality",
|
146
|
-
"Borei Ou Svay Senchey District"
|
147
|
-
].freeze
|
148
|
-
|
149
|
-
INVALID_COMMUNE_LINKS = [
|
150
|
-
InvalidCommuneLink.new(district_code: "0301", name: "Prasat"),
|
151
|
-
InvalidCommuneLink.new(district_code: "0302", name: "Svay Teab"),
|
152
|
-
InvalidCommuneLink.new(district_code: "0306", name: "Kokor"),
|
153
|
-
InvalidCommuneLink.new(district_code: "0306", name: "Krala"),
|
154
|
-
InvalidCommuneLink.new(district_code: "0307", name: "Angkor Ban"),
|
155
|
-
InvalidCommuneLink.new(district_code: "0307", name: "Sdau"),
|
156
|
-
InvalidCommuneLink.new(district_code: "0308", name: "Koh Sotin"),
|
157
|
-
InvalidCommuneLink.new(district_code: "0601", name: "Treal"),
|
158
|
-
InvalidCommuneLink.new(district_code: "0601", name: "Baray"),
|
159
|
-
InvalidCommuneLink.new(district_code: "0313", name: "Mean"),
|
160
|
-
InvalidCommuneLink.new(district_code: "0313", name: "Lvea"),
|
161
|
-
InvalidCommuneLink.new(district_code: "0313", name: "Prey Chor"),
|
162
|
-
InvalidCommuneLink.new(district_code: "0314", name: "Baray"),
|
163
|
-
InvalidCommuneLink.new(district_code: "0314", name: "Mean Chey"),
|
164
|
-
InvalidCommuneLink.new(district_code: "0401", name: "Ponley"),
|
165
|
-
InvalidCommuneLink.new(district_code: "0405", name: "Longveaek"),
|
166
|
-
InvalidCommuneLink.new(district_code: "0405", name: "Saeb"),
|
167
|
-
InvalidCommuneLink.new(district_code: "0406", name: "Svay Chrum"),
|
168
|
-
InvalidCommuneLink.new(district_code: "0501", name: "Basedth"),
|
169
|
-
InvalidCommuneLink.new(district_code: "0503", name: "Srang"),
|
170
|
-
InvalidCommuneLink.new(district_code: "0503", name: "Veal"),
|
171
|
-
InvalidCommuneLink.new(district_code: "0507", name: "Samraong Tong"),
|
172
|
-
InvalidCommuneLink.new(district_code: "0510", name: "Mean Chey"),
|
173
|
-
InvalidCommuneLink.new(district_code: "0510", name: "Phnum Touch"),
|
174
|
-
InvalidCommuneLink.new(district_code: "0606", name: "Chheu Teal"),
|
175
|
-
InvalidCommuneLink.new(district_code: "0606", name: "Klaeng"),
|
176
|
-
InvalidCommuneLink.new(district_code: "0606", name: "Mean Chey"),
|
177
|
-
InvalidCommuneLink.new(district_code: "0608", name: "Trea"),
|
178
|
-
InvalidCommuneLink.new(district_code: "0604", name: "Daung"),
|
179
|
-
InvalidCommuneLink.new(district_code: "0606", name: "Sandaan"),
|
180
|
-
InvalidCommuneLink.new(district_code: "0607", name: "Kokoh"),
|
181
|
-
InvalidCommuneLink.new(district_code: "0701", name: "Angkor Chey"),
|
182
|
-
InvalidCommuneLink.new(district_code: "0703", name: "Chhouk"),
|
183
|
-
InvalidCommuneLink.new(district_code: "0703", name: "Meanchey"),
|
184
|
-
InvalidCommuneLink.new(district_code: "0708", name: "Kampong Bay"),
|
185
|
-
InvalidCommuneLink.new(district_code: "0801", name: "Siem Reap"),
|
186
|
-
InvalidCommuneLink.new(district_code: "0801", name: "Trea"),
|
187
|
-
InvalidCommuneLink.new(district_code: "0802", name: "Chheu Teal"),
|
188
|
-
InvalidCommuneLink.new(district_code: "0802", name: "Kokir"),
|
189
|
-
InvalidCommuneLink.new(district_code: "0804", name: "Leuk Daek"),
|
190
|
-
InvalidCommuneLink.new(district_code: "0808", name: "Mkak"),
|
191
|
-
InvalidCommuneLink.new(district_code: "0809", name: "Ponhea Leu"),
|
192
|
-
InvalidCommuneLink.new(district_code: "0811", name: "Ta Khmau"),
|
193
|
-
InvalidCommuneLink.new(district_code: "0813", name: "Svay Chrum"),
|
194
|
-
InvalidCommuneLink.new(district_code: "0904", name: "Smach Meanchey"),
|
195
|
-
InvalidCommuneLink.new(district_code: "0906", name: "Srae Ambel"),
|
196
|
-
InvalidCommuneLink.new(district_code: "1001", name: "Chhloung"),
|
197
|
-
InvalidCommuneLink.new(district_code: "1003", name: "Preaek Prasab"),
|
198
|
-
InvalidCommuneLink.new(district_code: "1003", name: "Tamao"),
|
199
|
-
InvalidCommuneLink.new(district_code: "1004", name: "Sombo"),
|
200
|
-
InvalidCommuneLink.new(district_code: "1006", name: "Sambok"),
|
201
|
-
InvalidCommuneLink.new(district_code: "1303", name: "Choam Khsant"),
|
202
|
-
InvalidCommuneLink.new(district_code: "1304", name: "Phnom Penh"),
|
203
|
-
InvalidCommuneLink.new(district_code: "1305", name: "Ratanak"),
|
204
|
-
InvalidCommuneLink.new(district_code: "1308", name: "Pahal"),
|
205
|
-
InvalidCommuneLink.new(district_code: "1403", name: "Kampong Trabaek"),
|
206
|
-
InvalidCommuneLink.new(district_code: "1403", name: "Prey Chhor"),
|
207
|
-
InvalidCommuneLink.new(district_code: "1404", name: "Kanhchriech"),
|
208
|
-
InvalidCommuneLink.new(district_code: "1405", name: "Svay Chrum"),
|
209
|
-
InvalidCommuneLink.new(district_code: "1409", name: "Lvea"),
|
210
|
-
InvalidCommuneLink.new(district_code: "1409", name: "Preah Sdach"),
|
211
|
-
InvalidCommuneLink.new(district_code: "1410", name: "Baray"),
|
212
|
-
InvalidCommuneLink.new(district_code: "1410", name: "Kampong Leav"),
|
213
|
-
InvalidCommuneLink.new(district_code: "1411", name: "Takor"),
|
214
|
-
InvalidCommuneLink.new(district_code: "1502", name: "Anlong Vil"),
|
215
|
-
InvalidCommuneLink.new(district_code: "1502", name: "Kandieng"),
|
216
|
-
InvalidCommuneLink.new(district_code: "1502", name: "Sya"),
|
217
|
-
InvalidCommuneLink.new(district_code: "1502", name: "Veal"),
|
218
|
-
InvalidCommuneLink.new(district_code: "1604", name: "Teun"),
|
219
|
-
InvalidCommuneLink.new(district_code: "1606", name: "Poy"),
|
220
|
-
InvalidCommuneLink.new(district_code: "1607", name: "Sesan"),
|
221
|
-
InvalidCommuneLink.new(district_code: "1607", name: "Yatung"),
|
222
|
-
InvalidCommuneLink.new(district_code: "1609", name: "Pong"),
|
223
|
-
InvalidCommuneLink.new(district_code: "1609", name: "Veun Sai"),
|
224
|
-
InvalidCommuneLink.new(district_code: "1701", name: "Koal"),
|
225
|
-
InvalidCommuneLink.new(district_code: "1702", name: "Svay Chek"),
|
226
|
-
InvalidCommuneLink.new(district_code: "1704", name: "Chi Kraeng"),
|
227
|
-
InvalidCommuneLink.new(district_code: "1704", name: "Kampong Kdei"),
|
228
|
-
InvalidCommuneLink.new(district_code: "1706", name: "Kralanh"),
|
229
|
-
InvalidCommuneLink.new(district_code: "1706", name: "Sen Sok"),
|
230
|
-
InvalidCommuneLink.new(district_code: "1707", name: "Lvea"),
|
231
|
-
InvalidCommuneLink.new(district_code: "1707", name: "Reul"),
|
232
|
-
InvalidCommuneLink.new(district_code: "1709", name: "Bakong"),
|
233
|
-
InvalidCommuneLink.new(district_code: "1709", name: "Meanchey"),
|
234
|
-
InvalidCommuneLink.new(district_code: "1710", name: "Nokor Thom"),
|
235
|
-
InvalidCommuneLink.new(district_code: "1711", name: "Popel"),
|
236
|
-
InvalidCommuneLink.new(district_code: "1713", name: "Svay Leu"),
|
237
|
-
InvalidCommuneLink.new(district_code: "1801", name: "Koh Rong"),
|
238
|
-
InvalidCommuneLink.new(district_code: "1802", name: "Ou Chrov"),
|
239
|
-
InvalidCommuneLink.new(district_code: "1802", name: "Prey Nob"),
|
240
|
-
InvalidCommuneLink.new(district_code: "1802", name: "Ream"),
|
241
|
-
InvalidCommuneLink.new(district_code: "1804", name: "Kampong Seila"),
|
242
|
-
InvalidCommuneLink.new(district_code: "1901", name: "Sdau"),
|
243
|
-
InvalidCommuneLink.new(district_code: "1902", name: "Siem Bouk"),
|
244
|
-
InvalidCommuneLink.new(district_code: "1903", name: "Sekong"),
|
245
|
-
InvalidCommuneLink.new(district_code: "2001", name: "Chantrea"),
|
246
|
-
InvalidCommuneLink.new(district_code: "2002", name: "Preah Ponlea"),
|
247
|
-
InvalidCommuneLink.new(district_code: "2003", name: "Svay Chek"),
|
248
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Ampel"),
|
249
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Daung"),
|
250
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Kampong Trach"),
|
251
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Kokir"),
|
252
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Krasang"),
|
253
|
-
InvalidCommuneLink.new(district_code: "2005", name: "Bassak"),
|
254
|
-
InvalidCommuneLink.new(district_code: "2005", name: "Chheu Teal"),
|
255
|
-
InvalidCommuneLink.new(district_code: "2005", name: "Svay Chrum"),
|
256
|
-
InvalidCommuneLink.new(district_code: "2008", name: "Bavet"),
|
257
|
-
InvalidCommuneLink.new(district_code: "2008", name: "Prasat"),
|
258
|
-
InvalidCommuneLink.new(district_code: "2201", name: "Anlong Veaeng"),
|
259
|
-
InvalidCommuneLink.new(district_code: "2401", name: "Pailin"),
|
260
|
-
InvalidCommuneLink.new(district_code: "2502", name: "Chhouk"),
|
261
|
-
InvalidCommuneLink.new(district_code: "2502", name: "Trea"),
|
262
|
-
InvalidCommuneLink.new(district_code: "2503", name: "Memot"),
|
263
|
-
InvalidCommuneLink.new(district_code: "2503", name: "Kokir"),
|
264
|
-
InvalidCommuneLink.new(district_code: "2504", name: "Chork"),
|
265
|
-
InvalidCommuneLink.new(district_code: "2504", name: "Mean"),
|
266
|
-
InvalidCommuneLink.new(district_code: "2505", name: "Popel"),
|
267
|
-
InvalidCommuneLink.new(district_code: "2507", name: "Chikor")
|
268
|
-
].freeze
|
269
|
-
|
270
|
-
MISSPELLINGS = [
|
271
|
-
Misspelling.new(incorrect_text: "Kratié Province", correct_text: "Kratie Province"),
|
272
|
-
Misspelling.new(
|
273
|
-
incorrect_text: "Mondulkiri Province",
|
274
|
-
correct_text: "Mondul Kiri Province"
|
275
|
-
),
|
276
|
-
Misspelling.new(
|
277
|
-
incorrect_text: "Ratanakiri Province",
|
278
|
-
correct_text: "Ratanak Kiri Province"
|
279
|
-
),
|
280
|
-
Misspelling.new(
|
281
|
-
incorrect_text: "Siem Reap Province",
|
282
|
-
correct_text: "Siemreap Province"
|
283
|
-
),
|
284
|
-
Misspelling.new(
|
285
|
-
incorrect_text: "Serei Saophoan District",
|
286
|
-
correct_text: "Serei Saophoan Municipality"
|
287
|
-
),
|
288
|
-
Misspelling.new(
|
289
|
-
incorrect_text: "Poipet Municipality",
|
290
|
-
correct_text: "Paoy Paet Municipality"
|
291
|
-
),
|
292
|
-
Misspelling.new(
|
293
|
-
incorrect_text: "Battambang District",
|
294
|
-
correct_text: "Battambang Municipality"
|
295
|
-
),
|
296
|
-
Misspelling.new(
|
297
|
-
incorrect_text: "Rotanak Mondol District",
|
298
|
-
correct_text: "Rotonak Mondol District"
|
299
|
-
),
|
300
|
-
Misspelling.new(
|
301
|
-
incorrect_text: "Sampov Loun District",
|
302
|
-
correct_text: "Sampov Lun District"
|
303
|
-
),
|
304
|
-
Misspelling.new(
|
305
|
-
incorrect_text: "Koh Kralor District",
|
306
|
-
correct_text: "Koas Krala District"
|
307
|
-
),
|
308
|
-
Misspelling.new(
|
309
|
-
incorrect_text: "Rukhak Kiri District",
|
310
|
-
correct_text: "Rukh Kiri District"
|
311
|
-
),
|
312
|
-
Misspelling.new(
|
313
|
-
incorrect_text: "Koh Sotin District",
|
314
|
-
correct_text: "Kaoh Soutin District"
|
315
|
-
),
|
316
|
-
Misspelling.new(
|
317
|
-
incorrect_text: "Srey Santhor District",
|
318
|
-
correct_text: "Srei Santhor District"
|
319
|
-
),
|
320
|
-
Misspelling.new(
|
321
|
-
incorrect_text: "Kong Pisey",
|
322
|
-
correct_text: "Kong Pisei District"
|
323
|
-
),
|
324
|
-
Misspelling.new(
|
325
|
-
incorrect_text: "Phnom Sruoch District",
|
326
|
-
correct_text: "Phnum Sruoch District"
|
327
|
-
),
|
328
|
-
Misspelling.new(
|
329
|
-
incorrect_text: "Stueng Saen District",
|
330
|
-
correct_text: "Stueng Saen Municipality"
|
331
|
-
),
|
332
|
-
Misspelling.new(
|
333
|
-
incorrect_text: "Prasat Balangk District",
|
334
|
-
correct_text: "Prasat Ballangk District"
|
335
|
-
),
|
336
|
-
Misspelling.new(
|
337
|
-
incorrect_text: "Kampot District",
|
338
|
-
correct_text: "Kampot Municipality"
|
339
|
-
),
|
340
|
-
Misspelling.new(
|
341
|
-
incorrect_text: "Kampot District",
|
342
|
-
correct_text: "Kampot Municipality"
|
343
|
-
),
|
344
|
-
Misspelling.new(
|
345
|
-
incorrect_text: "Koh Thum District",
|
346
|
-
correct_text: "Kaoh Thum District"
|
347
|
-
),
|
348
|
-
Misspelling.new(
|
349
|
-
incorrect_text: "Mukh Kamphool District",
|
350
|
-
correct_text: "Mukh Kampul District"
|
351
|
-
),
|
352
|
-
Misspelling.new(
|
353
|
-
incorrect_text: "Ponhea Leu District",
|
354
|
-
correct_text: "Ponhea Lueu District"
|
355
|
-
),
|
356
|
-
Misspelling.new(
|
357
|
-
incorrect_text: "Kiri Sakor",
|
358
|
-
correct_text: "Kiri Sakor District"
|
359
|
-
),
|
360
|
-
Misspelling.new(
|
361
|
-
incorrect_text: "Koh Kong",
|
362
|
-
correct_text: "Kaoh Kong District"
|
363
|
-
),
|
364
|
-
Misspelling.new(
|
365
|
-
incorrect_text: "Khemara Phoumin",
|
366
|
-
correct_text: "Khemara Phoumin Municipality"
|
367
|
-
),
|
368
|
-
Misspelling.new(
|
369
|
-
incorrect_text: "Mondol Seima",
|
370
|
-
correct_text: "Mondol Seima District"
|
371
|
-
),
|
372
|
-
Misspelling.new(
|
373
|
-
incorrect_text: "Srae Ambel",
|
374
|
-
correct_text: "Srae Ambel District"
|
375
|
-
),
|
376
|
-
Misspelling.new(
|
377
|
-
incorrect_text: "Thma Bang",
|
378
|
-
correct_text: "Thma Bang District"
|
379
|
-
),
|
380
|
-
Misspelling.new(
|
381
|
-
incorrect_text: "Kratie Municipality",
|
382
|
-
correct_text: "Kracheh Municipality"
|
383
|
-
),
|
384
|
-
Misspelling.new(
|
385
|
-
incorrect_text: "Preaek Prasab District",
|
386
|
-
correct_text: "Prek Prasab District"
|
387
|
-
),
|
388
|
-
Misspelling.new(
|
389
|
-
incorrect_text: "Krong Saen Monorom",
|
390
|
-
correct_text: "Saen Monourom Municipality"
|
391
|
-
),
|
392
|
-
Misspelling.new(
|
393
|
-
incorrect_text: "Khan Daun Penh",
|
394
|
-
correct_text: "Doun Penh Section"
|
395
|
-
),
|
396
|
-
Misspelling.new(
|
397
|
-
incorrect_text: "Khan Prampir Makara",
|
398
|
-
correct_text: "Prampir Meakkakra Section"
|
399
|
-
),
|
400
|
-
Misspelling.new(
|
401
|
-
incorrect_text: "Khan Meanchey",
|
402
|
-
correct_text: "Mean Chey Section"
|
403
|
-
),
|
404
|
-
Misspelling.new(
|
405
|
-
incorrect_text: "Khan Sen Sok",
|
406
|
-
correct_text: "Saensokh Section"
|
407
|
-
),
|
408
|
-
Misspelling.new(
|
409
|
-
incorrect_text: "Khan Por Sen Chey",
|
410
|
-
correct_text: "Pur SenChey Section"
|
411
|
-
),
|
412
|
-
Misspelling.new(
|
413
|
-
incorrect_text: "Khan Chrouy Changvar",
|
414
|
-
correct_text: "Chraoy Chongvar Section"
|
415
|
-
),
|
416
|
-
Misspelling.new(
|
417
|
-
incorrect_text: "Khan Prek Phnov",
|
418
|
-
correct_text: "Praek Pnov Section"
|
419
|
-
),
|
420
|
-
Misspelling.new(
|
421
|
-
incorrect_text: "Choam Khsant",
|
422
|
-
correct_text: "Choam Ksant District"
|
423
|
-
),
|
424
|
-
Misspelling.new(
|
425
|
-
incorrect_text: "Kulen",
|
426
|
-
correct_text: "Kuleaen District"
|
427
|
-
),
|
428
|
-
Misspelling.new(
|
429
|
-
incorrect_text: "Sangkom Thmei",
|
430
|
-
correct_text: "Sangkum Thmei District"
|
431
|
-
),
|
432
|
-
Misspelling.new(
|
433
|
-
incorrect_text: "Prey Veaeng",
|
434
|
-
correct_text: "Prey Veng Municipality"
|
435
|
-
),
|
436
|
-
Misspelling.new(
|
437
|
-
incorrect_text: "Por Reang",
|
438
|
-
correct_text: "Pur Rieng District"
|
439
|
-
),
|
440
|
-
Misspelling.new(
|
441
|
-
incorrect_text: "Veal Veng",
|
442
|
-
correct_text: "Veal Veaeng District"
|
443
|
-
),
|
444
|
-
Misspelling.new(
|
445
|
-
incorrect_text: "Krong Banlung",
|
446
|
-
correct_text: "Ban Lung Municipality"
|
447
|
-
),
|
448
|
-
Misspelling.new(
|
449
|
-
incorrect_text: "Angkor Thom",
|
450
|
-
correct_text: "Angkor Thum District"
|
451
|
-
),
|
452
|
-
Misspelling.new(
|
453
|
-
incorrect_text: "Sout Nikom",
|
454
|
-
correct_text: "Soutr Nikom District"
|
455
|
-
),
|
456
|
-
Misspelling.new(
|
457
|
-
incorrect_text: "Steung Hav",
|
458
|
-
correct_text: "Stueng Hav District"
|
459
|
-
),
|
460
|
-
Misspelling.new(
|
461
|
-
incorrect_text: "Krong Stung Treng",
|
462
|
-
correct_text: "Stueng Traeng Municipality"
|
463
|
-
),
|
464
|
-
Misspelling.new(
|
465
|
-
incorrect_text: "Bourei Cholsar District",
|
466
|
-
correct_text: "Borei Cholsar District"
|
467
|
-
),
|
468
|
-
Misspelling.new(
|
469
|
-
incorrect_text: "Damnak Chang'Eur",
|
470
|
-
correct_text: "Damnak Chang'aeur District"
|
471
|
-
),
|
472
|
-
Misspelling.new(
|
473
|
-
incorrect_text: "Krong Keb",
|
474
|
-
correct_text: "Kaeb Municipality"
|
475
|
-
),
|
476
|
-
Misspelling.new(
|
477
|
-
incorrect_text: "Sala Krao",
|
478
|
-
correct_text: "Sala Krau District"
|
479
|
-
),
|
480
|
-
Misspelling.new(
|
481
|
-
incorrect_text: "Dombae",
|
482
|
-
correct_text: "Dambae District"
|
483
|
-
),
|
484
|
-
Misspelling.new(
|
485
|
-
incorrect_text: "Krouch Chhma",
|
486
|
-
correct_text: "Krouch Chhmar District"
|
487
|
-
),
|
488
|
-
Misspelling.new(incorrect_text: "Paoy Char", correct_text: "Poy Char"),
|
489
|
-
Misspelling.new(incorrect_text: "Phnom Dei", correct_text: "Phnum Dei"),
|
490
|
-
Misspelling.new(incorrect_text: "Spean Sraeng Rouk", correct_text: "Spean Sraeng"),
|
491
|
-
Misspelling.new(incorrect_text: "Chhnuor", correct_text: "Chnuor Mean Chey"),
|
492
|
-
Misspelling.new(incorrect_text: "Chob", correct_text: "Chob Vari"),
|
493
|
-
Misspelling.new(incorrect_text: "Prasat Char", correct_text: "Prasat"),
|
494
|
-
Misspelling.new(incorrect_text: "Preah Netr Preah", correct_text: "Preak Netr Preah"),
|
495
|
-
Misspelling.new(incorrect_text: "Rohal Rohal", correct_text: "Rohal"),
|
496
|
-
Misspelling.new(incorrect_text: "Tuek Chour Smach", correct_text: "Tuek Chour"),
|
497
|
-
Misspelling.new(incorrect_text: "Ou Bei Choan", correct_text: "Ou Beichoan"),
|
498
|
-
Misspelling.new(incorrect_text: "Ou Sampor", correct_text: "Ou Sampoar"),
|
499
|
-
Misspelling.new(incorrect_text: "Poipet", correct_text: "Paoy Paet"),
|
500
|
-
Misspelling.new(incorrect_text: "Tuol Ta Aek", correct_text: "Tuol Ta Ek"),
|
501
|
-
Misspelling.new(incorrect_text: "Preaek Preah Sdach", correct_text: "Tuol Ta Ek"),
|
502
|
-
Misspelling.new(incorrect_text: "Chamkar Samraong", correct_text: "Chomkar Somraong"),
|
503
|
-
Misspelling.new(incorrect_text: "Sla Kaet", correct_text: "Sla Ket"),
|
504
|
-
Misspelling.new(incorrect_text: "Ou Mal", correct_text: "OMal"),
|
505
|
-
Misspelling.new(incorrect_text: "Voat Kor", correct_text: "wat Kor"),
|
506
|
-
Misspelling.new(incorrect_text: "Svay Pao", correct_text: "Svay Por"),
|
507
|
-
Misspelling.new(incorrect_text: "Kdol Tahen", correct_text: "Kdol Ta Haen"),
|
508
|
-
Misspelling.new(incorrect_text: "Kaoh Chiveang Thvang", correct_text: "Kaoh Chiveang"),
|
509
|
-
Misspelling.new(incorrect_text: "Voat Ta Muem", correct_text: "Vaot Ta Muem"),
|
510
|
-
Misspelling.new(incorrect_text: "Ou Samrel", correct_text: "Ou Samril"),
|
511
|
-
Misspelling.new(incorrect_text: "Ta Krai", correct_text: "Ta Krei"),
|
512
|
-
Misspelling.new(incorrect_text: "Prek Chik", correct_text: "Preaek Chik"),
|
513
|
-
Misspelling.new(incorrect_text: "Prey Chor", correct_text: "Prey Chhor"),
|
514
|
-
Misspelling.new(incorrect_text: "Samraong Tong", correct_text: "Samrong Tong"),
|
515
|
-
Misspelling.new(incorrect_text: "Phnum Touch", correct_text: "Phnom Touch"),
|
516
|
-
Misspelling.new(incorrect_text: "Tonle Bassac", correct_text: "Tonle Basak"),
|
517
|
-
Misspelling.new(incorrect_text: "Chey Chumneas", correct_text: "Chey Chummeah"),
|
518
|
-
Misspelling.new(incorrect_text: "Boeung Prolit", correct_text: "Boeng Proluet"),
|
519
|
-
Misspelling.new(incorrect_text: "Tuol Svay Prey 2",
|
520
|
-
correct_text: "Tuol Svay Prey Ti Pir"),
|
521
|
-
Misspelling.new(incorrect_text: "Neak Leung", correct_text: "Neak Loeang"),
|
522
|
-
Misspelling.new(incorrect_text: "Kratie", correct_text: "Kracheh"),
|
523
|
-
Misspelling.new(incorrect_text: "Pate", correct_text: "Pa Te"),
|
524
|
-
Misspelling.new(incorrect_text: "Prek Phtoul Commune", correct_text: "Preaek Phtoul"),
|
525
|
-
Misspelling.new(incorrect_text: "Bourei Cholsar Commune", correct_text: "Borei Cholsar"),
|
526
|
-
Misspelling.new(
|
527
|
-
incorrect_text: "Kampeaeng Commune (Kiri Vong District)",
|
528
|
-
correct_text: "Kampeaeng"
|
529
|
-
),
|
530
|
-
Misspelling.new(
|
531
|
-
incorrect_text: "Prey Rumdeng Commune (Kiri Vong District)",
|
532
|
-
correct_text: "Prey Rumdeng"
|
533
|
-
),
|
534
|
-
Misspelling.new(incorrect_text: "Ta Our Commune", correct_text: "Ta Ou"),
|
535
|
-
Misspelling.new(incorrect_text: "Kompaeng Commune", correct_text: "Kampeaeng"),
|
536
|
-
Misspelling.new(incorrect_text: "Kompong Reab Commune", correct_text: "Kampong Reab"),
|
537
|
-
Misspelling.new(incorrect_text: "Our Saray Commune", correct_text: "Ou Saray"),
|
538
|
-
Misspelling.new(
|
539
|
-
incorrect_text: "Trapeang Kranhung Commune",
|
540
|
-
correct_text: "Trapeang Kranhoung"
|
541
|
-
),
|
542
|
-
Misspelling.new(incorrect_text: "Angk Kev Commune", correct_text: "Angk Kaev"),
|
543
|
-
Misspelling.new(incorrect_text: "Sanlong Commune", correct_text: "Sanlung"),
|
544
|
-
Misspelling.new(incorrect_text: "O Smach", correct_text: "Ou Smach")
|
545
|
-
].freeze
|
546
130
|
|
547
131
|
def scrape!
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
province_section = find_section(
|
552
|
-
text: district.province.address_en,
|
553
|
-
section: scraper.page,
|
554
|
-
xpath_pattern: "//h2//a[text()[contains(., \"%<text>s\")]]"
|
555
|
-
).xpath("ancestor::h2/following-sibling::div").first
|
556
|
-
|
557
|
-
district_title = find_section(
|
558
|
-
text: [district.full_name_en, district.full_name_latin, district.name_latin],
|
559
|
-
section: province_section,
|
560
|
-
xpath_pattern: "child::table//tr/td//h3//*[text()[contains(., \"%<text>s\")]]"
|
561
|
-
)
|
562
|
-
|
563
|
-
next unless district_title
|
564
|
-
|
565
|
-
district_section = district_title.xpath("ancestor::h3/following-sibling::*").first
|
566
|
-
commune_links = district_section.xpath("child::li//a[contains(@href, '/wiki/')]")
|
567
|
-
|
568
|
-
commune_links.each do |link|
|
569
|
-
invalid_commune_link = find_invalid_commune_link(district:, text: link.text)
|
570
|
-
|
571
|
-
next if invalid_commune_link
|
572
|
-
|
573
|
-
commune = begin
|
574
|
-
find_commune(
|
575
|
-
district:,
|
576
|
-
names: {
|
577
|
-
name_latin: link.text,
|
578
|
-
full_name_en: link.text,
|
579
|
-
full_name_latin: link.text
|
580
|
-
}
|
581
|
-
)
|
582
|
-
rescue CommuneNotFoundError => e
|
583
|
-
misspelling = MISSPELLINGS.find do |m|
|
584
|
-
m.incorrect_text == link.text
|
585
|
-
end
|
586
|
-
|
587
|
-
raise(e) unless misspelling
|
588
|
-
|
589
|
-
find_commune(district:, names: { name_latin: misspelling.correct_text })
|
590
|
-
end
|
591
|
-
|
592
|
-
result << ScraperResult.new(code: commune.id,
|
593
|
-
wikipedia: URI.join(
|
594
|
-
URL, link[:href]
|
595
|
-
).to_s)
|
596
|
-
end
|
597
|
-
end
|
598
|
-
|
599
|
-
result
|
600
|
-
end
|
601
|
-
|
602
|
-
private
|
132
|
+
Commune.all.each_with_object([]) do |commune, result|
|
133
|
+
url = find_url(commune)
|
134
|
+
next if url.nil?
|
603
135
|
|
604
|
-
|
605
|
-
|
606
|
-
def find_invalid_commune_link(district:, text:)
|
607
|
-
INVALID_COMMUNE_LINKS.find do |c|
|
608
|
-
c.district_code == district.id && c.name == text
|
136
|
+
result << ScraperResult.new(code: commune.id, wikipedia: url)
|
609
137
|
end
|
610
138
|
end
|
611
139
|
|
612
|
-
|
613
|
-
texts = Array(text)
|
614
|
-
default_text = texts.first
|
615
|
-
texts.each do |t|
|
616
|
-
return find_link(text: t, section:, xpath_pattern:)
|
617
|
-
rescue WebScraper::ElementNotFoundError => e
|
618
|
-
raise(e) if t == texts.last
|
619
|
-
end
|
620
|
-
rescue WebScraper::ElementNotFoundError => e
|
621
|
-
misspelling = MISSPELLINGS.find do |m|
|
622
|
-
m.correct_text == default_text
|
623
|
-
end
|
624
|
-
|
625
|
-
return if !misspelling && MISSING_LOCATIONS.include?(default_text)
|
626
|
-
raise(e) unless misspelling
|
627
|
-
|
628
|
-
find_link(text: misspelling.incorrect_text, section:, xpath_pattern:)
|
629
|
-
end
|
630
|
-
|
631
|
-
def find_link(text:, section:, xpath_pattern:)
|
632
|
-
xpath = format(xpath_pattern, text:)
|
633
|
-
result = section.xpath(xpath)
|
634
|
-
|
635
|
-
return result.first if result.size == 1
|
636
|
-
|
637
|
-
raise WebScraper::ElementNotFoundError,
|
638
|
-
"No link or many links found on #{URL} (xpath: '#{xpath}') "
|
639
|
-
end
|
140
|
+
private
|
640
141
|
|
641
|
-
def
|
642
|
-
|
643
|
-
names.each do |k, v|
|
644
|
-
results = Commune.where(district_id: district.id, k => v)
|
142
|
+
def find_url(commune)
|
143
|
+
geocode = scraper.page.at_xpath("//td[text()[contains(., '#{commune.id}')]]")
|
645
144
|
|
646
|
-
|
647
|
-
end
|
145
|
+
return if geocode.nil?
|
648
146
|
|
649
|
-
|
147
|
+
link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
|
650
148
|
|
651
|
-
if
|
652
|
-
raise DuplicateCommuneError,
|
653
|
-
"Commune '#{identifier}' was found more than once for province: '#{province.name_en}'"
|
654
|
-
end
|
149
|
+
return if link.nil?
|
655
150
|
|
656
|
-
|
151
|
+
URI.join(URL, link[:href]).to_s
|
657
152
|
end
|
658
153
|
|
659
154
|
def scraper
|
data/lib/pumi/data_source.rb
CHANGED
data/lib/pumi/geodata.rb
ADDED
data/lib/pumi/location.rb
CHANGED
data/lib/pumi/parser.rb
CHANGED
@@ -15,6 +15,7 @@ module Pumi
|
|
15
15
|
data_key: :provinces,
|
16
16
|
id_length: 2
|
17
17
|
)
|
18
|
+
|
18
19
|
DISTRICT = AdministrativeDivision.new(
|
19
20
|
type: District,
|
20
21
|
name: :district,
|
@@ -85,11 +86,17 @@ module Pumi
|
|
85
86
|
attributes.fetch("administrative_unit")
|
86
87
|
)
|
87
88
|
|
89
|
+
if attributes.key?("geodata")
|
90
|
+
geodata = Geodata.new(attributes.fetch("geodata").transform_keys(&:to_sym))
|
91
|
+
end
|
92
|
+
|
88
93
|
{
|
89
94
|
id:,
|
90
95
|
administrative_unit:,
|
91
96
|
name_km:,
|
92
97
|
name_latin:,
|
98
|
+
geodata:,
|
99
|
+
iso3166_2: attributes["iso3166_2"],
|
93
100
|
links: attributes.fetch("links", {}).transform_keys(&:to_sym),
|
94
101
|
name_en: name_latin,
|
95
102
|
full_name_km: [
|
data/lib/pumi/version.rb
CHANGED