pumi 0.19.0 → 0.20.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +8 -0
- data/bin/parse_data +2 -5
- data/data/communes.yml +9904 -0
- data/data/districts.yml +1576 -0
- data/data/provinces.yml +225 -0
- data/lib/pumi/bot/wikipedia/article.rb +13 -0
- data/lib/pumi/bot/wikipedia/communes_in_cambodia_article.rb +157 -0
- data/lib/pumi/bot/wikipedia/districts_in_cambodia_article.rb +122 -0
- data/lib/pumi/bot/wikipedia/templates/commune_list.wikitext.erb +46 -0
- data/lib/pumi/bot/wikipedia/templates/district_list.wikitext.erb +27 -0
- data/lib/pumi/bot/wikipedia.rb +10 -0
- data/lib/pumi/bot.rb +6 -0
- data/lib/pumi/data_source/geocoder.rb +251 -0
- data/lib/pumi/data_source/iso31662.rb +29 -0
- data/lib/pumi/data_source/wikipedia.rb +19 -524
- data/lib/pumi/data_source.rb +2 -0
- data/lib/pumi/geodata.rb +3 -0
- data/lib/pumi/location.rb +2 -0
- data/lib/pumi/parser.rb +7 -0
- data/lib/pumi/version.rb +1 -1
- data/lib/pumi/wikipedia/client.rb +68 -0
- data/lib/pumi/wikipedia/response.rb +15 -0
- data/lib/pumi/wikipedia.rb +7 -0
- data/lib/pumi.rb +1 -1
- data/pumi.gemspec +4 -1
- metadata +58 -4
- data/lib/pumi/scraper/result.rb +0 -5
@@ -69,13 +69,13 @@ module Pumi
|
|
69
69
|
end
|
70
70
|
|
71
71
|
def find_url(province)
|
72
|
-
td = province_table_rows.
|
72
|
+
td = province_table_rows.at_xpath("child::td[contains(., '#{province.name_km}')]")
|
73
73
|
if td.nil?
|
74
74
|
raise WebScraper::ElementNotFoundError,
|
75
75
|
"No cell containing '#{province.name_km}' was found in a table on #{URL}"
|
76
76
|
end
|
77
77
|
|
78
|
-
link = td.
|
78
|
+
link = td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
|
79
79
|
URI.join(URL, link[:href]).to_s
|
80
80
|
end
|
81
81
|
|
@@ -83,8 +83,8 @@ module Pumi
|
|
83
83
|
@province_table_rows ||= begin
|
84
84
|
sample_province = Province.all.first
|
85
85
|
|
86
|
-
sample_row = scraper.page.
|
87
|
-
if sample_row.
|
86
|
+
sample_row = scraper.page.at_xpath("//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]")
|
87
|
+
if sample_row.at_xpath("//a[text()[contains(., '#{sample_province.name_en}')]]").nil?
|
88
88
|
raise WebScraper::ElementNotFoundError,
|
89
89
|
"No link containing '#{sample_province.name_en}' was found in a table on #{URL}"
|
90
90
|
end
|
@@ -113,547 +113,42 @@ module Pumi
|
|
113
113
|
end
|
114
114
|
|
115
115
|
def find_url(district)
|
116
|
-
|
117
|
-
list_items = scraper.page.xpath("//ol/li[text()[contains(., '#{identifier}')]]")
|
116
|
+
geocode = scraper.page.at_xpath("//td[text()[contains(., '#{district.id}')]]")
|
118
117
|
|
119
|
-
return if
|
118
|
+
return if geocode.nil?
|
120
119
|
|
121
|
-
|
122
|
-
raise WebScraper::ElementNotFoundError,
|
123
|
-
"More than one element was found with the identifier '#{identifier}' on #{URL}"
|
124
|
-
end
|
120
|
+
link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
|
125
121
|
|
126
|
-
|
127
|
-
return unless link
|
122
|
+
return if link.nil?
|
128
123
|
|
129
124
|
URI.join(URL, link[:href]).to_s
|
130
125
|
end
|
131
126
|
end
|
132
127
|
|
133
128
|
class CambodianCommunesScraper
|
134
|
-
class CommuneNotFoundError < StandardError; end
|
135
|
-
class DuplicateCommuneError < StandardError; end
|
136
|
-
|
137
129
|
URL = "https://en.wikipedia.org/wiki/List_of_communes_in_Cambodia".freeze
|
138
|
-
Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
|
139
|
-
InvalidCommuneLink = Struct.new(:district_code, :name, keyword_init: true)
|
140
|
-
|
141
|
-
MISSING_LOCATIONS = [
|
142
|
-
"Taing Kouk District",
|
143
|
-
"Bokor Municipality",
|
144
|
-
"Ta Lou Senchey District",
|
145
|
-
"Kaoh Rung Municipality",
|
146
|
-
"Borei Ou Svay Senchey District"
|
147
|
-
].freeze
|
148
|
-
|
149
|
-
INVALID_COMMUNE_LINKS = [
|
150
|
-
InvalidCommuneLink.new(district_code: "0301", name: "Prasat"),
|
151
|
-
InvalidCommuneLink.new(district_code: "0302", name: "Svay Teab"),
|
152
|
-
InvalidCommuneLink.new(district_code: "0306", name: "Kokor"),
|
153
|
-
InvalidCommuneLink.new(district_code: "0306", name: "Krala"),
|
154
|
-
InvalidCommuneLink.new(district_code: "0307", name: "Angkor Ban"),
|
155
|
-
InvalidCommuneLink.new(district_code: "0307", name: "Sdau"),
|
156
|
-
InvalidCommuneLink.new(district_code: "0308", name: "Koh Sotin"),
|
157
|
-
InvalidCommuneLink.new(district_code: "0601", name: "Treal"),
|
158
|
-
InvalidCommuneLink.new(district_code: "0601", name: "Baray"),
|
159
|
-
InvalidCommuneLink.new(district_code: "0313", name: "Mean"),
|
160
|
-
InvalidCommuneLink.new(district_code: "0313", name: "Lvea"),
|
161
|
-
InvalidCommuneLink.new(district_code: "0313", name: "Prey Chor"),
|
162
|
-
InvalidCommuneLink.new(district_code: "0314", name: "Baray"),
|
163
|
-
InvalidCommuneLink.new(district_code: "0314", name: "Mean Chey"),
|
164
|
-
InvalidCommuneLink.new(district_code: "0401", name: "Ponley"),
|
165
|
-
InvalidCommuneLink.new(district_code: "0405", name: "Longveaek"),
|
166
|
-
InvalidCommuneLink.new(district_code: "0405", name: "Saeb"),
|
167
|
-
InvalidCommuneLink.new(district_code: "0406", name: "Svay Chrum"),
|
168
|
-
InvalidCommuneLink.new(district_code: "0501", name: "Basedth"),
|
169
|
-
InvalidCommuneLink.new(district_code: "0503", name: "Srang"),
|
170
|
-
InvalidCommuneLink.new(district_code: "0503", name: "Veal"),
|
171
|
-
InvalidCommuneLink.new(district_code: "0507", name: "Samraong Tong"),
|
172
|
-
InvalidCommuneLink.new(district_code: "0510", name: "Mean Chey"),
|
173
|
-
InvalidCommuneLink.new(district_code: "0510", name: "Phnum Touch"),
|
174
|
-
InvalidCommuneLink.new(district_code: "0606", name: "Chheu Teal"),
|
175
|
-
InvalidCommuneLink.new(district_code: "0606", name: "Klaeng"),
|
176
|
-
InvalidCommuneLink.new(district_code: "0606", name: "Mean Chey"),
|
177
|
-
InvalidCommuneLink.new(district_code: "0608", name: "Trea"),
|
178
|
-
InvalidCommuneLink.new(district_code: "0604", name: "Daung"),
|
179
|
-
InvalidCommuneLink.new(district_code: "0606", name: "Sandaan"),
|
180
|
-
InvalidCommuneLink.new(district_code: "0607", name: "Kokoh"),
|
181
|
-
InvalidCommuneLink.new(district_code: "0701", name: "Angkor Chey"),
|
182
|
-
InvalidCommuneLink.new(district_code: "0703", name: "Chhouk"),
|
183
|
-
InvalidCommuneLink.new(district_code: "0703", name: "Meanchey"),
|
184
|
-
InvalidCommuneLink.new(district_code: "0708", name: "Kampong Bay"),
|
185
|
-
InvalidCommuneLink.new(district_code: "0801", name: "Siem Reap"),
|
186
|
-
InvalidCommuneLink.new(district_code: "0801", name: "Trea"),
|
187
|
-
InvalidCommuneLink.new(district_code: "0802", name: "Chheu Teal"),
|
188
|
-
InvalidCommuneLink.new(district_code: "0802", name: "Kokir"),
|
189
|
-
InvalidCommuneLink.new(district_code: "0804", name: "Leuk Daek"),
|
190
|
-
InvalidCommuneLink.new(district_code: "0808", name: "Mkak"),
|
191
|
-
InvalidCommuneLink.new(district_code: "0809", name: "Ponhea Leu"),
|
192
|
-
InvalidCommuneLink.new(district_code: "0811", name: "Ta Khmau"),
|
193
|
-
InvalidCommuneLink.new(district_code: "0813", name: "Svay Chrum"),
|
194
|
-
InvalidCommuneLink.new(district_code: "0904", name: "Smach Meanchey"),
|
195
|
-
InvalidCommuneLink.new(district_code: "0906", name: "Srae Ambel"),
|
196
|
-
InvalidCommuneLink.new(district_code: "1001", name: "Chhloung"),
|
197
|
-
InvalidCommuneLink.new(district_code: "1003", name: "Preaek Prasab"),
|
198
|
-
InvalidCommuneLink.new(district_code: "1003", name: "Tamao"),
|
199
|
-
InvalidCommuneLink.new(district_code: "1004", name: "Sombo"),
|
200
|
-
InvalidCommuneLink.new(district_code: "1006", name: "Sambok"),
|
201
|
-
InvalidCommuneLink.new(district_code: "1303", name: "Choam Khsant"),
|
202
|
-
InvalidCommuneLink.new(district_code: "1304", name: "Phnom Penh"),
|
203
|
-
InvalidCommuneLink.new(district_code: "1305", name: "Ratanak"),
|
204
|
-
InvalidCommuneLink.new(district_code: "1308", name: "Pahal"),
|
205
|
-
InvalidCommuneLink.new(district_code: "1403", name: "Kampong Trabaek"),
|
206
|
-
InvalidCommuneLink.new(district_code: "1403", name: "Prey Chhor"),
|
207
|
-
InvalidCommuneLink.new(district_code: "1404", name: "Kanhchriech"),
|
208
|
-
InvalidCommuneLink.new(district_code: "1405", name: "Svay Chrum"),
|
209
|
-
InvalidCommuneLink.new(district_code: "1409", name: "Lvea"),
|
210
|
-
InvalidCommuneLink.new(district_code: "1409", name: "Preah Sdach"),
|
211
|
-
InvalidCommuneLink.new(district_code: "1410", name: "Baray"),
|
212
|
-
InvalidCommuneLink.new(district_code: "1410", name: "Kampong Leav"),
|
213
|
-
InvalidCommuneLink.new(district_code: "1411", name: "Takor"),
|
214
|
-
InvalidCommuneLink.new(district_code: "1502", name: "Anlong Vil"),
|
215
|
-
InvalidCommuneLink.new(district_code: "1502", name: "Kandieng"),
|
216
|
-
InvalidCommuneLink.new(district_code: "1502", name: "Sya"),
|
217
|
-
InvalidCommuneLink.new(district_code: "1502", name: "Veal"),
|
218
|
-
InvalidCommuneLink.new(district_code: "1604", name: "Teun"),
|
219
|
-
InvalidCommuneLink.new(district_code: "1606", name: "Poy"),
|
220
|
-
InvalidCommuneLink.new(district_code: "1607", name: "Sesan"),
|
221
|
-
InvalidCommuneLink.new(district_code: "1607", name: "Yatung"),
|
222
|
-
InvalidCommuneLink.new(district_code: "1609", name: "Pong"),
|
223
|
-
InvalidCommuneLink.new(district_code: "1609", name: "Veun Sai"),
|
224
|
-
InvalidCommuneLink.new(district_code: "1701", name: "Koal"),
|
225
|
-
InvalidCommuneLink.new(district_code: "1702", name: "Svay Chek"),
|
226
|
-
InvalidCommuneLink.new(district_code: "1704", name: "Chi Kraeng"),
|
227
|
-
InvalidCommuneLink.new(district_code: "1704", name: "Kampong Kdei"),
|
228
|
-
InvalidCommuneLink.new(district_code: "1706", name: "Kralanh"),
|
229
|
-
InvalidCommuneLink.new(district_code: "1706", name: "Sen Sok"),
|
230
|
-
InvalidCommuneLink.new(district_code: "1707", name: "Lvea"),
|
231
|
-
InvalidCommuneLink.new(district_code: "1707", name: "Reul"),
|
232
|
-
InvalidCommuneLink.new(district_code: "1709", name: "Bakong"),
|
233
|
-
InvalidCommuneLink.new(district_code: "1709", name: "Meanchey"),
|
234
|
-
InvalidCommuneLink.new(district_code: "1710", name: "Nokor Thom"),
|
235
|
-
InvalidCommuneLink.new(district_code: "1711", name: "Popel"),
|
236
|
-
InvalidCommuneLink.new(district_code: "1713", name: "Svay Leu"),
|
237
|
-
InvalidCommuneLink.new(district_code: "1801", name: "Koh Rong"),
|
238
|
-
InvalidCommuneLink.new(district_code: "1802", name: "Ou Chrov"),
|
239
|
-
InvalidCommuneLink.new(district_code: "1802", name: "Prey Nob"),
|
240
|
-
InvalidCommuneLink.new(district_code: "1802", name: "Ream"),
|
241
|
-
InvalidCommuneLink.new(district_code: "1804", name: "Kampong Seila"),
|
242
|
-
InvalidCommuneLink.new(district_code: "1901", name: "Sdau"),
|
243
|
-
InvalidCommuneLink.new(district_code: "1902", name: "Siem Bouk"),
|
244
|
-
InvalidCommuneLink.new(district_code: "1903", name: "Sekong"),
|
245
|
-
InvalidCommuneLink.new(district_code: "2001", name: "Chantrea"),
|
246
|
-
InvalidCommuneLink.new(district_code: "2002", name: "Preah Ponlea"),
|
247
|
-
InvalidCommuneLink.new(district_code: "2003", name: "Svay Chek"),
|
248
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Ampel"),
|
249
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Daung"),
|
250
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Kampong Trach"),
|
251
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Kokir"),
|
252
|
-
InvalidCommuneLink.new(district_code: "2004", name: "Krasang"),
|
253
|
-
InvalidCommuneLink.new(district_code: "2005", name: "Bassak"),
|
254
|
-
InvalidCommuneLink.new(district_code: "2005", name: "Chheu Teal"),
|
255
|
-
InvalidCommuneLink.new(district_code: "2005", name: "Svay Chrum"),
|
256
|
-
InvalidCommuneLink.new(district_code: "2008", name: "Bavet"),
|
257
|
-
InvalidCommuneLink.new(district_code: "2008", name: "Prasat"),
|
258
|
-
InvalidCommuneLink.new(district_code: "2201", name: "Anlong Veaeng"),
|
259
|
-
InvalidCommuneLink.new(district_code: "2401", name: "Pailin"),
|
260
|
-
InvalidCommuneLink.new(district_code: "2502", name: "Chhouk"),
|
261
|
-
InvalidCommuneLink.new(district_code: "2502", name: "Trea"),
|
262
|
-
InvalidCommuneLink.new(district_code: "2503", name: "Memot"),
|
263
|
-
InvalidCommuneLink.new(district_code: "2503", name: "Kokir"),
|
264
|
-
InvalidCommuneLink.new(district_code: "2504", name: "Chork"),
|
265
|
-
InvalidCommuneLink.new(district_code: "2504", name: "Mean"),
|
266
|
-
InvalidCommuneLink.new(district_code: "2505", name: "Popel"),
|
267
|
-
InvalidCommuneLink.new(district_code: "2507", name: "Chikor")
|
268
|
-
].freeze
|
269
|
-
|
270
|
-
MISSPELLINGS = [
|
271
|
-
Misspelling.new(incorrect_text: "Kratié Province", correct_text: "Kratie Province"),
|
272
|
-
Misspelling.new(
|
273
|
-
incorrect_text: "Mondulkiri Province",
|
274
|
-
correct_text: "Mondul Kiri Province"
|
275
|
-
),
|
276
|
-
Misspelling.new(
|
277
|
-
incorrect_text: "Ratanakiri Province",
|
278
|
-
correct_text: "Ratanak Kiri Province"
|
279
|
-
),
|
280
|
-
Misspelling.new(
|
281
|
-
incorrect_text: "Siem Reap Province",
|
282
|
-
correct_text: "Siemreap Province"
|
283
|
-
),
|
284
|
-
Misspelling.new(
|
285
|
-
incorrect_text: "Serei Saophoan District",
|
286
|
-
correct_text: "Serei Saophoan Municipality"
|
287
|
-
),
|
288
|
-
Misspelling.new(
|
289
|
-
incorrect_text: "Poipet Municipality",
|
290
|
-
correct_text: "Paoy Paet Municipality"
|
291
|
-
),
|
292
|
-
Misspelling.new(
|
293
|
-
incorrect_text: "Battambang District",
|
294
|
-
correct_text: "Battambang Municipality"
|
295
|
-
),
|
296
|
-
Misspelling.new(
|
297
|
-
incorrect_text: "Rotanak Mondol District",
|
298
|
-
correct_text: "Rotonak Mondol District"
|
299
|
-
),
|
300
|
-
Misspelling.new(
|
301
|
-
incorrect_text: "Sampov Loun District",
|
302
|
-
correct_text: "Sampov Lun District"
|
303
|
-
),
|
304
|
-
Misspelling.new(
|
305
|
-
incorrect_text: "Koh Kralor District",
|
306
|
-
correct_text: "Koas Krala District"
|
307
|
-
),
|
308
|
-
Misspelling.new(
|
309
|
-
incorrect_text: "Rukhak Kiri District",
|
310
|
-
correct_text: "Rukh Kiri District"
|
311
|
-
),
|
312
|
-
Misspelling.new(
|
313
|
-
incorrect_text: "Koh Sotin District",
|
314
|
-
correct_text: "Kaoh Soutin District"
|
315
|
-
),
|
316
|
-
Misspelling.new(
|
317
|
-
incorrect_text: "Srey Santhor District",
|
318
|
-
correct_text: "Srei Santhor District"
|
319
|
-
),
|
320
|
-
Misspelling.new(
|
321
|
-
incorrect_text: "Kong Pisey",
|
322
|
-
correct_text: "Kong Pisei District"
|
323
|
-
),
|
324
|
-
Misspelling.new(
|
325
|
-
incorrect_text: "Phnom Sruoch District",
|
326
|
-
correct_text: "Phnum Sruoch District"
|
327
|
-
),
|
328
|
-
Misspelling.new(
|
329
|
-
incorrect_text: "Stueng Saen District",
|
330
|
-
correct_text: "Stueng Saen Municipality"
|
331
|
-
),
|
332
|
-
Misspelling.new(
|
333
|
-
incorrect_text: "Prasat Balangk District",
|
334
|
-
correct_text: "Prasat Ballangk District"
|
335
|
-
),
|
336
|
-
Misspelling.new(
|
337
|
-
incorrect_text: "Kampot District",
|
338
|
-
correct_text: "Kampot Municipality"
|
339
|
-
),
|
340
|
-
Misspelling.new(
|
341
|
-
incorrect_text: "Kampot District",
|
342
|
-
correct_text: "Kampot Municipality"
|
343
|
-
),
|
344
|
-
Misspelling.new(
|
345
|
-
incorrect_text: "Koh Thum District",
|
346
|
-
correct_text: "Kaoh Thum District"
|
347
|
-
),
|
348
|
-
Misspelling.new(
|
349
|
-
incorrect_text: "Mukh Kamphool District",
|
350
|
-
correct_text: "Mukh Kampul District"
|
351
|
-
),
|
352
|
-
Misspelling.new(
|
353
|
-
incorrect_text: "Ponhea Leu District",
|
354
|
-
correct_text: "Ponhea Lueu District"
|
355
|
-
),
|
356
|
-
Misspelling.new(
|
357
|
-
incorrect_text: "Kiri Sakor",
|
358
|
-
correct_text: "Kiri Sakor District"
|
359
|
-
),
|
360
|
-
Misspelling.new(
|
361
|
-
incorrect_text: "Koh Kong",
|
362
|
-
correct_text: "Kaoh Kong District"
|
363
|
-
),
|
364
|
-
Misspelling.new(
|
365
|
-
incorrect_text: "Khemara Phoumin",
|
366
|
-
correct_text: "Khemara Phoumin Municipality"
|
367
|
-
),
|
368
|
-
Misspelling.new(
|
369
|
-
incorrect_text: "Mondol Seima",
|
370
|
-
correct_text: "Mondol Seima District"
|
371
|
-
),
|
372
|
-
Misspelling.new(
|
373
|
-
incorrect_text: "Srae Ambel",
|
374
|
-
correct_text: "Srae Ambel District"
|
375
|
-
),
|
376
|
-
Misspelling.new(
|
377
|
-
incorrect_text: "Thma Bang",
|
378
|
-
correct_text: "Thma Bang District"
|
379
|
-
),
|
380
|
-
Misspelling.new(
|
381
|
-
incorrect_text: "Kratie Municipality",
|
382
|
-
correct_text: "Kracheh Municipality"
|
383
|
-
),
|
384
|
-
Misspelling.new(
|
385
|
-
incorrect_text: "Preaek Prasab District",
|
386
|
-
correct_text: "Prek Prasab District"
|
387
|
-
),
|
388
|
-
Misspelling.new(
|
389
|
-
incorrect_text: "Krong Saen Monorom",
|
390
|
-
correct_text: "Saen Monourom Municipality"
|
391
|
-
),
|
392
|
-
Misspelling.new(
|
393
|
-
incorrect_text: "Khan Daun Penh",
|
394
|
-
correct_text: "Doun Penh Section"
|
395
|
-
),
|
396
|
-
Misspelling.new(
|
397
|
-
incorrect_text: "Khan Prampir Makara",
|
398
|
-
correct_text: "Prampir Meakkakra Section"
|
399
|
-
),
|
400
|
-
Misspelling.new(
|
401
|
-
incorrect_text: "Khan Meanchey",
|
402
|
-
correct_text: "Mean Chey Section"
|
403
|
-
),
|
404
|
-
Misspelling.new(
|
405
|
-
incorrect_text: "Khan Sen Sok",
|
406
|
-
correct_text: "Saensokh Section"
|
407
|
-
),
|
408
|
-
Misspelling.new(
|
409
|
-
incorrect_text: "Khan Por Sen Chey",
|
410
|
-
correct_text: "Pur SenChey Section"
|
411
|
-
),
|
412
|
-
Misspelling.new(
|
413
|
-
incorrect_text: "Khan Chrouy Changvar",
|
414
|
-
correct_text: "Chraoy Chongvar Section"
|
415
|
-
),
|
416
|
-
Misspelling.new(
|
417
|
-
incorrect_text: "Khan Prek Phnov",
|
418
|
-
correct_text: "Praek Pnov Section"
|
419
|
-
),
|
420
|
-
Misspelling.new(
|
421
|
-
incorrect_text: "Choam Khsant",
|
422
|
-
correct_text: "Choam Ksant District"
|
423
|
-
),
|
424
|
-
Misspelling.new(
|
425
|
-
incorrect_text: "Kulen",
|
426
|
-
correct_text: "Kuleaen District"
|
427
|
-
),
|
428
|
-
Misspelling.new(
|
429
|
-
incorrect_text: "Sangkom Thmei",
|
430
|
-
correct_text: "Sangkum Thmei District"
|
431
|
-
),
|
432
|
-
Misspelling.new(
|
433
|
-
incorrect_text: "Prey Veaeng",
|
434
|
-
correct_text: "Prey Veng Municipality"
|
435
|
-
),
|
436
|
-
Misspelling.new(
|
437
|
-
incorrect_text: "Por Reang",
|
438
|
-
correct_text: "Pur Rieng District"
|
439
|
-
),
|
440
|
-
Misspelling.new(
|
441
|
-
incorrect_text: "Veal Veng",
|
442
|
-
correct_text: "Veal Veaeng District"
|
443
|
-
),
|
444
|
-
Misspelling.new(
|
445
|
-
incorrect_text: "Krong Banlung",
|
446
|
-
correct_text: "Ban Lung Municipality"
|
447
|
-
),
|
448
|
-
Misspelling.new(
|
449
|
-
incorrect_text: "Angkor Thom",
|
450
|
-
correct_text: "Angkor Thum District"
|
451
|
-
),
|
452
|
-
Misspelling.new(
|
453
|
-
incorrect_text: "Sout Nikom",
|
454
|
-
correct_text: "Soutr Nikom District"
|
455
|
-
),
|
456
|
-
Misspelling.new(
|
457
|
-
incorrect_text: "Steung Hav",
|
458
|
-
correct_text: "Stueng Hav District"
|
459
|
-
),
|
460
|
-
Misspelling.new(
|
461
|
-
incorrect_text: "Krong Stung Treng",
|
462
|
-
correct_text: "Stueng Traeng Municipality"
|
463
|
-
),
|
464
|
-
Misspelling.new(
|
465
|
-
incorrect_text: "Bourei Cholsar District",
|
466
|
-
correct_text: "Borei Cholsar District"
|
467
|
-
),
|
468
|
-
Misspelling.new(
|
469
|
-
incorrect_text: "Damnak Chang'Eur",
|
470
|
-
correct_text: "Damnak Chang'aeur District"
|
471
|
-
),
|
472
|
-
Misspelling.new(
|
473
|
-
incorrect_text: "Krong Keb",
|
474
|
-
correct_text: "Kaeb Municipality"
|
475
|
-
),
|
476
|
-
Misspelling.new(
|
477
|
-
incorrect_text: "Sala Krao",
|
478
|
-
correct_text: "Sala Krau District"
|
479
|
-
),
|
480
|
-
Misspelling.new(
|
481
|
-
incorrect_text: "Dombae",
|
482
|
-
correct_text: "Dambae District"
|
483
|
-
),
|
484
|
-
Misspelling.new(
|
485
|
-
incorrect_text: "Krouch Chhma",
|
486
|
-
correct_text: "Krouch Chhmar District"
|
487
|
-
),
|
488
|
-
Misspelling.new(incorrect_text: "Paoy Char", correct_text: "Poy Char"),
|
489
|
-
Misspelling.new(incorrect_text: "Phnom Dei", correct_text: "Phnum Dei"),
|
490
|
-
Misspelling.new(incorrect_text: "Spean Sraeng Rouk", correct_text: "Spean Sraeng"),
|
491
|
-
Misspelling.new(incorrect_text: "Chhnuor", correct_text: "Chnuor Mean Chey"),
|
492
|
-
Misspelling.new(incorrect_text: "Chob", correct_text: "Chob Vari"),
|
493
|
-
Misspelling.new(incorrect_text: "Prasat Char", correct_text: "Prasat"),
|
494
|
-
Misspelling.new(incorrect_text: "Preah Netr Preah", correct_text: "Preak Netr Preah"),
|
495
|
-
Misspelling.new(incorrect_text: "Rohal Rohal", correct_text: "Rohal"),
|
496
|
-
Misspelling.new(incorrect_text: "Tuek Chour Smach", correct_text: "Tuek Chour"),
|
497
|
-
Misspelling.new(incorrect_text: "Ou Bei Choan", correct_text: "Ou Beichoan"),
|
498
|
-
Misspelling.new(incorrect_text: "Ou Sampor", correct_text: "Ou Sampoar"),
|
499
|
-
Misspelling.new(incorrect_text: "Poipet", correct_text: "Paoy Paet"),
|
500
|
-
Misspelling.new(incorrect_text: "Tuol Ta Aek", correct_text: "Tuol Ta Ek"),
|
501
|
-
Misspelling.new(incorrect_text: "Preaek Preah Sdach", correct_text: "Tuol Ta Ek"),
|
502
|
-
Misspelling.new(incorrect_text: "Chamkar Samraong", correct_text: "Chomkar Somraong"),
|
503
|
-
Misspelling.new(incorrect_text: "Sla Kaet", correct_text: "Sla Ket"),
|
504
|
-
Misspelling.new(incorrect_text: "Ou Mal", correct_text: "OMal"),
|
505
|
-
Misspelling.new(incorrect_text: "Voat Kor", correct_text: "wat Kor"),
|
506
|
-
Misspelling.new(incorrect_text: "Svay Pao", correct_text: "Svay Por"),
|
507
|
-
Misspelling.new(incorrect_text: "Kdol Tahen", correct_text: "Kdol Ta Haen"),
|
508
|
-
Misspelling.new(incorrect_text: "Kaoh Chiveang Thvang", correct_text: "Kaoh Chiveang"),
|
509
|
-
Misspelling.new(incorrect_text: "Voat Ta Muem", correct_text: "Vaot Ta Muem"),
|
510
|
-
Misspelling.new(incorrect_text: "Ou Samrel", correct_text: "Ou Samril"),
|
511
|
-
Misspelling.new(incorrect_text: "Ta Krai", correct_text: "Ta Krei"),
|
512
|
-
Misspelling.new(incorrect_text: "Prek Chik", correct_text: "Preaek Chik"),
|
513
|
-
Misspelling.new(incorrect_text: "Prey Chor", correct_text: "Prey Chhor"),
|
514
|
-
Misspelling.new(incorrect_text: "Samraong Tong", correct_text: "Samrong Tong"),
|
515
|
-
Misspelling.new(incorrect_text: "Phnum Touch", correct_text: "Phnom Touch"),
|
516
|
-
Misspelling.new(incorrect_text: "Tonle Bassac", correct_text: "Tonle Basak"),
|
517
|
-
Misspelling.new(incorrect_text: "Chey Chumneas", correct_text: "Chey Chummeah"),
|
518
|
-
Misspelling.new(incorrect_text: "Boeung Prolit", correct_text: "Boeng Proluet"),
|
519
|
-
Misspelling.new(incorrect_text: "Tuol Svay Prey 2",
|
520
|
-
correct_text: "Tuol Svay Prey Ti Pir"),
|
521
|
-
Misspelling.new(incorrect_text: "Neak Leung", correct_text: "Neak Loeang"),
|
522
|
-
Misspelling.new(incorrect_text: "Kratie", correct_text: "Kracheh"),
|
523
|
-
Misspelling.new(incorrect_text: "Pate", correct_text: "Pa Te"),
|
524
|
-
Misspelling.new(incorrect_text: "Prek Phtoul Commune", correct_text: "Preaek Phtoul"),
|
525
|
-
Misspelling.new(incorrect_text: "Bourei Cholsar Commune", correct_text: "Borei Cholsar"),
|
526
|
-
Misspelling.new(
|
527
|
-
incorrect_text: "Kampeaeng Commune (Kiri Vong District)",
|
528
|
-
correct_text: "Kampeaeng"
|
529
|
-
),
|
530
|
-
Misspelling.new(
|
531
|
-
incorrect_text: "Prey Rumdeng Commune (Kiri Vong District)",
|
532
|
-
correct_text: "Prey Rumdeng"
|
533
|
-
),
|
534
|
-
Misspelling.new(incorrect_text: "Ta Our Commune", correct_text: "Ta Ou"),
|
535
|
-
Misspelling.new(incorrect_text: "Kompaeng Commune", correct_text: "Kampeaeng"),
|
536
|
-
Misspelling.new(incorrect_text: "Kompong Reab Commune", correct_text: "Kampong Reab"),
|
537
|
-
Misspelling.new(incorrect_text: "Our Saray Commune", correct_text: "Ou Saray"),
|
538
|
-
Misspelling.new(
|
539
|
-
incorrect_text: "Trapeang Kranhung Commune",
|
540
|
-
correct_text: "Trapeang Kranhoung"
|
541
|
-
),
|
542
|
-
Misspelling.new(incorrect_text: "Angk Kev Commune", correct_text: "Angk Kaev"),
|
543
|
-
Misspelling.new(incorrect_text: "Sanlong Commune", correct_text: "Sanlung"),
|
544
|
-
Misspelling.new(incorrect_text: "O Smach", correct_text: "Ou Smach")
|
545
|
-
].freeze
|
546
130
|
|
547
131
|
def scrape!
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
province_section = find_section(
|
552
|
-
text: district.province.address_en,
|
553
|
-
section: scraper.page,
|
554
|
-
xpath_pattern: "//h2//a[text()[contains(., \"%<text>s\")]]"
|
555
|
-
).xpath("ancestor::h2/following-sibling::div").first
|
556
|
-
|
557
|
-
district_title = find_section(
|
558
|
-
text: [district.full_name_en, district.full_name_latin, district.name_latin],
|
559
|
-
section: province_section,
|
560
|
-
xpath_pattern: "child::table//tr/td//h3//*[text()[contains(., \"%<text>s\")]]"
|
561
|
-
)
|
562
|
-
|
563
|
-
next unless district_title
|
564
|
-
|
565
|
-
district_section = district_title.xpath("ancestor::h3/following-sibling::*").first
|
566
|
-
commune_links = district_section.xpath("child::li//a[contains(@href, '/wiki/')]")
|
567
|
-
|
568
|
-
commune_links.each do |link|
|
569
|
-
invalid_commune_link = find_invalid_commune_link(district:, text: link.text)
|
570
|
-
|
571
|
-
next if invalid_commune_link
|
572
|
-
|
573
|
-
commune = begin
|
574
|
-
find_commune(
|
575
|
-
district:,
|
576
|
-
names: {
|
577
|
-
name_latin: link.text,
|
578
|
-
full_name_en: link.text,
|
579
|
-
full_name_latin: link.text
|
580
|
-
}
|
581
|
-
)
|
582
|
-
rescue CommuneNotFoundError => e
|
583
|
-
misspelling = MISSPELLINGS.find do |m|
|
584
|
-
m.incorrect_text == link.text
|
585
|
-
end
|
586
|
-
|
587
|
-
raise(e) unless misspelling
|
588
|
-
|
589
|
-
find_commune(district:, names: { name_latin: misspelling.correct_text })
|
590
|
-
end
|
591
|
-
|
592
|
-
result << ScraperResult.new(code: commune.id,
|
593
|
-
wikipedia: URI.join(
|
594
|
-
URL, link[:href]
|
595
|
-
).to_s)
|
596
|
-
end
|
597
|
-
end
|
598
|
-
|
599
|
-
result
|
600
|
-
end
|
601
|
-
|
602
|
-
private
|
132
|
+
Commune.all.each_with_object([]) do |commune, result|
|
133
|
+
url = find_url(commune)
|
134
|
+
next if url.nil?
|
603
135
|
|
604
|
-
|
605
|
-
|
606
|
-
def find_invalid_commune_link(district:, text:)
|
607
|
-
INVALID_COMMUNE_LINKS.find do |c|
|
608
|
-
c.district_code == district.id && c.name == text
|
136
|
+
result << ScraperResult.new(code: commune.id, wikipedia: url)
|
609
137
|
end
|
610
138
|
end
|
611
139
|
|
612
|
-
|
613
|
-
texts = Array(text)
|
614
|
-
default_text = texts.first
|
615
|
-
texts.each do |t|
|
616
|
-
return find_link(text: t, section:, xpath_pattern:)
|
617
|
-
rescue WebScraper::ElementNotFoundError => e
|
618
|
-
raise(e) if t == texts.last
|
619
|
-
end
|
620
|
-
rescue WebScraper::ElementNotFoundError => e
|
621
|
-
misspelling = MISSPELLINGS.find do |m|
|
622
|
-
m.correct_text == default_text
|
623
|
-
end
|
624
|
-
|
625
|
-
return if !misspelling && MISSING_LOCATIONS.include?(default_text)
|
626
|
-
raise(e) unless misspelling
|
627
|
-
|
628
|
-
find_link(text: misspelling.incorrect_text, section:, xpath_pattern:)
|
629
|
-
end
|
630
|
-
|
631
|
-
def find_link(text:, section:, xpath_pattern:)
|
632
|
-
xpath = format(xpath_pattern, text:)
|
633
|
-
result = section.xpath(xpath)
|
634
|
-
|
635
|
-
return result.first if result.size == 1
|
636
|
-
|
637
|
-
raise WebScraper::ElementNotFoundError,
|
638
|
-
"No link or many links found on #{URL} (xpath: '#{xpath}') "
|
639
|
-
end
|
140
|
+
private
|
640
141
|
|
641
|
-
def
|
642
|
-
|
643
|
-
names.each do |k, v|
|
644
|
-
results = Commune.where(district_id: district.id, k => v)
|
142
|
+
def find_url(commune)
|
143
|
+
geocode = scraper.page.at_xpath("//td[text()[contains(., '#{commune.id}')]]")
|
645
144
|
|
646
|
-
|
647
|
-
end
|
145
|
+
return if geocode.nil?
|
648
146
|
|
649
|
-
|
147
|
+
link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
|
650
148
|
|
651
|
-
if
|
652
|
-
raise DuplicateCommuneError,
|
653
|
-
"Commune '#{identifier}' was found more than once for province: '#{province.name_en}'"
|
654
|
-
end
|
149
|
+
return if link.nil?
|
655
150
|
|
656
|
-
|
151
|
+
URI.join(URL, link[:href]).to_s
|
657
152
|
end
|
658
153
|
|
659
154
|
def scraper
|
data/lib/pumi/data_source.rb
CHANGED
data/lib/pumi/geodata.rb
ADDED
data/lib/pumi/location.rb
CHANGED
data/lib/pumi/parser.rb
CHANGED
@@ -15,6 +15,7 @@ module Pumi
|
|
15
15
|
data_key: :provinces,
|
16
16
|
id_length: 2
|
17
17
|
)
|
18
|
+
|
18
19
|
DISTRICT = AdministrativeDivision.new(
|
19
20
|
type: District,
|
20
21
|
name: :district,
|
@@ -85,11 +86,17 @@ module Pumi
|
|
85
86
|
attributes.fetch("administrative_unit")
|
86
87
|
)
|
87
88
|
|
89
|
+
if attributes.key?("geodata")
|
90
|
+
geodata = Geodata.new(attributes.fetch("geodata").transform_keys(&:to_sym))
|
91
|
+
end
|
92
|
+
|
88
93
|
{
|
89
94
|
id:,
|
90
95
|
administrative_unit:,
|
91
96
|
name_km:,
|
92
97
|
name_latin:,
|
98
|
+
geodata:,
|
99
|
+
iso3166_2: attributes["iso3166_2"],
|
93
100
|
links: attributes.fetch("links", {}).transform_keys(&:to_sym),
|
94
101
|
name_en: name_latin,
|
95
102
|
full_name_km: [
|
data/lib/pumi/version.rb
CHANGED