pumi 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -69,13 +69,13 @@ module Pumi
69
69
  end
70
70
 
71
71
  def find_url(province)
72
- td = province_table_rows.xpath("child::td[contains(., '#{province.name_km}')]").first
72
+ td = province_table_rows.at_xpath("child::td[contains(., '#{province.name_km}')]")
73
73
  if td.nil?
74
74
  raise WebScraper::ElementNotFoundError,
75
75
  "No cell containing '#{province.name_km}' was found in a table on #{URL}"
76
76
  end
77
77
 
78
- link = td.xpath("preceding-sibling::td/a").first
78
+ link = td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
79
79
  URI.join(URL, link[:href]).to_s
80
80
  end
81
81
 
@@ -83,8 +83,8 @@ module Pumi
83
83
  @province_table_rows ||= begin
84
84
  sample_province = Province.all.first
85
85
 
86
- sample_row = scraper.page.xpath("//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]").first
87
- if sample_row.xpath("//a[text()[contains(., '#{sample_province.name_en}')]]").empty?
86
+ sample_row = scraper.page.at_xpath("//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]")
87
+ if sample_row.at_xpath("//a[text()[contains(., '#{sample_province.name_en}')]]").nil?
88
88
  raise WebScraper::ElementNotFoundError,
89
89
  "No link containing '#{sample_province.name_en}' was found in a table on #{URL}"
90
90
  end
@@ -113,547 +113,42 @@ module Pumi
113
113
  end
114
114
 
115
115
  def find_url(district)
116
- identifier = district.id.chars.each_slice(2).map(&:join).join("-")
117
- list_items = scraper.page.xpath("//ol/li[text()[contains(., '#{identifier}')]]")
116
+ geocode = scraper.page.at_xpath("//td[text()[contains(., '#{district.id}')]]")
118
117
 
119
- return if list_items.empty?
118
+ return if geocode.nil?
120
119
 
121
- if list_items.size > 1
122
- raise WebScraper::ElementNotFoundError,
123
- "More than one element was found with the identifier '#{identifier}' on #{URL}"
124
- end
120
+ link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
125
121
 
126
- link = list_items.first.xpath("child::a[contains(@href, '/wiki/')]").first
127
- return unless link
122
+ return if link.nil?
128
123
 
129
124
  URI.join(URL, link[:href]).to_s
130
125
  end
131
126
  end
132
127
 
133
128
  class CambodianCommunesScraper
134
- class CommuneNotFoundError < StandardError; end
135
- class DuplicateCommuneError < StandardError; end
136
-
137
129
  URL = "https://en.wikipedia.org/wiki/List_of_communes_in_Cambodia".freeze
138
- Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
139
- InvalidCommuneLink = Struct.new(:district_code, :name, keyword_init: true)
140
-
141
- MISSING_LOCATIONS = [
142
- "Taing Kouk District",
143
- "Bokor Municipality",
144
- "Ta Lou Senchey District",
145
- "Kaoh Rung Municipality",
146
- "Borei Ou Svay Senchey District"
147
- ].freeze
148
-
149
- INVALID_COMMUNE_LINKS = [
150
- InvalidCommuneLink.new(district_code: "0301", name: "Prasat"),
151
- InvalidCommuneLink.new(district_code: "0302", name: "Svay Teab"),
152
- InvalidCommuneLink.new(district_code: "0306", name: "Kokor"),
153
- InvalidCommuneLink.new(district_code: "0306", name: "Krala"),
154
- InvalidCommuneLink.new(district_code: "0307", name: "Angkor Ban"),
155
- InvalidCommuneLink.new(district_code: "0307", name: "Sdau"),
156
- InvalidCommuneLink.new(district_code: "0308", name: "Koh Sotin"),
157
- InvalidCommuneLink.new(district_code: "0601", name: "Treal"),
158
- InvalidCommuneLink.new(district_code: "0601", name: "Baray"),
159
- InvalidCommuneLink.new(district_code: "0313", name: "Mean"),
160
- InvalidCommuneLink.new(district_code: "0313", name: "Lvea"),
161
- InvalidCommuneLink.new(district_code: "0313", name: "Prey Chor"),
162
- InvalidCommuneLink.new(district_code: "0314", name: "Baray"),
163
- InvalidCommuneLink.new(district_code: "0314", name: "Mean Chey"),
164
- InvalidCommuneLink.new(district_code: "0401", name: "Ponley"),
165
- InvalidCommuneLink.new(district_code: "0405", name: "Longveaek"),
166
- InvalidCommuneLink.new(district_code: "0405", name: "Saeb"),
167
- InvalidCommuneLink.new(district_code: "0406", name: "Svay Chrum"),
168
- InvalidCommuneLink.new(district_code: "0501", name: "Basedth"),
169
- InvalidCommuneLink.new(district_code: "0503", name: "Srang"),
170
- InvalidCommuneLink.new(district_code: "0503", name: "Veal"),
171
- InvalidCommuneLink.new(district_code: "0507", name: "Samraong Tong"),
172
- InvalidCommuneLink.new(district_code: "0510", name: "Mean Chey"),
173
- InvalidCommuneLink.new(district_code: "0510", name: "Phnum Touch"),
174
- InvalidCommuneLink.new(district_code: "0606", name: "Chheu Teal"),
175
- InvalidCommuneLink.new(district_code: "0606", name: "Klaeng"),
176
- InvalidCommuneLink.new(district_code: "0606", name: "Mean Chey"),
177
- InvalidCommuneLink.new(district_code: "0608", name: "Trea"),
178
- InvalidCommuneLink.new(district_code: "0604", name: "Daung"),
179
- InvalidCommuneLink.new(district_code: "0606", name: "Sandaan"),
180
- InvalidCommuneLink.new(district_code: "0607", name: "Kokoh"),
181
- InvalidCommuneLink.new(district_code: "0701", name: "Angkor Chey"),
182
- InvalidCommuneLink.new(district_code: "0703", name: "Chhouk"),
183
- InvalidCommuneLink.new(district_code: "0703", name: "Meanchey"),
184
- InvalidCommuneLink.new(district_code: "0708", name: "Kampong Bay"),
185
- InvalidCommuneLink.new(district_code: "0801", name: "Siem Reap"),
186
- InvalidCommuneLink.new(district_code: "0801", name: "Trea"),
187
- InvalidCommuneLink.new(district_code: "0802", name: "Chheu Teal"),
188
- InvalidCommuneLink.new(district_code: "0802", name: "Kokir"),
189
- InvalidCommuneLink.new(district_code: "0804", name: "Leuk Daek"),
190
- InvalidCommuneLink.new(district_code: "0808", name: "Mkak"),
191
- InvalidCommuneLink.new(district_code: "0809", name: "Ponhea Leu"),
192
- InvalidCommuneLink.new(district_code: "0811", name: "Ta Khmau"),
193
- InvalidCommuneLink.new(district_code: "0813", name: "Svay Chrum"),
194
- InvalidCommuneLink.new(district_code: "0904", name: "Smach Meanchey"),
195
- InvalidCommuneLink.new(district_code: "0906", name: "Srae Ambel"),
196
- InvalidCommuneLink.new(district_code: "1001", name: "Chhloung"),
197
- InvalidCommuneLink.new(district_code: "1003", name: "Preaek Prasab"),
198
- InvalidCommuneLink.new(district_code: "1003", name: "Tamao"),
199
- InvalidCommuneLink.new(district_code: "1004", name: "Sombo"),
200
- InvalidCommuneLink.new(district_code: "1006", name: "Sambok"),
201
- InvalidCommuneLink.new(district_code: "1303", name: "Choam Khsant"),
202
- InvalidCommuneLink.new(district_code: "1304", name: "Phnom Penh"),
203
- InvalidCommuneLink.new(district_code: "1305", name: "Ratanak"),
204
- InvalidCommuneLink.new(district_code: "1308", name: "Pahal"),
205
- InvalidCommuneLink.new(district_code: "1403", name: "Kampong Trabaek"),
206
- InvalidCommuneLink.new(district_code: "1403", name: "Prey Chhor"),
207
- InvalidCommuneLink.new(district_code: "1404", name: "Kanhchriech"),
208
- InvalidCommuneLink.new(district_code: "1405", name: "Svay Chrum"),
209
- InvalidCommuneLink.new(district_code: "1409", name: "Lvea"),
210
- InvalidCommuneLink.new(district_code: "1409", name: "Preah Sdach"),
211
- InvalidCommuneLink.new(district_code: "1410", name: "Baray"),
212
- InvalidCommuneLink.new(district_code: "1410", name: "Kampong Leav"),
213
- InvalidCommuneLink.new(district_code: "1411", name: "Takor"),
214
- InvalidCommuneLink.new(district_code: "1502", name: "Anlong Vil"),
215
- InvalidCommuneLink.new(district_code: "1502", name: "Kandieng"),
216
- InvalidCommuneLink.new(district_code: "1502", name: "Sya"),
217
- InvalidCommuneLink.new(district_code: "1502", name: "Veal"),
218
- InvalidCommuneLink.new(district_code: "1604", name: "Teun"),
219
- InvalidCommuneLink.new(district_code: "1606", name: "Poy"),
220
- InvalidCommuneLink.new(district_code: "1607", name: "Sesan"),
221
- InvalidCommuneLink.new(district_code: "1607", name: "Yatung"),
222
- InvalidCommuneLink.new(district_code: "1609", name: "Pong"),
223
- InvalidCommuneLink.new(district_code: "1609", name: "Veun Sai"),
224
- InvalidCommuneLink.new(district_code: "1701", name: "Koal"),
225
- InvalidCommuneLink.new(district_code: "1702", name: "Svay Chek"),
226
- InvalidCommuneLink.new(district_code: "1704", name: "Chi Kraeng"),
227
- InvalidCommuneLink.new(district_code: "1704", name: "Kampong Kdei"),
228
- InvalidCommuneLink.new(district_code: "1706", name: "Kralanh"),
229
- InvalidCommuneLink.new(district_code: "1706", name: "Sen Sok"),
230
- InvalidCommuneLink.new(district_code: "1707", name: "Lvea"),
231
- InvalidCommuneLink.new(district_code: "1707", name: "Reul"),
232
- InvalidCommuneLink.new(district_code: "1709", name: "Bakong"),
233
- InvalidCommuneLink.new(district_code: "1709", name: "Meanchey"),
234
- InvalidCommuneLink.new(district_code: "1710", name: "Nokor Thom"),
235
- InvalidCommuneLink.new(district_code: "1711", name: "Popel"),
236
- InvalidCommuneLink.new(district_code: "1713", name: "Svay Leu"),
237
- InvalidCommuneLink.new(district_code: "1801", name: "Koh Rong"),
238
- InvalidCommuneLink.new(district_code: "1802", name: "Ou Chrov"),
239
- InvalidCommuneLink.new(district_code: "1802", name: "Prey Nob"),
240
- InvalidCommuneLink.new(district_code: "1802", name: "Ream"),
241
- InvalidCommuneLink.new(district_code: "1804", name: "Kampong Seila"),
242
- InvalidCommuneLink.new(district_code: "1901", name: "Sdau"),
243
- InvalidCommuneLink.new(district_code: "1902", name: "Siem Bouk"),
244
- InvalidCommuneLink.new(district_code: "1903", name: "Sekong"),
245
- InvalidCommuneLink.new(district_code: "2001", name: "Chantrea"),
246
- InvalidCommuneLink.new(district_code: "2002", name: "Preah Ponlea"),
247
- InvalidCommuneLink.new(district_code: "2003", name: "Svay Chek"),
248
- InvalidCommuneLink.new(district_code: "2004", name: "Ampel"),
249
- InvalidCommuneLink.new(district_code: "2004", name: "Daung"),
250
- InvalidCommuneLink.new(district_code: "2004", name: "Kampong Trach"),
251
- InvalidCommuneLink.new(district_code: "2004", name: "Kokir"),
252
- InvalidCommuneLink.new(district_code: "2004", name: "Krasang"),
253
- InvalidCommuneLink.new(district_code: "2005", name: "Bassak"),
254
- InvalidCommuneLink.new(district_code: "2005", name: "Chheu Teal"),
255
- InvalidCommuneLink.new(district_code: "2005", name: "Svay Chrum"),
256
- InvalidCommuneLink.new(district_code: "2008", name: "Bavet"),
257
- InvalidCommuneLink.new(district_code: "2008", name: "Prasat"),
258
- InvalidCommuneLink.new(district_code: "2201", name: "Anlong Veaeng"),
259
- InvalidCommuneLink.new(district_code: "2401", name: "Pailin"),
260
- InvalidCommuneLink.new(district_code: "2502", name: "Chhouk"),
261
- InvalidCommuneLink.new(district_code: "2502", name: "Trea"),
262
- InvalidCommuneLink.new(district_code: "2503", name: "Memot"),
263
- InvalidCommuneLink.new(district_code: "2503", name: "Kokir"),
264
- InvalidCommuneLink.new(district_code: "2504", name: "Chork"),
265
- InvalidCommuneLink.new(district_code: "2504", name: "Mean"),
266
- InvalidCommuneLink.new(district_code: "2505", name: "Popel"),
267
- InvalidCommuneLink.new(district_code: "2507", name: "Chikor")
268
- ].freeze
269
-
270
- MISSPELLINGS = [
271
- Misspelling.new(incorrect_text: "Kratié Province", correct_text: "Kratie Province"),
272
- Misspelling.new(
273
- incorrect_text: "Mondulkiri Province",
274
- correct_text: "Mondul Kiri Province"
275
- ),
276
- Misspelling.new(
277
- incorrect_text: "Ratanakiri Province",
278
- correct_text: "Ratanak Kiri Province"
279
- ),
280
- Misspelling.new(
281
- incorrect_text: "Siem Reap Province",
282
- correct_text: "Siemreap Province"
283
- ),
284
- Misspelling.new(
285
- incorrect_text: "Serei Saophoan District",
286
- correct_text: "Serei Saophoan Municipality"
287
- ),
288
- Misspelling.new(
289
- incorrect_text: "Poipet Municipality",
290
- correct_text: "Paoy Paet Municipality"
291
- ),
292
- Misspelling.new(
293
- incorrect_text: "Battambang District",
294
- correct_text: "Battambang Municipality"
295
- ),
296
- Misspelling.new(
297
- incorrect_text: "Rotanak Mondol District",
298
- correct_text: "Rotonak Mondol District"
299
- ),
300
- Misspelling.new(
301
- incorrect_text: "Sampov Loun District",
302
- correct_text: "Sampov Lun District"
303
- ),
304
- Misspelling.new(
305
- incorrect_text: "Koh Kralor District",
306
- correct_text: "Koas Krala District"
307
- ),
308
- Misspelling.new(
309
- incorrect_text: "Rukhak Kiri District",
310
- correct_text: "Rukh Kiri District"
311
- ),
312
- Misspelling.new(
313
- incorrect_text: "Koh Sotin District",
314
- correct_text: "Kaoh Soutin District"
315
- ),
316
- Misspelling.new(
317
- incorrect_text: "Srey Santhor District",
318
- correct_text: "Srei Santhor District"
319
- ),
320
- Misspelling.new(
321
- incorrect_text: "Kong Pisey",
322
- correct_text: "Kong Pisei District"
323
- ),
324
- Misspelling.new(
325
- incorrect_text: "Phnom Sruoch District",
326
- correct_text: "Phnum Sruoch District"
327
- ),
328
- Misspelling.new(
329
- incorrect_text: "Stueng Saen District",
330
- correct_text: "Stueng Saen Municipality"
331
- ),
332
- Misspelling.new(
333
- incorrect_text: "Prasat Balangk District",
334
- correct_text: "Prasat Ballangk District"
335
- ),
336
- Misspelling.new(
337
- incorrect_text: "Kampot District",
338
- correct_text: "Kampot Municipality"
339
- ),
340
- Misspelling.new(
341
- incorrect_text: "Kampot District",
342
- correct_text: "Kampot Municipality"
343
- ),
344
- Misspelling.new(
345
- incorrect_text: "Koh Thum District",
346
- correct_text: "Kaoh Thum District"
347
- ),
348
- Misspelling.new(
349
- incorrect_text: "Mukh Kamphool District",
350
- correct_text: "Mukh Kampul District"
351
- ),
352
- Misspelling.new(
353
- incorrect_text: "Ponhea Leu District",
354
- correct_text: "Ponhea Lueu District"
355
- ),
356
- Misspelling.new(
357
- incorrect_text: "Kiri Sakor",
358
- correct_text: "Kiri Sakor District"
359
- ),
360
- Misspelling.new(
361
- incorrect_text: "Koh Kong",
362
- correct_text: "Kaoh Kong District"
363
- ),
364
- Misspelling.new(
365
- incorrect_text: "Khemara Phoumin",
366
- correct_text: "Khemara Phoumin Municipality"
367
- ),
368
- Misspelling.new(
369
- incorrect_text: "Mondol Seima",
370
- correct_text: "Mondol Seima District"
371
- ),
372
- Misspelling.new(
373
- incorrect_text: "Srae Ambel",
374
- correct_text: "Srae Ambel District"
375
- ),
376
- Misspelling.new(
377
- incorrect_text: "Thma Bang",
378
- correct_text: "Thma Bang District"
379
- ),
380
- Misspelling.new(
381
- incorrect_text: "Kratie Municipality",
382
- correct_text: "Kracheh Municipality"
383
- ),
384
- Misspelling.new(
385
- incorrect_text: "Preaek Prasab District",
386
- correct_text: "Prek Prasab District"
387
- ),
388
- Misspelling.new(
389
- incorrect_text: "Krong Saen Monorom",
390
- correct_text: "Saen Monourom Municipality"
391
- ),
392
- Misspelling.new(
393
- incorrect_text: "Khan Daun Penh",
394
- correct_text: "Doun Penh Section"
395
- ),
396
- Misspelling.new(
397
- incorrect_text: "Khan Prampir Makara",
398
- correct_text: "Prampir Meakkakra Section"
399
- ),
400
- Misspelling.new(
401
- incorrect_text: "Khan Meanchey",
402
- correct_text: "Mean Chey Section"
403
- ),
404
- Misspelling.new(
405
- incorrect_text: "Khan Sen Sok",
406
- correct_text: "Saensokh Section"
407
- ),
408
- Misspelling.new(
409
- incorrect_text: "Khan Por Sen Chey",
410
- correct_text: "Pur SenChey Section"
411
- ),
412
- Misspelling.new(
413
- incorrect_text: "Khan Chrouy Changvar",
414
- correct_text: "Chraoy Chongvar Section"
415
- ),
416
- Misspelling.new(
417
- incorrect_text: "Khan Prek Phnov",
418
- correct_text: "Praek Pnov Section"
419
- ),
420
- Misspelling.new(
421
- incorrect_text: "Choam Khsant",
422
- correct_text: "Choam Ksant District"
423
- ),
424
- Misspelling.new(
425
- incorrect_text: "Kulen",
426
- correct_text: "Kuleaen District"
427
- ),
428
- Misspelling.new(
429
- incorrect_text: "Sangkom Thmei",
430
- correct_text: "Sangkum Thmei District"
431
- ),
432
- Misspelling.new(
433
- incorrect_text: "Prey Veaeng",
434
- correct_text: "Prey Veng Municipality"
435
- ),
436
- Misspelling.new(
437
- incorrect_text: "Por Reang",
438
- correct_text: "Pur Rieng District"
439
- ),
440
- Misspelling.new(
441
- incorrect_text: "Veal Veng",
442
- correct_text: "Veal Veaeng District"
443
- ),
444
- Misspelling.new(
445
- incorrect_text: "Krong Banlung",
446
- correct_text: "Ban Lung Municipality"
447
- ),
448
- Misspelling.new(
449
- incorrect_text: "Angkor Thom",
450
- correct_text: "Angkor Thum District"
451
- ),
452
- Misspelling.new(
453
- incorrect_text: "Sout Nikom",
454
- correct_text: "Soutr Nikom District"
455
- ),
456
- Misspelling.new(
457
- incorrect_text: "Steung Hav",
458
- correct_text: "Stueng Hav District"
459
- ),
460
- Misspelling.new(
461
- incorrect_text: "Krong Stung Treng",
462
- correct_text: "Stueng Traeng Municipality"
463
- ),
464
- Misspelling.new(
465
- incorrect_text: "Bourei Cholsar District",
466
- correct_text: "Borei Cholsar District"
467
- ),
468
- Misspelling.new(
469
- incorrect_text: "Damnak Chang'Eur",
470
- correct_text: "Damnak Chang'aeur District"
471
- ),
472
- Misspelling.new(
473
- incorrect_text: "Krong Keb",
474
- correct_text: "Kaeb Municipality"
475
- ),
476
- Misspelling.new(
477
- incorrect_text: "Sala Krao",
478
- correct_text: "Sala Krau District"
479
- ),
480
- Misspelling.new(
481
- incorrect_text: "Dombae",
482
- correct_text: "Dambae District"
483
- ),
484
- Misspelling.new(
485
- incorrect_text: "Krouch Chhma",
486
- correct_text: "Krouch Chhmar District"
487
- ),
488
- Misspelling.new(incorrect_text: "Paoy Char", correct_text: "Poy Char"),
489
- Misspelling.new(incorrect_text: "Phnom Dei", correct_text: "Phnum Dei"),
490
- Misspelling.new(incorrect_text: "Spean Sraeng Rouk", correct_text: "Spean Sraeng"),
491
- Misspelling.new(incorrect_text: "Chhnuor", correct_text: "Chnuor Mean Chey"),
492
- Misspelling.new(incorrect_text: "Chob", correct_text: "Chob Vari"),
493
- Misspelling.new(incorrect_text: "Prasat Char", correct_text: "Prasat"),
494
- Misspelling.new(incorrect_text: "Preah Netr Preah", correct_text: "Preak Netr Preah"),
495
- Misspelling.new(incorrect_text: "Rohal Rohal", correct_text: "Rohal"),
496
- Misspelling.new(incorrect_text: "Tuek Chour Smach", correct_text: "Tuek Chour"),
497
- Misspelling.new(incorrect_text: "Ou Bei Choan", correct_text: "Ou Beichoan"),
498
- Misspelling.new(incorrect_text: "Ou Sampor", correct_text: "Ou Sampoar"),
499
- Misspelling.new(incorrect_text: "Poipet", correct_text: "Paoy Paet"),
500
- Misspelling.new(incorrect_text: "Tuol Ta Aek", correct_text: "Tuol Ta Ek"),
501
- Misspelling.new(incorrect_text: "Preaek Preah Sdach", correct_text: "Tuol Ta Ek"),
502
- Misspelling.new(incorrect_text: "Chamkar Samraong", correct_text: "Chomkar Somraong"),
503
- Misspelling.new(incorrect_text: "Sla Kaet", correct_text: "Sla Ket"),
504
- Misspelling.new(incorrect_text: "Ou Mal", correct_text: "OMal"),
505
- Misspelling.new(incorrect_text: "Voat Kor", correct_text: "wat Kor"),
506
- Misspelling.new(incorrect_text: "Svay Pao", correct_text: "Svay Por"),
507
- Misspelling.new(incorrect_text: "Kdol Tahen", correct_text: "Kdol Ta Haen"),
508
- Misspelling.new(incorrect_text: "Kaoh Chiveang Thvang", correct_text: "Kaoh Chiveang"),
509
- Misspelling.new(incorrect_text: "Voat Ta Muem", correct_text: "Vaot Ta Muem"),
510
- Misspelling.new(incorrect_text: "Ou Samrel", correct_text: "Ou Samril"),
511
- Misspelling.new(incorrect_text: "Ta Krai", correct_text: "Ta Krei"),
512
- Misspelling.new(incorrect_text: "Prek Chik", correct_text: "Preaek Chik"),
513
- Misspelling.new(incorrect_text: "Prey Chor", correct_text: "Prey Chhor"),
514
- Misspelling.new(incorrect_text: "Samraong Tong", correct_text: "Samrong Tong"),
515
- Misspelling.new(incorrect_text: "Phnum Touch", correct_text: "Phnom Touch"),
516
- Misspelling.new(incorrect_text: "Tonle Bassac", correct_text: "Tonle Basak"),
517
- Misspelling.new(incorrect_text: "Chey Chumneas", correct_text: "Chey Chummeah"),
518
- Misspelling.new(incorrect_text: "Boeung Prolit", correct_text: "Boeng Proluet"),
519
- Misspelling.new(incorrect_text: "Tuol Svay Prey 2",
520
- correct_text: "Tuol Svay Prey Ti Pir"),
521
- Misspelling.new(incorrect_text: "Neak Leung", correct_text: "Neak Loeang"),
522
- Misspelling.new(incorrect_text: "Kratie", correct_text: "Kracheh"),
523
- Misspelling.new(incorrect_text: "Pate", correct_text: "Pa Te"),
524
- Misspelling.new(incorrect_text: "Prek Phtoul Commune", correct_text: "Preaek Phtoul"),
525
- Misspelling.new(incorrect_text: "Bourei Cholsar Commune", correct_text: "Borei Cholsar"),
526
- Misspelling.new(
527
- incorrect_text: "Kampeaeng Commune (Kiri Vong District)",
528
- correct_text: "Kampeaeng"
529
- ),
530
- Misspelling.new(
531
- incorrect_text: "Prey Rumdeng Commune (Kiri Vong District)",
532
- correct_text: "Prey Rumdeng"
533
- ),
534
- Misspelling.new(incorrect_text: "Ta Our Commune", correct_text: "Ta Ou"),
535
- Misspelling.new(incorrect_text: "Kompaeng Commune", correct_text: "Kampeaeng"),
536
- Misspelling.new(incorrect_text: "Kompong Reab Commune", correct_text: "Kampong Reab"),
537
- Misspelling.new(incorrect_text: "Our Saray Commune", correct_text: "Ou Saray"),
538
- Misspelling.new(
539
- incorrect_text: "Trapeang Kranhung Commune",
540
- correct_text: "Trapeang Kranhoung"
541
- ),
542
- Misspelling.new(incorrect_text: "Angk Kev Commune", correct_text: "Angk Kaev"),
543
- Misspelling.new(incorrect_text: "Sanlong Commune", correct_text: "Sanlung"),
544
- Misspelling.new(incorrect_text: "O Smach", correct_text: "Ou Smach")
545
- ].freeze
546
130
 
547
131
  def scrape!
548
- result = []
549
-
550
- District.all.each do |district|
551
- province_section = find_section(
552
- text: district.province.address_en,
553
- section: scraper.page,
554
- xpath_pattern: "//h2//a[text()[contains(., \"%<text>s\")]]"
555
- ).xpath("ancestor::h2/following-sibling::div").first
556
-
557
- district_title = find_section(
558
- text: [district.full_name_en, district.full_name_latin, district.name_latin],
559
- section: province_section,
560
- xpath_pattern: "child::table//tr/td//h3//*[text()[contains(., \"%<text>s\")]]"
561
- )
562
-
563
- next unless district_title
564
-
565
- district_section = district_title.xpath("ancestor::h3/following-sibling::*").first
566
- commune_links = district_section.xpath("child::li//a[contains(@href, '/wiki/')]")
567
-
568
- commune_links.each do |link|
569
- invalid_commune_link = find_invalid_commune_link(district:, text: link.text)
570
-
571
- next if invalid_commune_link
572
-
573
- commune = begin
574
- find_commune(
575
- district:,
576
- names: {
577
- name_latin: link.text,
578
- full_name_en: link.text,
579
- full_name_latin: link.text
580
- }
581
- )
582
- rescue CommuneNotFoundError => e
583
- misspelling = MISSPELLINGS.find do |m|
584
- m.incorrect_text == link.text
585
- end
586
-
587
- raise(e) unless misspelling
588
-
589
- find_commune(district:, names: { name_latin: misspelling.correct_text })
590
- end
591
-
592
- result << ScraperResult.new(code: commune.id,
593
- wikipedia: URI.join(
594
- URL, link[:href]
595
- ).to_s)
596
- end
597
- end
598
-
599
- result
600
- end
601
-
602
- private
132
+ Commune.all.each_with_object([]) do |commune, result|
133
+ url = find_url(commune)
134
+ next if url.nil?
603
135
 
604
- def build_commune_links(district:, pool:); end
605
-
606
- def find_invalid_commune_link(district:, text:)
607
- INVALID_COMMUNE_LINKS.find do |c|
608
- c.district_code == district.id && c.name == text
136
+ result << ScraperResult.new(code: commune.id, wikipedia: url)
609
137
  end
610
138
  end
611
139
 
612
- def find_section(text:, section:, xpath_pattern:)
613
- texts = Array(text)
614
- default_text = texts.first
615
- texts.each do |t|
616
- return find_link(text: t, section:, xpath_pattern:)
617
- rescue WebScraper::ElementNotFoundError => e
618
- raise(e) if t == texts.last
619
- end
620
- rescue WebScraper::ElementNotFoundError => e
621
- misspelling = MISSPELLINGS.find do |m|
622
- m.correct_text == default_text
623
- end
624
-
625
- return if !misspelling && MISSING_LOCATIONS.include?(default_text)
626
- raise(e) unless misspelling
627
-
628
- find_link(text: misspelling.incorrect_text, section:, xpath_pattern:)
629
- end
630
-
631
- def find_link(text:, section:, xpath_pattern:)
632
- xpath = format(xpath_pattern, text:)
633
- result = section.xpath(xpath)
634
-
635
- return result.first if result.size == 1
636
-
637
- raise WebScraper::ElementNotFoundError,
638
- "No link or many links found on #{URL} (xpath: '#{xpath}') "
639
- end
140
+ private
640
141
 
641
- def find_commune(district:, names:)
642
- results = []
643
- names.each do |k, v|
644
- results = Commune.where(district_id: district.id, k => v)
142
+ def find_url(commune)
143
+ geocode = scraper.page.at_xpath("//td[text()[contains(., '#{commune.id}')]]")
645
144
 
646
- break unless results.empty?
647
- end
145
+ return if geocode.nil?
648
146
 
649
- raise CommuneNotFoundError if results.empty?
147
+ link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
650
148
 
651
- if results.size > 1
652
- raise DuplicateCommuneError,
653
- "Commune '#{identifier}' was found more than once for province: '#{province.name_en}'"
654
- end
149
+ return if link.nil?
655
150
 
656
- results.first
151
+ URI.join(URL, link[:href]).to_s
657
152
  end
658
153
 
659
154
  def scraper
@@ -3,5 +3,7 @@ module Pumi
3
3
  end
4
4
  end
5
5
 
6
+ require_relative "data_source/geocoder"
7
+ require_relative "data_source/iso31662"
6
8
  require_relative "data_source/ncdd"
7
9
  require_relative "data_source/wikipedia"
@@ -0,0 +1,3 @@
1
+ module Pumi
2
+ Geodata = Struct.new(:lat, :long, :bounding_box, keyword_init: true)
3
+ end
data/lib/pumi/location.rb CHANGED
@@ -8,6 +8,8 @@ module Pumi
8
8
  :address_km, :address_latin, :address_en,
9
9
  :administrative_unit,
10
10
  :links,
11
+ :geodata,
12
+ :iso3166_2,
11
13
  keyword_init: true
12
14
  ) do
13
15
  class << self
data/lib/pumi/parser.rb CHANGED
@@ -15,6 +15,7 @@ module Pumi
15
15
  data_key: :provinces,
16
16
  id_length: 2
17
17
  )
18
+
18
19
  DISTRICT = AdministrativeDivision.new(
19
20
  type: District,
20
21
  name: :district,
@@ -85,11 +86,17 @@ module Pumi
85
86
  attributes.fetch("administrative_unit")
86
87
  )
87
88
 
89
+ if attributes.key?("geodata")
90
+ geodata = Geodata.new(attributes.fetch("geodata").transform_keys(&:to_sym))
91
+ end
92
+
88
93
  {
89
94
  id:,
90
95
  administrative_unit:,
91
96
  name_km:,
92
97
  name_latin:,
98
+ geodata:,
99
+ iso3166_2: attributes["iso3166_2"],
93
100
  links: attributes.fetch("links", {}).transform_keys(&:to_sym),
94
101
  name_en: name_latin,
95
102
  full_name_km: [
data/lib/pumi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pumi
2
- VERSION = "0.19.0".freeze
2
+ VERSION = "0.20.0".freeze
3
3
  end