pumi 0.19.0 → 0.20.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -69,13 +69,13 @@ module Pumi
69
69
  end
70
70
 
71
71
  def find_url(province)
72
- td = province_table_rows.xpath("child::td[contains(., '#{province.name_km}')]").first
72
+ td = province_table_rows.at_xpath("child::td[contains(., '#{province.name_km}')]")
73
73
  if td.nil?
74
74
  raise WebScraper::ElementNotFoundError,
75
75
  "No cell containing '#{province.name_km}' was found in a table on #{URL}"
76
76
  end
77
77
 
78
- link = td.xpath("preceding-sibling::td/a").first
78
+ link = td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
79
79
  URI.join(URL, link[:href]).to_s
80
80
  end
81
81
 
@@ -83,8 +83,8 @@ module Pumi
83
83
  @province_table_rows ||= begin
84
84
  sample_province = Province.all.first
85
85
 
86
- sample_row = scraper.page.xpath("//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]").first
87
- if sample_row.xpath("//a[text()[contains(., '#{sample_province.name_en}')]]").empty?
86
+ sample_row = scraper.page.at_xpath("//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]")
87
+ if sample_row.at_xpath("//a[text()[contains(., '#{sample_province.name_en}')]]").nil?
88
88
  raise WebScraper::ElementNotFoundError,
89
89
  "No link containing '#{sample_province.name_en}' was found in a table on #{URL}"
90
90
  end
@@ -113,547 +113,42 @@ module Pumi
113
113
  end
114
114
 
115
115
  def find_url(district)
116
- identifier = district.id.chars.each_slice(2).map(&:join).join("-")
117
- list_items = scraper.page.xpath("//ol/li[text()[contains(., '#{identifier}')]]")
116
+ geocode = scraper.page.at_xpath("//td[text()[contains(., '#{district.id}')]]")
118
117
 
119
- return if list_items.empty?
118
+ return if geocode.nil?
120
119
 
121
- if list_items.size > 1
122
- raise WebScraper::ElementNotFoundError,
123
- "More than one element was found with the identifier '#{identifier}' on #{URL}"
124
- end
120
+ link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
125
121
 
126
- link = list_items.first.xpath("child::a[contains(@href, '/wiki/')]").first
127
- return unless link
122
+ return if link.nil?
128
123
 
129
124
  URI.join(URL, link[:href]).to_s
130
125
  end
131
126
  end
132
127
 
133
128
  class CambodianCommunesScraper
134
- class CommuneNotFoundError < StandardError; end
135
- class DuplicateCommuneError < StandardError; end
136
-
137
129
  URL = "https://en.wikipedia.org/wiki/List_of_communes_in_Cambodia".freeze
138
- Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
139
- InvalidCommuneLink = Struct.new(:district_code, :name, keyword_init: true)
140
-
141
- MISSING_LOCATIONS = [
142
- "Taing Kouk District",
143
- "Bokor Municipality",
144
- "Ta Lou Senchey District",
145
- "Kaoh Rung Municipality",
146
- "Borei Ou Svay Senchey District"
147
- ].freeze
148
-
149
- INVALID_COMMUNE_LINKS = [
150
- InvalidCommuneLink.new(district_code: "0301", name: "Prasat"),
151
- InvalidCommuneLink.new(district_code: "0302", name: "Svay Teab"),
152
- InvalidCommuneLink.new(district_code: "0306", name: "Kokor"),
153
- InvalidCommuneLink.new(district_code: "0306", name: "Krala"),
154
- InvalidCommuneLink.new(district_code: "0307", name: "Angkor Ban"),
155
- InvalidCommuneLink.new(district_code: "0307", name: "Sdau"),
156
- InvalidCommuneLink.new(district_code: "0308", name: "Koh Sotin"),
157
- InvalidCommuneLink.new(district_code: "0601", name: "Treal"),
158
- InvalidCommuneLink.new(district_code: "0601", name: "Baray"),
159
- InvalidCommuneLink.new(district_code: "0313", name: "Mean"),
160
- InvalidCommuneLink.new(district_code: "0313", name: "Lvea"),
161
- InvalidCommuneLink.new(district_code: "0313", name: "Prey Chor"),
162
- InvalidCommuneLink.new(district_code: "0314", name: "Baray"),
163
- InvalidCommuneLink.new(district_code: "0314", name: "Mean Chey"),
164
- InvalidCommuneLink.new(district_code: "0401", name: "Ponley"),
165
- InvalidCommuneLink.new(district_code: "0405", name: "Longveaek"),
166
- InvalidCommuneLink.new(district_code: "0405", name: "Saeb"),
167
- InvalidCommuneLink.new(district_code: "0406", name: "Svay Chrum"),
168
- InvalidCommuneLink.new(district_code: "0501", name: "Basedth"),
169
- InvalidCommuneLink.new(district_code: "0503", name: "Srang"),
170
- InvalidCommuneLink.new(district_code: "0503", name: "Veal"),
171
- InvalidCommuneLink.new(district_code: "0507", name: "Samraong Tong"),
172
- InvalidCommuneLink.new(district_code: "0510", name: "Mean Chey"),
173
- InvalidCommuneLink.new(district_code: "0510", name: "Phnum Touch"),
174
- InvalidCommuneLink.new(district_code: "0606", name: "Chheu Teal"),
175
- InvalidCommuneLink.new(district_code: "0606", name: "Klaeng"),
176
- InvalidCommuneLink.new(district_code: "0606", name: "Mean Chey"),
177
- InvalidCommuneLink.new(district_code: "0608", name: "Trea"),
178
- InvalidCommuneLink.new(district_code: "0604", name: "Daung"),
179
- InvalidCommuneLink.new(district_code: "0606", name: "Sandaan"),
180
- InvalidCommuneLink.new(district_code: "0607", name: "Kokoh"),
181
- InvalidCommuneLink.new(district_code: "0701", name: "Angkor Chey"),
182
- InvalidCommuneLink.new(district_code: "0703", name: "Chhouk"),
183
- InvalidCommuneLink.new(district_code: "0703", name: "Meanchey"),
184
- InvalidCommuneLink.new(district_code: "0708", name: "Kampong Bay"),
185
- InvalidCommuneLink.new(district_code: "0801", name: "Siem Reap"),
186
- InvalidCommuneLink.new(district_code: "0801", name: "Trea"),
187
- InvalidCommuneLink.new(district_code: "0802", name: "Chheu Teal"),
188
- InvalidCommuneLink.new(district_code: "0802", name: "Kokir"),
189
- InvalidCommuneLink.new(district_code: "0804", name: "Leuk Daek"),
190
- InvalidCommuneLink.new(district_code: "0808", name: "Mkak"),
191
- InvalidCommuneLink.new(district_code: "0809", name: "Ponhea Leu"),
192
- InvalidCommuneLink.new(district_code: "0811", name: "Ta Khmau"),
193
- InvalidCommuneLink.new(district_code: "0813", name: "Svay Chrum"),
194
- InvalidCommuneLink.new(district_code: "0904", name: "Smach Meanchey"),
195
- InvalidCommuneLink.new(district_code: "0906", name: "Srae Ambel"),
196
- InvalidCommuneLink.new(district_code: "1001", name: "Chhloung"),
197
- InvalidCommuneLink.new(district_code: "1003", name: "Preaek Prasab"),
198
- InvalidCommuneLink.new(district_code: "1003", name: "Tamao"),
199
- InvalidCommuneLink.new(district_code: "1004", name: "Sombo"),
200
- InvalidCommuneLink.new(district_code: "1006", name: "Sambok"),
201
- InvalidCommuneLink.new(district_code: "1303", name: "Choam Khsant"),
202
- InvalidCommuneLink.new(district_code: "1304", name: "Phnom Penh"),
203
- InvalidCommuneLink.new(district_code: "1305", name: "Ratanak"),
204
- InvalidCommuneLink.new(district_code: "1308", name: "Pahal"),
205
- InvalidCommuneLink.new(district_code: "1403", name: "Kampong Trabaek"),
206
- InvalidCommuneLink.new(district_code: "1403", name: "Prey Chhor"),
207
- InvalidCommuneLink.new(district_code: "1404", name: "Kanhchriech"),
208
- InvalidCommuneLink.new(district_code: "1405", name: "Svay Chrum"),
209
- InvalidCommuneLink.new(district_code: "1409", name: "Lvea"),
210
- InvalidCommuneLink.new(district_code: "1409", name: "Preah Sdach"),
211
- InvalidCommuneLink.new(district_code: "1410", name: "Baray"),
212
- InvalidCommuneLink.new(district_code: "1410", name: "Kampong Leav"),
213
- InvalidCommuneLink.new(district_code: "1411", name: "Takor"),
214
- InvalidCommuneLink.new(district_code: "1502", name: "Anlong Vil"),
215
- InvalidCommuneLink.new(district_code: "1502", name: "Kandieng"),
216
- InvalidCommuneLink.new(district_code: "1502", name: "Sya"),
217
- InvalidCommuneLink.new(district_code: "1502", name: "Veal"),
218
- InvalidCommuneLink.new(district_code: "1604", name: "Teun"),
219
- InvalidCommuneLink.new(district_code: "1606", name: "Poy"),
220
- InvalidCommuneLink.new(district_code: "1607", name: "Sesan"),
221
- InvalidCommuneLink.new(district_code: "1607", name: "Yatung"),
222
- InvalidCommuneLink.new(district_code: "1609", name: "Pong"),
223
- InvalidCommuneLink.new(district_code: "1609", name: "Veun Sai"),
224
- InvalidCommuneLink.new(district_code: "1701", name: "Koal"),
225
- InvalidCommuneLink.new(district_code: "1702", name: "Svay Chek"),
226
- InvalidCommuneLink.new(district_code: "1704", name: "Chi Kraeng"),
227
- InvalidCommuneLink.new(district_code: "1704", name: "Kampong Kdei"),
228
- InvalidCommuneLink.new(district_code: "1706", name: "Kralanh"),
229
- InvalidCommuneLink.new(district_code: "1706", name: "Sen Sok"),
230
- InvalidCommuneLink.new(district_code: "1707", name: "Lvea"),
231
- InvalidCommuneLink.new(district_code: "1707", name: "Reul"),
232
- InvalidCommuneLink.new(district_code: "1709", name: "Bakong"),
233
- InvalidCommuneLink.new(district_code: "1709", name: "Meanchey"),
234
- InvalidCommuneLink.new(district_code: "1710", name: "Nokor Thom"),
235
- InvalidCommuneLink.new(district_code: "1711", name: "Popel"),
236
- InvalidCommuneLink.new(district_code: "1713", name: "Svay Leu"),
237
- InvalidCommuneLink.new(district_code: "1801", name: "Koh Rong"),
238
- InvalidCommuneLink.new(district_code: "1802", name: "Ou Chrov"),
239
- InvalidCommuneLink.new(district_code: "1802", name: "Prey Nob"),
240
- InvalidCommuneLink.new(district_code: "1802", name: "Ream"),
241
- InvalidCommuneLink.new(district_code: "1804", name: "Kampong Seila"),
242
- InvalidCommuneLink.new(district_code: "1901", name: "Sdau"),
243
- InvalidCommuneLink.new(district_code: "1902", name: "Siem Bouk"),
244
- InvalidCommuneLink.new(district_code: "1903", name: "Sekong"),
245
- InvalidCommuneLink.new(district_code: "2001", name: "Chantrea"),
246
- InvalidCommuneLink.new(district_code: "2002", name: "Preah Ponlea"),
247
- InvalidCommuneLink.new(district_code: "2003", name: "Svay Chek"),
248
- InvalidCommuneLink.new(district_code: "2004", name: "Ampel"),
249
- InvalidCommuneLink.new(district_code: "2004", name: "Daung"),
250
- InvalidCommuneLink.new(district_code: "2004", name: "Kampong Trach"),
251
- InvalidCommuneLink.new(district_code: "2004", name: "Kokir"),
252
- InvalidCommuneLink.new(district_code: "2004", name: "Krasang"),
253
- InvalidCommuneLink.new(district_code: "2005", name: "Bassak"),
254
- InvalidCommuneLink.new(district_code: "2005", name: "Chheu Teal"),
255
- InvalidCommuneLink.new(district_code: "2005", name: "Svay Chrum"),
256
- InvalidCommuneLink.new(district_code: "2008", name: "Bavet"),
257
- InvalidCommuneLink.new(district_code: "2008", name: "Prasat"),
258
- InvalidCommuneLink.new(district_code: "2201", name: "Anlong Veaeng"),
259
- InvalidCommuneLink.new(district_code: "2401", name: "Pailin"),
260
- InvalidCommuneLink.new(district_code: "2502", name: "Chhouk"),
261
- InvalidCommuneLink.new(district_code: "2502", name: "Trea"),
262
- InvalidCommuneLink.new(district_code: "2503", name: "Memot"),
263
- InvalidCommuneLink.new(district_code: "2503", name: "Kokir"),
264
- InvalidCommuneLink.new(district_code: "2504", name: "Chork"),
265
- InvalidCommuneLink.new(district_code: "2504", name: "Mean"),
266
- InvalidCommuneLink.new(district_code: "2505", name: "Popel"),
267
- InvalidCommuneLink.new(district_code: "2507", name: "Chikor")
268
- ].freeze
269
-
270
- MISSPELLINGS = [
271
- Misspelling.new(incorrect_text: "Kratié Province", correct_text: "Kratie Province"),
272
- Misspelling.new(
273
- incorrect_text: "Mondulkiri Province",
274
- correct_text: "Mondul Kiri Province"
275
- ),
276
- Misspelling.new(
277
- incorrect_text: "Ratanakiri Province",
278
- correct_text: "Ratanak Kiri Province"
279
- ),
280
- Misspelling.new(
281
- incorrect_text: "Siem Reap Province",
282
- correct_text: "Siemreap Province"
283
- ),
284
- Misspelling.new(
285
- incorrect_text: "Serei Saophoan District",
286
- correct_text: "Serei Saophoan Municipality"
287
- ),
288
- Misspelling.new(
289
- incorrect_text: "Poipet Municipality",
290
- correct_text: "Paoy Paet Municipality"
291
- ),
292
- Misspelling.new(
293
- incorrect_text: "Battambang District",
294
- correct_text: "Battambang Municipality"
295
- ),
296
- Misspelling.new(
297
- incorrect_text: "Rotanak Mondol District",
298
- correct_text: "Rotonak Mondol District"
299
- ),
300
- Misspelling.new(
301
- incorrect_text: "Sampov Loun District",
302
- correct_text: "Sampov Lun District"
303
- ),
304
- Misspelling.new(
305
- incorrect_text: "Koh Kralor District",
306
- correct_text: "Koas Krala District"
307
- ),
308
- Misspelling.new(
309
- incorrect_text: "Rukhak Kiri District",
310
- correct_text: "Rukh Kiri District"
311
- ),
312
- Misspelling.new(
313
- incorrect_text: "Koh Sotin District",
314
- correct_text: "Kaoh Soutin District"
315
- ),
316
- Misspelling.new(
317
- incorrect_text: "Srey Santhor District",
318
- correct_text: "Srei Santhor District"
319
- ),
320
- Misspelling.new(
321
- incorrect_text: "Kong Pisey",
322
- correct_text: "Kong Pisei District"
323
- ),
324
- Misspelling.new(
325
- incorrect_text: "Phnom Sruoch District",
326
- correct_text: "Phnum Sruoch District"
327
- ),
328
- Misspelling.new(
329
- incorrect_text: "Stueng Saen District",
330
- correct_text: "Stueng Saen Municipality"
331
- ),
332
- Misspelling.new(
333
- incorrect_text: "Prasat Balangk District",
334
- correct_text: "Prasat Ballangk District"
335
- ),
336
- Misspelling.new(
337
- incorrect_text: "Kampot District",
338
- correct_text: "Kampot Municipality"
339
- ),
340
- Misspelling.new(
341
- incorrect_text: "Kampot District",
342
- correct_text: "Kampot Municipality"
343
- ),
344
- Misspelling.new(
345
- incorrect_text: "Koh Thum District",
346
- correct_text: "Kaoh Thum District"
347
- ),
348
- Misspelling.new(
349
- incorrect_text: "Mukh Kamphool District",
350
- correct_text: "Mukh Kampul District"
351
- ),
352
- Misspelling.new(
353
- incorrect_text: "Ponhea Leu District",
354
- correct_text: "Ponhea Lueu District"
355
- ),
356
- Misspelling.new(
357
- incorrect_text: "Kiri Sakor",
358
- correct_text: "Kiri Sakor District"
359
- ),
360
- Misspelling.new(
361
- incorrect_text: "Koh Kong",
362
- correct_text: "Kaoh Kong District"
363
- ),
364
- Misspelling.new(
365
- incorrect_text: "Khemara Phoumin",
366
- correct_text: "Khemara Phoumin Municipality"
367
- ),
368
- Misspelling.new(
369
- incorrect_text: "Mondol Seima",
370
- correct_text: "Mondol Seima District"
371
- ),
372
- Misspelling.new(
373
- incorrect_text: "Srae Ambel",
374
- correct_text: "Srae Ambel District"
375
- ),
376
- Misspelling.new(
377
- incorrect_text: "Thma Bang",
378
- correct_text: "Thma Bang District"
379
- ),
380
- Misspelling.new(
381
- incorrect_text: "Kratie Municipality",
382
- correct_text: "Kracheh Municipality"
383
- ),
384
- Misspelling.new(
385
- incorrect_text: "Preaek Prasab District",
386
- correct_text: "Prek Prasab District"
387
- ),
388
- Misspelling.new(
389
- incorrect_text: "Krong Saen Monorom",
390
- correct_text: "Saen Monourom Municipality"
391
- ),
392
- Misspelling.new(
393
- incorrect_text: "Khan Daun Penh",
394
- correct_text: "Doun Penh Section"
395
- ),
396
- Misspelling.new(
397
- incorrect_text: "Khan Prampir Makara",
398
- correct_text: "Prampir Meakkakra Section"
399
- ),
400
- Misspelling.new(
401
- incorrect_text: "Khan Meanchey",
402
- correct_text: "Mean Chey Section"
403
- ),
404
- Misspelling.new(
405
- incorrect_text: "Khan Sen Sok",
406
- correct_text: "Saensokh Section"
407
- ),
408
- Misspelling.new(
409
- incorrect_text: "Khan Por Sen Chey",
410
- correct_text: "Pur SenChey Section"
411
- ),
412
- Misspelling.new(
413
- incorrect_text: "Khan Chrouy Changvar",
414
- correct_text: "Chraoy Chongvar Section"
415
- ),
416
- Misspelling.new(
417
- incorrect_text: "Khan Prek Phnov",
418
- correct_text: "Praek Pnov Section"
419
- ),
420
- Misspelling.new(
421
- incorrect_text: "Choam Khsant",
422
- correct_text: "Choam Ksant District"
423
- ),
424
- Misspelling.new(
425
- incorrect_text: "Kulen",
426
- correct_text: "Kuleaen District"
427
- ),
428
- Misspelling.new(
429
- incorrect_text: "Sangkom Thmei",
430
- correct_text: "Sangkum Thmei District"
431
- ),
432
- Misspelling.new(
433
- incorrect_text: "Prey Veaeng",
434
- correct_text: "Prey Veng Municipality"
435
- ),
436
- Misspelling.new(
437
- incorrect_text: "Por Reang",
438
- correct_text: "Pur Rieng District"
439
- ),
440
- Misspelling.new(
441
- incorrect_text: "Veal Veng",
442
- correct_text: "Veal Veaeng District"
443
- ),
444
- Misspelling.new(
445
- incorrect_text: "Krong Banlung",
446
- correct_text: "Ban Lung Municipality"
447
- ),
448
- Misspelling.new(
449
- incorrect_text: "Angkor Thom",
450
- correct_text: "Angkor Thum District"
451
- ),
452
- Misspelling.new(
453
- incorrect_text: "Sout Nikom",
454
- correct_text: "Soutr Nikom District"
455
- ),
456
- Misspelling.new(
457
- incorrect_text: "Steung Hav",
458
- correct_text: "Stueng Hav District"
459
- ),
460
- Misspelling.new(
461
- incorrect_text: "Krong Stung Treng",
462
- correct_text: "Stueng Traeng Municipality"
463
- ),
464
- Misspelling.new(
465
- incorrect_text: "Bourei Cholsar District",
466
- correct_text: "Borei Cholsar District"
467
- ),
468
- Misspelling.new(
469
- incorrect_text: "Damnak Chang'Eur",
470
- correct_text: "Damnak Chang'aeur District"
471
- ),
472
- Misspelling.new(
473
- incorrect_text: "Krong Keb",
474
- correct_text: "Kaeb Municipality"
475
- ),
476
- Misspelling.new(
477
- incorrect_text: "Sala Krao",
478
- correct_text: "Sala Krau District"
479
- ),
480
- Misspelling.new(
481
- incorrect_text: "Dombae",
482
- correct_text: "Dambae District"
483
- ),
484
- Misspelling.new(
485
- incorrect_text: "Krouch Chhma",
486
- correct_text: "Krouch Chhmar District"
487
- ),
488
- Misspelling.new(incorrect_text: "Paoy Char", correct_text: "Poy Char"),
489
- Misspelling.new(incorrect_text: "Phnom Dei", correct_text: "Phnum Dei"),
490
- Misspelling.new(incorrect_text: "Spean Sraeng Rouk", correct_text: "Spean Sraeng"),
491
- Misspelling.new(incorrect_text: "Chhnuor", correct_text: "Chnuor Mean Chey"),
492
- Misspelling.new(incorrect_text: "Chob", correct_text: "Chob Vari"),
493
- Misspelling.new(incorrect_text: "Prasat Char", correct_text: "Prasat"),
494
- Misspelling.new(incorrect_text: "Preah Netr Preah", correct_text: "Preak Netr Preah"),
495
- Misspelling.new(incorrect_text: "Rohal Rohal", correct_text: "Rohal"),
496
- Misspelling.new(incorrect_text: "Tuek Chour Smach", correct_text: "Tuek Chour"),
497
- Misspelling.new(incorrect_text: "Ou Bei Choan", correct_text: "Ou Beichoan"),
498
- Misspelling.new(incorrect_text: "Ou Sampor", correct_text: "Ou Sampoar"),
499
- Misspelling.new(incorrect_text: "Poipet", correct_text: "Paoy Paet"),
500
- Misspelling.new(incorrect_text: "Tuol Ta Aek", correct_text: "Tuol Ta Ek"),
501
- Misspelling.new(incorrect_text: "Preaek Preah Sdach", correct_text: "Tuol Ta Ek"),
502
- Misspelling.new(incorrect_text: "Chamkar Samraong", correct_text: "Chomkar Somraong"),
503
- Misspelling.new(incorrect_text: "Sla Kaet", correct_text: "Sla Ket"),
504
- Misspelling.new(incorrect_text: "Ou Mal", correct_text: "OMal"),
505
- Misspelling.new(incorrect_text: "Voat Kor", correct_text: "wat Kor"),
506
- Misspelling.new(incorrect_text: "Svay Pao", correct_text: "Svay Por"),
507
- Misspelling.new(incorrect_text: "Kdol Tahen", correct_text: "Kdol Ta Haen"),
508
- Misspelling.new(incorrect_text: "Kaoh Chiveang Thvang", correct_text: "Kaoh Chiveang"),
509
- Misspelling.new(incorrect_text: "Voat Ta Muem", correct_text: "Vaot Ta Muem"),
510
- Misspelling.new(incorrect_text: "Ou Samrel", correct_text: "Ou Samril"),
511
- Misspelling.new(incorrect_text: "Ta Krai", correct_text: "Ta Krei"),
512
- Misspelling.new(incorrect_text: "Prek Chik", correct_text: "Preaek Chik"),
513
- Misspelling.new(incorrect_text: "Prey Chor", correct_text: "Prey Chhor"),
514
- Misspelling.new(incorrect_text: "Samraong Tong", correct_text: "Samrong Tong"),
515
- Misspelling.new(incorrect_text: "Phnum Touch", correct_text: "Phnom Touch"),
516
- Misspelling.new(incorrect_text: "Tonle Bassac", correct_text: "Tonle Basak"),
517
- Misspelling.new(incorrect_text: "Chey Chumneas", correct_text: "Chey Chummeah"),
518
- Misspelling.new(incorrect_text: "Boeung Prolit", correct_text: "Boeng Proluet"),
519
- Misspelling.new(incorrect_text: "Tuol Svay Prey 2",
520
- correct_text: "Tuol Svay Prey Ti Pir"),
521
- Misspelling.new(incorrect_text: "Neak Leung", correct_text: "Neak Loeang"),
522
- Misspelling.new(incorrect_text: "Kratie", correct_text: "Kracheh"),
523
- Misspelling.new(incorrect_text: "Pate", correct_text: "Pa Te"),
524
- Misspelling.new(incorrect_text: "Prek Phtoul Commune", correct_text: "Preaek Phtoul"),
525
- Misspelling.new(incorrect_text: "Bourei Cholsar Commune", correct_text: "Borei Cholsar"),
526
- Misspelling.new(
527
- incorrect_text: "Kampeaeng Commune (Kiri Vong District)",
528
- correct_text: "Kampeaeng"
529
- ),
530
- Misspelling.new(
531
- incorrect_text: "Prey Rumdeng Commune (Kiri Vong District)",
532
- correct_text: "Prey Rumdeng"
533
- ),
534
- Misspelling.new(incorrect_text: "Ta Our Commune", correct_text: "Ta Ou"),
535
- Misspelling.new(incorrect_text: "Kompaeng Commune", correct_text: "Kampeaeng"),
536
- Misspelling.new(incorrect_text: "Kompong Reab Commune", correct_text: "Kampong Reab"),
537
- Misspelling.new(incorrect_text: "Our Saray Commune", correct_text: "Ou Saray"),
538
- Misspelling.new(
539
- incorrect_text: "Trapeang Kranhung Commune",
540
- correct_text: "Trapeang Kranhoung"
541
- ),
542
- Misspelling.new(incorrect_text: "Angk Kev Commune", correct_text: "Angk Kaev"),
543
- Misspelling.new(incorrect_text: "Sanlong Commune", correct_text: "Sanlung"),
544
- Misspelling.new(incorrect_text: "O Smach", correct_text: "Ou Smach")
545
- ].freeze
546
130
 
547
131
  def scrape!
548
- result = []
549
-
550
- District.all.each do |district|
551
- province_section = find_section(
552
- text: district.province.address_en,
553
- section: scraper.page,
554
- xpath_pattern: "//h2//a[text()[contains(., \"%<text>s\")]]"
555
- ).xpath("ancestor::h2/following-sibling::div").first
556
-
557
- district_title = find_section(
558
- text: [district.full_name_en, district.full_name_latin, district.name_latin],
559
- section: province_section,
560
- xpath_pattern: "child::table//tr/td//h3//*[text()[contains(., \"%<text>s\")]]"
561
- )
562
-
563
- next unless district_title
564
-
565
- district_section = district_title.xpath("ancestor::h3/following-sibling::*").first
566
- commune_links = district_section.xpath("child::li//a[contains(@href, '/wiki/')]")
567
-
568
- commune_links.each do |link|
569
- invalid_commune_link = find_invalid_commune_link(district:, text: link.text)
570
-
571
- next if invalid_commune_link
572
-
573
- commune = begin
574
- find_commune(
575
- district:,
576
- names: {
577
- name_latin: link.text,
578
- full_name_en: link.text,
579
- full_name_latin: link.text
580
- }
581
- )
582
- rescue CommuneNotFoundError => e
583
- misspelling = MISSPELLINGS.find do |m|
584
- m.incorrect_text == link.text
585
- end
586
-
587
- raise(e) unless misspelling
588
-
589
- find_commune(district:, names: { name_latin: misspelling.correct_text })
590
- end
591
-
592
- result << ScraperResult.new(code: commune.id,
593
- wikipedia: URI.join(
594
- URL, link[:href]
595
- ).to_s)
596
- end
597
- end
598
-
599
- result
600
- end
601
-
602
- private
132
+ Commune.all.each_with_object([]) do |commune, result|
133
+ url = find_url(commune)
134
+ next if url.nil?
603
135
 
604
- def build_commune_links(district:, pool:); end
605
-
606
- def find_invalid_commune_link(district:, text:)
607
- INVALID_COMMUNE_LINKS.find do |c|
608
- c.district_code == district.id && c.name == text
136
+ result << ScraperResult.new(code: commune.id, wikipedia: url)
609
137
  end
610
138
  end
611
139
 
612
- def find_section(text:, section:, xpath_pattern:)
613
- texts = Array(text)
614
- default_text = texts.first
615
- texts.each do |t|
616
- return find_link(text: t, section:, xpath_pattern:)
617
- rescue WebScraper::ElementNotFoundError => e
618
- raise(e) if t == texts.last
619
- end
620
- rescue WebScraper::ElementNotFoundError => e
621
- misspelling = MISSPELLINGS.find do |m|
622
- m.correct_text == default_text
623
- end
624
-
625
- return if !misspelling && MISSING_LOCATIONS.include?(default_text)
626
- raise(e) unless misspelling
627
-
628
- find_link(text: misspelling.incorrect_text, section:, xpath_pattern:)
629
- end
630
-
631
- def find_link(text:, section:, xpath_pattern:)
632
- xpath = format(xpath_pattern, text:)
633
- result = section.xpath(xpath)
634
-
635
- return result.first if result.size == 1
636
-
637
- raise WebScraper::ElementNotFoundError,
638
- "No link or many links found on #{URL} (xpath: '#{xpath}') "
639
- end
140
+ private
640
141
 
641
- def find_commune(district:, names:)
642
- results = []
643
- names.each do |k, v|
644
- results = Commune.where(district_id: district.id, k => v)
142
+ def find_url(commune)
143
+ geocode = scraper.page.at_xpath("//td[text()[contains(., '#{commune.id}')]]")
645
144
 
646
- break unless results.empty?
647
- end
145
+ return if geocode.nil?
648
146
 
649
- raise CommuneNotFoundError if results.empty?
147
+ link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
650
148
 
651
- if results.size > 1
652
- raise DuplicateCommuneError,
653
- "Commune '#{identifier}' was found more than once for province: '#{province.name_en}'"
654
- end
149
+ return if link.nil?
655
150
 
656
- results.first
151
+ URI.join(URL, link[:href]).to_s
657
152
  end
658
153
 
659
154
  def scraper
@@ -3,5 +3,7 @@ module Pumi
3
3
  end
4
4
  end
5
5
 
6
+ require_relative "data_source/geocoder"
7
+ require_relative "data_source/iso31662"
6
8
  require_relative "data_source/ncdd"
7
9
  require_relative "data_source/wikipedia"
@@ -0,0 +1,3 @@
1
+ module Pumi
2
+ Geodata = Struct.new(:lat, :long, :bounding_box, keyword_init: true)
3
+ end
data/lib/pumi/location.rb CHANGED
@@ -8,6 +8,8 @@ module Pumi
8
8
  :address_km, :address_latin, :address_en,
9
9
  :administrative_unit,
10
10
  :links,
11
+ :geodata,
12
+ :iso3166_2,
11
13
  keyword_init: true
12
14
  ) do
13
15
  class << self
data/lib/pumi/parser.rb CHANGED
@@ -15,6 +15,7 @@ module Pumi
15
15
  data_key: :provinces,
16
16
  id_length: 2
17
17
  )
18
+
18
19
  DISTRICT = AdministrativeDivision.new(
19
20
  type: District,
20
21
  name: :district,
@@ -85,11 +86,17 @@ module Pumi
85
86
  attributes.fetch("administrative_unit")
86
87
  )
87
88
 
89
+ if attributes.key?("geodata")
90
+ geodata = Geodata.new(attributes.fetch("geodata").transform_keys(&:to_sym))
91
+ end
92
+
88
93
  {
89
94
  id:,
90
95
  administrative_unit:,
91
96
  name_km:,
92
97
  name_latin:,
98
+ geodata:,
99
+ iso3166_2: attributes["iso3166_2"],
93
100
  links: attributes.fetch("links", {}).transform_keys(&:to_sym),
94
101
  name_en: name_latin,
95
102
  full_name_km: [
data/lib/pumi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pumi
2
- VERSION = "0.19.0".freeze
2
+ VERSION = "0.20.0".freeze
3
3
  end