pumi 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,665 @@
1
+ require "nokogiri"
2
+ require "open-uri"
3
+
4
+ module Pumi
5
+ module DataSource
6
+ class Wikipedia
7
+ attr_reader :data_file, :scraper
8
+
9
# Stores the collaborators used to merge scraped Wikipedia links into location data.
#
# @param data_file [#read, #write] object the location data is read from and written to
# @param scraper [#scrape!] object whose results respond to +code+ and +wikipedia+
def initialize(data_file:, scraper:)
  @data_file, @scraper = data_file, scraper
end
13
+
14
# Adds a "wikipedia" entry under "links" for every location the scraper returned
# a result for, then writes the updated data through the data file.
#
# @param output_dir [String] directory passed on to the data file when writing
# @return [Object] whatever +write_data!+ returns
def load_data!(output_dir: "data")
  # Index the scraped results once so each location is a hash lookup instead of
  # a scan over every result. `||=` keeps the first result per code, which is
  # what Enumerable#find returned.
  results_by_code = scraped_data.each_with_object({}) do |location, index|
    index[location.code] ||= location
  end

  data.each do |code, attributes|
    location_data = results_by_code[code]
    next unless location_data

    attributes["links"] ||= {}
    attributes["links"]["wikipedia"] = location_data.wikipedia
  end

  write_data!(output_dir)
end
25
+
26
+ private
27
+
28
# Results of +scraper.scrape!+, memoized once a truthy value has been returned.
def scraped_data
  return @scraped_data if @scraped_data

  @scraped_data = scraper.scrape!
end
31
+
32
# Contents of +data_file.read+, memoized once a truthy value has been returned.
def data
  return @data if @data

  @data = data_file.read
end
35
+
36
# Hands the in-memory data to the data file for writing.
#
# @param data_directory [String] forwarded as the +data_directory:+ keyword
def write_data!(data_directory)
  data_file.write(data, data_directory: data_directory)
end
39
+
40
+ ScraperResult = Struct.new(:code, :wikipedia, keyword_init: true)
41
+
42
# Fetches the page at +url+ and exposes it as a parsed Nokogiri document.
class WebScraper
  # Raised by the scrapers in this file when an expected element is not found.
  class ElementNotFoundError < StandardError; end

  attr_reader :url

  # @param url [String] address of the page to fetch
  def initialize(url)
    @url = url
  end

  # Downloads and parses the page on first use and memoizes the result.
  #
  # The block form of open-uri's +open+ closes the response IO once the block
  # returns, instead of leaving it open until garbage collection.
  #
  # @return [Object] the value returned by +Nokogiri::HTML+
  def page
    @page ||= URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end
end
55
+
56
# Looks up the Wikipedia article URL of every province in the provinces table.
class CambodianProvincesScraper
  URL = "https://en.wikipedia.org/wiki/Provinces_of_Cambodia".freeze

  # @return [Array<ScraperResult>] one result per entry of +Province.all+
  # @raise [WebScraper::ElementNotFoundError] when the table, a province cell or
  #   its link cannot be located on the page
  def scrape!
    Province.all.map do |province|
      ScraperResult.new(code: province.id, wikipedia: find_url(province))
    end
  end

  private

  def scraper
    @scraper ||= WebScraper.new(URL)
  end

  # Finds the cell holding the province's Khmer name and resolves the link in a
  # preceding cell of the same row against URL.
  def find_url(province)
    td = province_table_rows.xpath("child::td[contains(., '#{province.name_km}')]").first
    if td.nil?
      raise WebScraper::ElementNotFoundError,
            "No cell containing '#{province.name_km}' was found in a table on #{URL}"
    end

    link = td.xpath("preceding-sibling::td/a").first
    if link.nil?
      raise WebScraper::ElementNotFoundError,
            "No link preceding the cell containing '#{province.name_km}' was found in a table on #{URL}"
    end

    URI.join(URL, link[:href]).to_s
  end

  # Rows of the table that contains the first province; located once and memoized.
  def province_table_rows
    @province_table_rows ||= begin
      sample_province = Province.all.first

      sample_row = scraper.page.xpath(
        "//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]"
      ).first
      if sample_row.nil?
        raise WebScraper::ElementNotFoundError,
              "No table row containing '#{sample_province.name_km}' was found on #{URL}"
      end

      # ".//a" keeps the search inside the sample row; a leading "//" is an
      # absolute path and would match a link anywhere in the document.
      if sample_row.xpath(".//a[text()[contains(., '#{sample_province.name_en}')]]").empty?
        raise WebScraper::ElementNotFoundError,
              "No link containing '#{sample_province.name_en}' was found in a table on #{URL}"
      end

      sample_row.parent.xpath("child::tr")
    end
  end
end
96
+
97
# Looks up Wikipedia article URLs for districts on the districts list page.
class CambodianDistrictsScraper
  URL = "https://en.wikipedia.org/wiki/List_of_districts,_municipalities_and_sections_in_Cambodia".freeze

  # @return [Array<ScraperResult>] a result for each district a URL was found for
  def scrape!
    District.all.filter_map do |district|
      url = find_url(district)
      ScraperResult.new(code: district.id, wikipedia: url) if url
    end
  end

  private

  def scraper
    @scraper ||= WebScraper.new(URL)
  end

  # Searches ordered-list items for the district id written in two-character
  # groups joined by dashes (for example "0102" becomes "01-02").
  #
  # @return [String, nil] absolute article URL, or nil when there is no matching
  #   item or the item has no "/wiki/" link
  def find_url(district)
    identifier = district.id.scan(/.{1,2}/m).join("-")
    matches = scraper.page.xpath("//ol/li[text()[contains(., '#{identifier}')]]")
    return if matches.empty?

    if matches.size > 1
      raise WebScraper::ElementNotFoundError,
            "More than one element was found with the identifier '#{identifier}' on #{URL}"
    end

    anchor = matches.first.xpath("child::a[contains(@href, '/wiki/')]").first
    anchor && URI.join(URL, anchor[:href]).to_s
  end
end
132
+
133
+ class CambodianCommunesScraper
134
# Raised by find_commune when no commune matches any of the given names.
CommuneNotFoundError = Class.new(StandardError)
# Raised by find_commune when a lookup matches more than one commune.
DuplicateCommuneError = Class.new(StandardError)

URL = "https://en.wikipedia.org/wiki/List_of_communes_in_Cambodia".freeze

# Pairs a text searched for on the page (incorrect_text) with the text used for
# lookups in the location data (correct_text).
Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
# Identifies a commune link (by district code and link text) that scrape! skips.
InvalidCommuneLink = Struct.new(:district_code, :name, keyword_init: true)

# Names for which find_section returns nil instead of raising when nothing is found.
MISSING_LOCATIONS = [
  "Taing Kouk District", "Bokor Municipality", "Ta Lou Senchey District",
  "Kaoh Rung Municipality", "Borei Ou Svay Senchey District"
].freeze

# [district code, link text] pairs, converted to InvalidCommuneLink structs below.
INVALID_COMMUNE_LINKS = [
  ["0301", "Prasat"],
  ["0302", "Svay Teab"],
  ["0306", "Kokor"],
  ["0306", "Krala"],
  ["0307", "Angkor Ban"],
  ["0307", "Sdau"],
  ["0308", "Koh Sotin"],
  ["0601", "Treal"],
  ["0601", "Baray"],
  ["0313", "Mean"],
  ["0313", "Lvea"],
  ["0313", "Prey Chor"],
  ["0314", "Baray"],
  ["0314", "Mean Chey"],
  ["0401", "Ponley"],
  ["0405", "Longveaek"],
  ["0405", "Saeb"],
  ["0406", "Svay Chrum"],
  ["0501", "Basedth"],
  ["0503", "Srang"],
  ["0503", "Veal"],
  ["0507", "Samraong Tong"],
  ["0510", "Mean Chey"],
  ["0510", "Phnum Touch"],
  ["0606", "Chheu Teal"],
  ["0606", "Klaeng"],
  ["0606", "Mean Chey"],
  ["0608", "Trea"],
  ["0604", "Daung"],
  ["0606", "Sandaan"],
  ["0607", "Kokoh"],
  ["0701", "Angkor Chey"],
  ["0703", "Chhouk"],
  ["0703", "Meanchey"],
  ["0708", "Kampong Bay"],
  ["0801", "Siem Reap"],
  ["0801", "Trea"],
  ["0802", "Chheu Teal"],
  ["0802", "Kokir"],
  ["0804", "Leuk Daek"],
  ["0808", "Mkak"],
  ["0809", "Ponhea Leu"],
  ["0811", "Ta Khmau"],
  ["0813", "Svay Chrum"],
  ["0904", "Smach Meanchey"],
  ["0906", "Srae Ambel"],
  ["1001", "Chhloung"],
  ["1003", "Preaek Prasab"],
  ["1003", "Tamao"],
  ["1004", "Sombo"],
  ["1006", "Sambok"],
  ["1303", "Choam Khsant"],
  ["1304", "Phnom Penh"],
  ["1305", "Ratanak"],
  ["1308", "Pahal"],
  ["1403", "Kampong Trabaek"],
  ["1403", "Prey Chhor"],
  ["1404", "Kanhchriech"],
  ["1405", "Svay Chrum"],
  ["1409", "Lvea"],
  ["1409", "Preah Sdach"],
  ["1410", "Baray"],
  ["1410", "Kampong Leav"],
  ["1411", "Takor"],
  ["1502", "Anlong Vil"],
  ["1502", "Kandieng"],
  ["1502", "Sya"],
  ["1502", "Veal"],
  ["1604", "Teun"],
  ["1606", "Poy"],
  ["1607", "Sesan"],
  ["1607", "Yatung"],
  ["1609", "Pong"],
  ["1609", "Veun Sai"],
  ["1701", "Koal"],
  ["1702", "Svay Chek"],
  ["1704", "Chi Kraeng"],
  ["1704", "Kampong Kdei"],
  ["1706", "Kralanh"],
  ["1706", "Sen Sok"],
  ["1707", "Lvea"],
  ["1707", "Reul"],
  ["1709", "Bakong"],
  ["1709", "Meanchey"],
  ["1710", "Nokor Thom"],
  ["1711", "Popel"],
  ["1713", "Svay Leu"],
  ["1801", "Koh Rong"],
  ["1802", "Ou Chrov"],
  ["1802", "Prey Nob"],
  ["1802", "Ream"],
  ["1804", "Kampong Seila"],
  ["1901", "Sdau"],
  ["1902", "Siem Bouk"],
  ["1903", "Sekong"],
  ["2001", "Chantrea"],
  ["2002", "Preah Ponlea"],
  ["2003", "Svay Chek"],
  ["2004", "Ampel"],
  ["2004", "Daung"],
  ["2004", "Kampong Trach"],
  ["2004", "Kokir"],
  ["2004", "Krasang"],
  ["2005", "Bassak"],
  ["2005", "Chheu Teal"],
  ["2005", "Svay Chrum"],
  ["2008", "Bavet"],
  ["2008", "Prasat"],
  ["2201", "Anlong Veaeng"],
  ["2401", "Pailin"],
  ["2502", "Chhouk"],
  ["2502", "Trea"],
  ["2503", "Memot"],
  ["2503", "Kokir"],
  ["2504", "Chork"],
  ["2504", "Mean"],
  ["2505", "Popel"],
  ["2507", "Chikor"]
].map { |district_code, name| InvalidCommuneLink.new(district_code:, name:) }.freeze
269
+
270
# [text on the Wikipedia page, text used for lookups in the location data] pairs,
# converted to Misspelling structs below. scrape! matches link text against
# incorrect_text; find_section matches its first candidate against correct_text.
# The "Kampot District" pair was listed twice; the duplicate is dropped, which
# leaves every `find` lookup unchanged.
MISSPELLINGS = [
  ["Kratié Province", "Kratie Province"],
  ["Mondulkiri Province", "Mondul Kiri Province"],
  ["Ratanakiri Province", "Ratanak Kiri Province"],
  ["Siem Reap Province", "Siemreap Province"],
  ["Serei Saophoan District", "Serei Saophoan Municipality"],
  ["Poipet Municipality", "Paoy Paet Municipality"],
  ["Battambang District", "Battambang Municipality"],
  ["Rotanak Mondol District", "Rotonak Mondol District"],
  ["Sampov Loun District", "Sampov Lun District"],
  ["Koh Kralor District", "Koas Krala District"],
  ["Rukhak Kiri District", "Rukh Kiri District"],
  ["Koh Sotin District", "Kaoh Soutin District"],
  ["Srey Santhor District", "Srei Santhor District"],
  ["Kong Pisey", "Kong Pisei District"],
  ["Phnom Sruoch District", "Phnum Sruoch District"],
  ["Stueng Saen District", "Stueng Saen Municipality"],
  ["Prasat Balangk District", "Prasat Ballangk District"],
  ["Kampot District", "Kampot Municipality"],
  ["Koh Thum District", "Kaoh Thum District"],
  ["Mukh Kamphool District", "Mukh Kampul District"],
  ["Ponhea Leu District", "Ponhea Lueu District"],
  ["Kiri Sakor", "Kiri Sakor District"],
  ["Koh Kong", "Kaoh Kong District"],
  ["Khemara Phoumin", "Khemara Phoumin Municipality"],
  ["Mondol Seima", "Mondol Seima District"],
  ["Srae Ambel", "Srae Ambel District"],
  ["Thma Bang", "Thma Bang District"],
  ["Kratie Municipality", "Kracheh Municipality"],
  ["Preaek Prasab District", "Prek Prasab District"],
  ["Krong Saen Monorom", "Saen Monourom Municipality"],
  ["Khan Daun Penh", "Doun Penh Section"],
  ["Khan Prampir Makara", "Prampir Meakkakra Section"],
  ["Khan Meanchey", "Mean Chey Section"],
  ["Khan Sen Sok", "Saensokh Section"],
  ["Khan Por Sen Chey", "Pur SenChey Section"],
  ["Khan Chrouy Changvar", "Chraoy Chongvar Section"],
  ["Khan Prek Phnov", "Praek Pnov Section"],
  ["Choam Khsant", "Choam Ksant District"],
  ["Kulen", "Kuleaen District"],
  ["Sangkom Thmei", "Sangkum Thmei District"],
  ["Prey Veaeng", "Prey Veng Municipality"],
  ["Por Reang", "Pur Rieng District"],
  ["Veal Veng", "Veal Veaeng District"],
  ["Krong Banlung", "Ban Lung Municipality"],
  ["Angkor Thom", "Angkor Thum District"],
  ["Sout Nikom", "Soutr Nikom District"],
  ["Steung Hav", "Stueng Hav District"],
  ["Krong Stung Treng", "Stueng Traeng Municipality"],
  ["Bourei Cholsar District", "Borei Cholsar District"],
  ["Damnak Chang'Eur", "Damnak Chang'aeur District"],
  ["Krong Keb", "Kaeb Municipality"],
  ["Sala Krao", "Sala Krau District"],
  ["Dombae", "Dambae District"],
  ["Krouch Chhma", "Krouch Chhmar District"],
  ["Paoy Char", "Poy Char"],
  ["Phnom Dei", "Phnum Dei"],
  ["Spean Sraeng Rouk", "Spean Sraeng"],
  ["Chhnuor", "Chnuor Mean Chey"],
  ["Chob", "Chob Vari"],
  ["Prasat Char", "Prasat"],
  ["Preah Netr Preah", "Preak Netr Preah"],
  ["Rohal Rohal", "Rohal"],
  ["Tuek Chour Smach", "Tuek Chour"],
  ["Ou Bei Choan", "Ou Beichoan"],
  ["Ou Sampor", "Ou Sampoar"],
  ["Poipet", "Paoy Paet"],
  ["Tuol Ta Aek", "Tuol Ta Ek"],
  # NOTE(review): same correct_text as the entry above — confirm this is intended.
  ["Preaek Preah Sdach", "Tuol Ta Ek"],
  ["Chamkar Samraong", "Chomkar Somraong"],
  ["Sla Kaet", "Sla Ket"],
  ["Ou Mal", "OMal"],
  ["Voat Kor", "wat Kor"],
  ["Svay Pao", "Svay Por"],
  ["Kdol Tahen", "Kdol Ta Haen"],
  ["Kaoh Chiveang Thvang", "Kaoh Chiveang"],
  ["Voat Ta Muem", "Vaot Ta Muem"],
  ["Ou Samrel", "Ou Samril"],
  ["Ta Krai", "Ta Krei"],
  ["Prek Chik", "Preaek Chik"],
  ["Prey Chor", "Prey Chhor"],
  ["Samraong Tong", "Samrong Tong"],
  ["Phnum Touch", "Phnom Touch"],
  ["Tonle Bassac", "Tonle Basak"],
  ["Chey Chumneas", "Chey Chummeah"],
  ["Boeung Prolit", "Boeng Proluet"],
  ["Tuol Svay Prey 2", "Tuol Svay Prey Ti Pir"],
  ["Neak Leung", "Neak Loeang"],
  ["Kratie", "Kracheh"],
  ["Pate", "Pa Te"],
  ["Prek Phtoul Commune", "Preaek Phtoul"],
  ["Bourei Cholsar Commune", "Borei Cholsar"],
  ["Kampeaeng Commune (Kiri Vong District)", "Kampeaeng"],
  ["Prey Rumdeng Commune (Kiri Vong District)", "Prey Rumdeng"],
  ["Ta Our Commune", "Ta Ou"],
  ["Kompaeng Commune", "Kampeaeng"],
  ["Kompong Reab Commune", "Kampong Reab"],
  ["Our Saray Commune", "Ou Saray"],
  ["Trapeang Kranhung Commune", "Trapeang Kranhoung"],
  ["Angk Kev Commune", "Angk Kaev"],
  ["Sanlong Commune", "Sanlung"],
  ["O Smach", "Ou Smach"]
].map { |incorrect_text, correct_text| Misspelling.new(incorrect_text:, correct_text:) }.freeze
546
+
547
# Collects a Wikipedia URL for every commune link listed under each district's
# heading on the communes page.
#
# Links listed in INVALID_COMMUNE_LINKS are skipped. When the link text matches
# no commune, the MISSPELLINGS entry for that text (if any) supplies the name to
# retry with.
#
# @return [Array<ScraperResult>] one result per matched commune link
# @raise [WebScraper::ElementNotFoundError] when a province or district section
#   cannot be located on the page
# @raise [CommuneNotFoundError] when a link matches no commune and has no
#   MISSPELLINGS entry
def scrape!
  # The province section depends only on the province, so it is located once
  # per province instead of once per district.
  province_sections = Hash.new do |cache, address_en|
    heading_link = find_section(
      text: address_en,
      section: scraper.page,
      xpath_pattern: "//h2//a[text()[contains(., \"%<text>s\")]]"
    )
    section = heading_link&.xpath("ancestor::h2/following-sibling::div")&.first
    unless section
      raise WebScraper::ElementNotFoundError,
            "No section was found for province '#{address_en}' on #{URL}"
    end

    cache[address_en] = section
  end

  District.all.each_with_object([]) do |district, result|
    district_title = find_section(
      text: [district.full_name_en, district.full_name_latin, district.name_latin],
      section: province_sections[district.province.address_en],
      xpath_pattern: "child::table//tr/td//h3//*[text()[contains(., \"%<text>s\")]]"
    )

    # find_section returns nil for districts listed in MISSING_LOCATIONS.
    next unless district_title

    district_section = district_title.xpath("ancestor::h3/following-sibling::*").first
    unless district_section
      raise WebScraper::ElementNotFoundError,
            "No element follows the heading of district '#{district.full_name_en}' on #{URL}"
    end

    district_section.xpath("child::li//a[contains(@href, '/wiki/')]").each do |link|
      link_text = link.text
      next if find_invalid_commune_link(district:, text: link_text)

      commune = begin
        find_commune(
          district:,
          names: { name_latin: link_text, full_name_en: link_text, full_name_latin: link_text }
        )
      rescue CommuneNotFoundError
        misspelling = MISSPELLINGS.find { |m| m.incorrect_text == link_text }
        raise unless misspelling

        find_commune(district:, names: { name_latin: misspelling.correct_text })
      end

      result << ScraperResult.new(code: commune.id, wikipedia: URI.join(URL, link[:href]).to_s)
    end
  end
end
601
+
602
+ private
603
+
604
# Placeholder with no behavior: accepts +district:+ and +pool:+ and returns nil.
def build_commune_links(district:, pool:)
  nil
end
605
+
606
# Returns the INVALID_COMMUNE_LINKS entry whose district code equals the
# district's id and whose name equals +text+, or nil when there is none.
def find_invalid_commune_link(district:, text:)
  district_code = district.id
  INVALID_COMMUNE_LINKS.find do |entry|
    entry.district_code == district_code && entry.name == text
  end
end
611
+
612
# Looks up a single element in +section+, trying each candidate text in order.
#
# When no candidate matches, the MISSPELLINGS entry whose correct_text equals
# the first candidate (if any) supplies one more text to try.
#
# @param text [String, Array<String>] candidate texts; the first is the default
# @param section [#xpath] node to search in
# @param xpath_pattern [String] format string with a %<text>s placeholder
# @return [Object, nil] the element found; nil when there are no candidates, or
#   when nothing matched and the first candidate is in MISSING_LOCATIONS
# @raise [WebScraper::ElementNotFoundError] when nothing matched otherwise
def find_section(text:, section:, xpath_pattern:)
  candidates = Array(text)
  default_text = candidates.first
  last_error = nil

  # Try every candidate. Remembering the last error (rather than comparing each
  # candidate to the last one by value) keeps a repeated text from ending the
  # search before the remaining candidates are tried.
  candidates.each do |candidate|
    return find_link(text: candidate, section:, xpath_pattern:)
  rescue WebScraper::ElementNotFoundError => e
    last_error = e
  end

  return unless last_error

  misspelling = MISSPELLINGS.find { |m| m.correct_text == default_text }
  return find_link(text: misspelling.incorrect_text, section:, xpath_pattern:) if misspelling
  return if MISSING_LOCATIONS.include?(default_text)

  raise last_error
end
630
+
631
# Formats +xpath_pattern+ with +text+ and returns the element it selects in
# +section+.
#
# @raise [WebScraper::ElementNotFoundError] unless exactly one element matches
def find_link(text:, section:, xpath_pattern:)
  xpath = format(xpath_pattern, text: text)
  matches = section.xpath(xpath)

  unless matches.size == 1
    raise WebScraper::ElementNotFoundError,
          "No link or many links found on #{URL} (xpath: '#{xpath}') "
  end

  matches.first
end
640
+
641
# Finds the single commune of +district+ matching one of the given names.
#
# Each attribute/value pair in +names+ is queried in order; the first query
# returning any record decides the result.
#
# @param district [#id] district whose communes are searched
# @param names [Hash{Symbol => String}] commune attribute => value to match
# @return [Object] the matching commune
# @raise [CommuneNotFoundError] when no query returns a record
# @raise [DuplicateCommuneError] when the deciding query returns several records
def find_commune(district:, names:)
  results = []
  names.each do |attribute, value|
    results = Commune.where(district_id: district.id, attribute => value)
    break unless results.empty?
  end

  # Both messages use only values available here; the previous duplicate
  # message referenced undefined locals and raised NameError instead.
  searched_names = names.values.uniq.join("', '")

  if results.empty?
    raise CommuneNotFoundError,
          "Commune '#{searched_names}' was not found in district: '#{district.id}'"
  end

  if results.size > 1
    raise DuplicateCommuneError,
          "Commune '#{searched_names}' was found more than once in district: '#{district.id}'"
  end

  results.first
end
658
+
659
# WebScraper for URL, created on first use and reused afterwards.
def scraper
  return @scraper if @scraper

  @scraper = WebScraper.new(URL)
end
662
+ end
663
+ end
664
+ end
665
+ end
@@ -0,0 +1,7 @@
1
+ module Pumi
2
+ module DataSource
3
+ end
4
+ end
5
+
6
+ require_relative "data_source/ncdd"
7
+ require_relative "data_source/wikipedia"
data/lib/pumi/location.rb CHANGED
@@ -6,7 +6,9 @@ module Pumi
6
6
  :name_latin, :full_name_latin,
7
7
  :name_en, :full_name_en,
8
8
  :address_km, :address_latin, :address_en,
9
- :administrative_unit, keyword_init: true
9
+ :administrative_unit,
10
+ :links,
11
+ keyword_init: true
10
12
  ) do
11
13
  class << self
12
14
  attr_accessor :data_store_key