pumi 0.17.0 → 0.18.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,665 @@
1
+ require "nokogiri"
2
+ require "open-uri"
3
+
4
+ module Pumi
5
+ module DataSource
6
+ class Wikipedia
7
+ attr_reader :data_file, :scraper
8
+
9
# Stores the collaborators used by #load_data!.
#
# @param data_file object whose `read` supplies the location data and whose
#   `write` persists it (see the private `data` / `write_data!` helpers)
# @param scraper object whose `scrape!` returns the scraped results
def initialize(data_file:, scraper:)
  @data_file, @scraper = data_file, scraper
end
13
+
14
# Merges scraped Wikipedia URLs into the location data and writes the result.
#
# Every entry of `data` whose code has a scraped result gets
# attributes["links"]["wikipedia"] set (creating the "links" hash if needed);
# entries without a scraped result are left untouched.
#
# @param output_dir [String] directory forwarded to `write_data!`
def load_data!(output_dir: "data")
  # Index the scraped results once instead of scanning them for every entry.
  # `||=` keeps the first result per code, matching what a linear `find` returned.
  scraped_by_code = scraped_data.each_with_object({}) do |location, index|
    index[location.code] ||= location
  end

  data.each do |code, attributes|
    location_data = scraped_by_code[code]
    next unless location_data

    attributes["links"] ||= {}
    attributes["links"]["wikipedia"] = location_data.wikipedia
  end

  write_data!(output_dir)
end
25
+
26
+ private
27
+
28
# Results of `scraper.scrape!`, computed on first use and cached in @scraped_data.
def scraped_data
  return @scraped_data if @scraped_data

  @scraped_data = scraper.scrape!
end
31
+
32
# Contents of `data_file.read`, loaded on first use and cached in @data.
def data
  return @data if @data

  @data = data_file.read
end
35
+
36
# Hands the (possibly updated) data back to the data file for writing.
#
# @param data_directory directory forwarded as the `data_directory:` keyword
def write_data!(data_directory)
  data_file.write(data, data_directory: data_directory)
end
39
+
40
+ ScraperResult = Struct.new(:code, :wikipedia, keyword_init: true)
41
+
42
# Fetches a URL and exposes the parsed HTML document.
class WebScraper
  # Raised by the scrapers when an expected element cannot be located on a page.
  class ElementNotFoundError < StandardError; end

  attr_reader :url

  # @param url [String] address of the page to fetch
  def initialize(url)
    @url = url
  end

  # Downloads and parses the page on first use; later calls return the cached document.
  #
  # The block form of open-uri's `open` is used so the underlying stream
  # (a StringIO or Tempfile) is closed as soon as parsing has finished.
  #
  # @return the document produced by Nokogiri::HTML
  def page
    @page ||= URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end
end
55
+
56
# Scrapes the Wikipedia "Provinces of Cambodia" page for each province's article URL.
class CambodianProvincesScraper
  URL = "https://en.wikipedia.org/wiki/Provinces_of_Cambodia".freeze

  # Builds one ScraperResult per province returned by Province.all.
  #
  # @return [Array<ScraperResult>] province id paired with its absolute Wikipedia URL
  # @raise [WebScraper::ElementNotFoundError] when the table row, cell or link
  #   for a province cannot be located on the page
  def scrape!
    Province.all.map do |province|
      ScraperResult.new(code: province.id, wikipedia: find_url(province))
    end
  end

  private

  def scraper
    @scraper ||= WebScraper.new(URL)
  end

  # Locates the cell containing the province's Khmer name and resolves the first
  # link found in a preceding cell of the same row against URL.
  def find_url(province)
    td = province_table_rows.xpath("child::td[contains(., '#{province.name_km}')]").first
    if td.nil?
      raise WebScraper::ElementNotFoundError,
            "No cell containing '#{province.name_km}' was found in a table on #{URL}"
    end

    link = td.xpath("preceding-sibling::td/a").first
    # Fail with the scraper's own error instead of a NoMethodError on nil.
    if link.nil?
      raise WebScraper::ElementNotFoundError,
            "No link preceding the cell containing '#{province.name_km}' was found in a table on #{URL}"
    end

    URI.join(URL, link[:href]).to_s
  end

  # Rows of the table that lists the provinces. The table is identified through
  # the first province: its Khmer name must appear in a row and that same row
  # must contain a link with its English name.
  def province_table_rows
    @province_table_rows ||= begin
      sample_province = Province.all.first

      sample_row = scraper.page.xpath("//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]").first
      if sample_row.nil?
        raise WebScraper::ElementNotFoundError,
              "No table row containing '#{sample_province.name_km}' was found on #{URL}"
      end

      # `.//a` keeps the search inside the sample row; a bare `//a` would be
      # evaluated against the whole document and match links outside the table.
      if sample_row.xpath(".//a[text()[contains(., '#{sample_province.name_en}')]]").empty?
        raise WebScraper::ElementNotFoundError,
              "No link containing '#{sample_province.name_en}' was found in a table on #{URL}"
      end

      sample_row.parent.xpath("child::tr")
    end
  end
end
96
+
97
# Scrapes the Wikipedia list of Cambodian districts, municipalities and sections
# for the article URL of each district.
class CambodianDistrictsScraper
  URL = "https://en.wikipedia.org/wiki/List_of_districts,_municipalities_and_sections_in_Cambodia".freeze

  # Builds a ScraperResult for every district a URL could be found for;
  # districts without a URL are omitted from the result.
  #
  # @return [Array<ScraperResult>]
  def scrape!
    District.all.filter_map do |district|
      wikipedia_url = find_url(district)
      ScraperResult.new(code: district.id, wikipedia: wikipedia_url) if wikipedia_url
    end
  end

  private

  def scraper
    @scraper ||= WebScraper.new(URL)
  end

  # Looks up the list item whose text contains the district id written as
  # dash-separated pairs of characters and returns the absolute URL of its
  # first child "/wiki/" link, or nil when no item or no such link exists.
  def find_url(district)
    pairs = district.id.chars.each_slice(2).map(&:join)
    identifier = pairs.join("-")
    list_items = scraper.page.xpath("//ol/li[text()[contains(., '#{identifier}')]]")

    return if list_items.empty?

    unless list_items.size <= 1
      raise WebScraper::ElementNotFoundError,
            "More than one element was found with the identifier '#{identifier}' on #{URL}"
    end

    link = list_items.first.xpath("child::a[contains(@href, '/wiki/')]").first
    URI.join(URL, link[:href]).to_s if link
  end
end
132
+
133
+ class CambodianCommunesScraper
134
+ class CommuneNotFoundError < StandardError; end
135
+ class DuplicateCommuneError < StandardError; end
136
+
137
+ URL = "https://en.wikipedia.org/wiki/List_of_communes_in_Cambodia".freeze
138
+ Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
139
+ InvalidCommuneLink = Struct.new(:district_code, :name, keyword_init: true)
140
+
141
+ MISSING_LOCATIONS = [
142
+ "Taing Kouk District",
143
+ "Bokor Municipality",
144
+ "Ta Lou Senchey District",
145
+ "Kaoh Rung Municipality",
146
+ "Borei Ou Svay Senchey District"
147
+ ].freeze
148
+
149
+ INVALID_COMMUNE_LINKS = [
150
+ InvalidCommuneLink.new(district_code: "0301", name: "Prasat"),
151
+ InvalidCommuneLink.new(district_code: "0302", name: "Svay Teab"),
152
+ InvalidCommuneLink.new(district_code: "0306", name: "Kokor"),
153
+ InvalidCommuneLink.new(district_code: "0306", name: "Krala"),
154
+ InvalidCommuneLink.new(district_code: "0307", name: "Angkor Ban"),
155
+ InvalidCommuneLink.new(district_code: "0307", name: "Sdau"),
156
+ InvalidCommuneLink.new(district_code: "0308", name: "Koh Sotin"),
157
+ InvalidCommuneLink.new(district_code: "0601", name: "Treal"),
158
+ InvalidCommuneLink.new(district_code: "0601", name: "Baray"),
159
+ InvalidCommuneLink.new(district_code: "0313", name: "Mean"),
160
+ InvalidCommuneLink.new(district_code: "0313", name: "Lvea"),
161
+ InvalidCommuneLink.new(district_code: "0313", name: "Prey Chor"),
162
+ InvalidCommuneLink.new(district_code: "0314", name: "Baray"),
163
+ InvalidCommuneLink.new(district_code: "0314", name: "Mean Chey"),
164
+ InvalidCommuneLink.new(district_code: "0401", name: "Ponley"),
165
+ InvalidCommuneLink.new(district_code: "0405", name: "Longveaek"),
166
+ InvalidCommuneLink.new(district_code: "0405", name: "Saeb"),
167
+ InvalidCommuneLink.new(district_code: "0406", name: "Svay Chrum"),
168
+ InvalidCommuneLink.new(district_code: "0501", name: "Basedth"),
169
+ InvalidCommuneLink.new(district_code: "0503", name: "Srang"),
170
+ InvalidCommuneLink.new(district_code: "0503", name: "Veal"),
171
+ InvalidCommuneLink.new(district_code: "0507", name: "Samraong Tong"),
172
+ InvalidCommuneLink.new(district_code: "0510", name: "Mean Chey"),
173
+ InvalidCommuneLink.new(district_code: "0510", name: "Phnum Touch"),
174
+ InvalidCommuneLink.new(district_code: "0606", name: "Chheu Teal"),
175
+ InvalidCommuneLink.new(district_code: "0606", name: "Klaeng"),
176
+ InvalidCommuneLink.new(district_code: "0606", name: "Mean Chey"),
177
+ InvalidCommuneLink.new(district_code: "0608", name: "Trea"),
178
+ InvalidCommuneLink.new(district_code: "0604", name: "Daung"),
179
+ InvalidCommuneLink.new(district_code: "0606", name: "Sandaan"),
180
+ InvalidCommuneLink.new(district_code: "0607", name: "Kokoh"),
181
+ InvalidCommuneLink.new(district_code: "0701", name: "Angkor Chey"),
182
+ InvalidCommuneLink.new(district_code: "0703", name: "Chhouk"),
183
+ InvalidCommuneLink.new(district_code: "0703", name: "Meanchey"),
184
+ InvalidCommuneLink.new(district_code: "0708", name: "Kampong Bay"),
185
+ InvalidCommuneLink.new(district_code: "0801", name: "Siem Reap"),
186
+ InvalidCommuneLink.new(district_code: "0801", name: "Trea"),
187
+ InvalidCommuneLink.new(district_code: "0802", name: "Chheu Teal"),
188
+ InvalidCommuneLink.new(district_code: "0802", name: "Kokir"),
189
+ InvalidCommuneLink.new(district_code: "0804", name: "Leuk Daek"),
190
+ InvalidCommuneLink.new(district_code: "0808", name: "Mkak"),
191
+ InvalidCommuneLink.new(district_code: "0809", name: "Ponhea Leu"),
192
+ InvalidCommuneLink.new(district_code: "0811", name: "Ta Khmau"),
193
+ InvalidCommuneLink.new(district_code: "0813", name: "Svay Chrum"),
194
+ InvalidCommuneLink.new(district_code: "0904", name: "Smach Meanchey"),
195
+ InvalidCommuneLink.new(district_code: "0906", name: "Srae Ambel"),
196
+ InvalidCommuneLink.new(district_code: "1001", name: "Chhloung"),
197
+ InvalidCommuneLink.new(district_code: "1003", name: "Preaek Prasab"),
198
+ InvalidCommuneLink.new(district_code: "1003", name: "Tamao"),
199
+ InvalidCommuneLink.new(district_code: "1004", name: "Sombo"),
200
+ InvalidCommuneLink.new(district_code: "1006", name: "Sambok"),
201
+ InvalidCommuneLink.new(district_code: "1303", name: "Choam Khsant"),
202
+ InvalidCommuneLink.new(district_code: "1304", name: "Phnom Penh"),
203
+ InvalidCommuneLink.new(district_code: "1305", name: "Ratanak"),
204
+ InvalidCommuneLink.new(district_code: "1308", name: "Pahal"),
205
+ InvalidCommuneLink.new(district_code: "1403", name: "Kampong Trabaek"),
206
+ InvalidCommuneLink.new(district_code: "1403", name: "Prey Chhor"),
207
+ InvalidCommuneLink.new(district_code: "1404", name: "Kanhchriech"),
208
+ InvalidCommuneLink.new(district_code: "1405", name: "Svay Chrum"),
209
+ InvalidCommuneLink.new(district_code: "1409", name: "Lvea"),
210
+ InvalidCommuneLink.new(district_code: "1409", name: "Preah Sdach"),
211
+ InvalidCommuneLink.new(district_code: "1410", name: "Baray"),
212
+ InvalidCommuneLink.new(district_code: "1410", name: "Kampong Leav"),
213
+ InvalidCommuneLink.new(district_code: "1411", name: "Takor"),
214
+ InvalidCommuneLink.new(district_code: "1502", name: "Anlong Vil"),
215
+ InvalidCommuneLink.new(district_code: "1502", name: "Kandieng"),
216
+ InvalidCommuneLink.new(district_code: "1502", name: "Sya"),
217
+ InvalidCommuneLink.new(district_code: "1502", name: "Veal"),
218
+ InvalidCommuneLink.new(district_code: "1604", name: "Teun"),
219
+ InvalidCommuneLink.new(district_code: "1606", name: "Poy"),
220
+ InvalidCommuneLink.new(district_code: "1607", name: "Sesan"),
221
+ InvalidCommuneLink.new(district_code: "1607", name: "Yatung"),
222
+ InvalidCommuneLink.new(district_code: "1609", name: "Pong"),
223
+ InvalidCommuneLink.new(district_code: "1609", name: "Veun Sai"),
224
+ InvalidCommuneLink.new(district_code: "1701", name: "Koal"),
225
+ InvalidCommuneLink.new(district_code: "1702", name: "Svay Chek"),
226
+ InvalidCommuneLink.new(district_code: "1704", name: "Chi Kraeng"),
227
+ InvalidCommuneLink.new(district_code: "1704", name: "Kampong Kdei"),
228
+ InvalidCommuneLink.new(district_code: "1706", name: "Kralanh"),
229
+ InvalidCommuneLink.new(district_code: "1706", name: "Sen Sok"),
230
+ InvalidCommuneLink.new(district_code: "1707", name: "Lvea"),
231
+ InvalidCommuneLink.new(district_code: "1707", name: "Reul"),
232
+ InvalidCommuneLink.new(district_code: "1709", name: "Bakong"),
233
+ InvalidCommuneLink.new(district_code: "1709", name: "Meanchey"),
234
+ InvalidCommuneLink.new(district_code: "1710", name: "Nokor Thom"),
235
+ InvalidCommuneLink.new(district_code: "1711", name: "Popel"),
236
+ InvalidCommuneLink.new(district_code: "1713", name: "Svay Leu"),
237
+ InvalidCommuneLink.new(district_code: "1801", name: "Koh Rong"),
238
+ InvalidCommuneLink.new(district_code: "1802", name: "Ou Chrov"),
239
+ InvalidCommuneLink.new(district_code: "1802", name: "Prey Nob"),
240
+ InvalidCommuneLink.new(district_code: "1802", name: "Ream"),
241
+ InvalidCommuneLink.new(district_code: "1804", name: "Kampong Seila"),
242
+ InvalidCommuneLink.new(district_code: "1901", name: "Sdau"),
243
+ InvalidCommuneLink.new(district_code: "1902", name: "Siem Bouk"),
244
+ InvalidCommuneLink.new(district_code: "1903", name: "Sekong"),
245
+ InvalidCommuneLink.new(district_code: "2001", name: "Chantrea"),
246
+ InvalidCommuneLink.new(district_code: "2002", name: "Preah Ponlea"),
247
+ InvalidCommuneLink.new(district_code: "2003", name: "Svay Chek"),
248
+ InvalidCommuneLink.new(district_code: "2004", name: "Ampel"),
249
+ InvalidCommuneLink.new(district_code: "2004", name: "Daung"),
250
+ InvalidCommuneLink.new(district_code: "2004", name: "Kampong Trach"),
251
+ InvalidCommuneLink.new(district_code: "2004", name: "Kokir"),
252
+ InvalidCommuneLink.new(district_code: "2004", name: "Krasang"),
253
+ InvalidCommuneLink.new(district_code: "2005", name: "Bassak"),
254
+ InvalidCommuneLink.new(district_code: "2005", name: "Chheu Teal"),
255
+ InvalidCommuneLink.new(district_code: "2005", name: "Svay Chrum"),
256
+ InvalidCommuneLink.new(district_code: "2008", name: "Bavet"),
257
+ InvalidCommuneLink.new(district_code: "2008", name: "Prasat"),
258
+ InvalidCommuneLink.new(district_code: "2201", name: "Anlong Veaeng"),
259
+ InvalidCommuneLink.new(district_code: "2401", name: "Pailin"),
260
+ InvalidCommuneLink.new(district_code: "2502", name: "Chhouk"),
261
+ InvalidCommuneLink.new(district_code: "2502", name: "Trea"),
262
+ InvalidCommuneLink.new(district_code: "2503", name: "Memot"),
263
+ InvalidCommuneLink.new(district_code: "2503", name: "Kokir"),
264
+ InvalidCommuneLink.new(district_code: "2504", name: "Chork"),
265
+ InvalidCommuneLink.new(district_code: "2504", name: "Mean"),
266
+ InvalidCommuneLink.new(district_code: "2505", name: "Popel"),
267
+ InvalidCommuneLink.new(district_code: "2507", name: "Chikor")
268
+ ].freeze
269
+
270
+ MISSPELLINGS = [
271
+ Misspelling.new(incorrect_text: "Kratié Province", correct_text: "Kratie Province"),
272
+ Misspelling.new(
273
+ incorrect_text: "Mondulkiri Province",
274
+ correct_text: "Mondul Kiri Province"
275
+ ),
276
+ Misspelling.new(
277
+ incorrect_text: "Ratanakiri Province",
278
+ correct_text: "Ratanak Kiri Province"
279
+ ),
280
+ Misspelling.new(
281
+ incorrect_text: "Siem Reap Province",
282
+ correct_text: "Siemreap Province"
283
+ ),
284
+ Misspelling.new(
285
+ incorrect_text: "Serei Saophoan District",
286
+ correct_text: "Serei Saophoan Municipality"
287
+ ),
288
+ Misspelling.new(
289
+ incorrect_text: "Poipet Municipality",
290
+ correct_text: "Paoy Paet Municipality"
291
+ ),
292
+ Misspelling.new(
293
+ incorrect_text: "Battambang District",
294
+ correct_text: "Battambang Municipality"
295
+ ),
296
+ Misspelling.new(
297
+ incorrect_text: "Rotanak Mondol District",
298
+ correct_text: "Rotonak Mondol District"
299
+ ),
300
+ Misspelling.new(
301
+ incorrect_text: "Sampov Loun District",
302
+ correct_text: "Sampov Lun District"
303
+ ),
304
+ Misspelling.new(
305
+ incorrect_text: "Koh Kralor District",
306
+ correct_text: "Koas Krala District"
307
+ ),
308
+ Misspelling.new(
309
+ incorrect_text: "Rukhak Kiri District",
310
+ correct_text: "Rukh Kiri District"
311
+ ),
312
+ Misspelling.new(
313
+ incorrect_text: "Koh Sotin District",
314
+ correct_text: "Kaoh Soutin District"
315
+ ),
316
+ Misspelling.new(
317
+ incorrect_text: "Srey Santhor District",
318
+ correct_text: "Srei Santhor District"
319
+ ),
320
+ Misspelling.new(
321
+ incorrect_text: "Kong Pisey",
322
+ correct_text: "Kong Pisei District"
323
+ ),
324
+ Misspelling.new(
325
+ incorrect_text: "Phnom Sruoch District",
326
+ correct_text: "Phnum Sruoch District"
327
+ ),
328
+ Misspelling.new(
329
+ incorrect_text: "Stueng Saen District",
330
+ correct_text: "Stueng Saen Municipality"
331
+ ),
332
+ Misspelling.new(
333
+ incorrect_text: "Prasat Balangk District",
334
+ correct_text: "Prasat Ballangk District"
335
+ ),
336
+ Misspelling.new(
337
+ incorrect_text: "Kampot District",
338
+ correct_text: "Kampot Municipality"
339
+ ),
340
+ Misspelling.new(
341
+ incorrect_text: "Kampot District",
342
+ correct_text: "Kampot Municipality"
343
+ ),
344
+ Misspelling.new(
345
+ incorrect_text: "Koh Thum District",
346
+ correct_text: "Kaoh Thum District"
347
+ ),
348
+ Misspelling.new(
349
+ incorrect_text: "Mukh Kamphool District",
350
+ correct_text: "Mukh Kampul District"
351
+ ),
352
+ Misspelling.new(
353
+ incorrect_text: "Ponhea Leu District",
354
+ correct_text: "Ponhea Lueu District"
355
+ ),
356
+ Misspelling.new(
357
+ incorrect_text: "Kiri Sakor",
358
+ correct_text: "Kiri Sakor District"
359
+ ),
360
+ Misspelling.new(
361
+ incorrect_text: "Koh Kong",
362
+ correct_text: "Kaoh Kong District"
363
+ ),
364
+ Misspelling.new(
365
+ incorrect_text: "Khemara Phoumin",
366
+ correct_text: "Khemara Phoumin Municipality"
367
+ ),
368
+ Misspelling.new(
369
+ incorrect_text: "Mondol Seima",
370
+ correct_text: "Mondol Seima District"
371
+ ),
372
+ Misspelling.new(
373
+ incorrect_text: "Srae Ambel",
374
+ correct_text: "Srae Ambel District"
375
+ ),
376
+ Misspelling.new(
377
+ incorrect_text: "Thma Bang",
378
+ correct_text: "Thma Bang District"
379
+ ),
380
+ Misspelling.new(
381
+ incorrect_text: "Kratie Municipality",
382
+ correct_text: "Kracheh Municipality"
383
+ ),
384
+ Misspelling.new(
385
+ incorrect_text: "Preaek Prasab District",
386
+ correct_text: "Prek Prasab District"
387
+ ),
388
+ Misspelling.new(
389
+ incorrect_text: "Krong Saen Monorom",
390
+ correct_text: "Saen Monourom Municipality"
391
+ ),
392
+ Misspelling.new(
393
+ incorrect_text: "Khan Daun Penh",
394
+ correct_text: "Doun Penh Section"
395
+ ),
396
+ Misspelling.new(
397
+ incorrect_text: "Khan Prampir Makara",
398
+ correct_text: "Prampir Meakkakra Section"
399
+ ),
400
+ Misspelling.new(
401
+ incorrect_text: "Khan Meanchey",
402
+ correct_text: "Mean Chey Section"
403
+ ),
404
+ Misspelling.new(
405
+ incorrect_text: "Khan Sen Sok",
406
+ correct_text: "Saensokh Section"
407
+ ),
408
+ Misspelling.new(
409
+ incorrect_text: "Khan Por Sen Chey",
410
+ correct_text: "Pur SenChey Section"
411
+ ),
412
+ Misspelling.new(
413
+ incorrect_text: "Khan Chrouy Changvar",
414
+ correct_text: "Chraoy Chongvar Section"
415
+ ),
416
+ Misspelling.new(
417
+ incorrect_text: "Khan Prek Phnov",
418
+ correct_text: "Praek Pnov Section"
419
+ ),
420
+ Misspelling.new(
421
+ incorrect_text: "Choam Khsant",
422
+ correct_text: "Choam Ksant District"
423
+ ),
424
+ Misspelling.new(
425
+ incorrect_text: "Kulen",
426
+ correct_text: "Kuleaen District"
427
+ ),
428
+ Misspelling.new(
429
+ incorrect_text: "Sangkom Thmei",
430
+ correct_text: "Sangkum Thmei District"
431
+ ),
432
+ Misspelling.new(
433
+ incorrect_text: "Prey Veaeng",
434
+ correct_text: "Prey Veng Municipality"
435
+ ),
436
+ Misspelling.new(
437
+ incorrect_text: "Por Reang",
438
+ correct_text: "Pur Rieng District"
439
+ ),
440
+ Misspelling.new(
441
+ incorrect_text: "Veal Veng",
442
+ correct_text: "Veal Veaeng District"
443
+ ),
444
+ Misspelling.new(
445
+ incorrect_text: "Krong Banlung",
446
+ correct_text: "Ban Lung Municipality"
447
+ ),
448
+ Misspelling.new(
449
+ incorrect_text: "Angkor Thom",
450
+ correct_text: "Angkor Thum District"
451
+ ),
452
+ Misspelling.new(
453
+ incorrect_text: "Sout Nikom",
454
+ correct_text: "Soutr Nikom District"
455
+ ),
456
+ Misspelling.new(
457
+ incorrect_text: "Steung Hav",
458
+ correct_text: "Stueng Hav District"
459
+ ),
460
+ Misspelling.new(
461
+ incorrect_text: "Krong Stung Treng",
462
+ correct_text: "Stueng Traeng Municipality"
463
+ ),
464
+ Misspelling.new(
465
+ incorrect_text: "Bourei Cholsar District",
466
+ correct_text: "Borei Cholsar District"
467
+ ),
468
+ Misspelling.new(
469
+ incorrect_text: "Damnak Chang'Eur",
470
+ correct_text: "Damnak Chang'aeur District"
471
+ ),
472
+ Misspelling.new(
473
+ incorrect_text: "Krong Keb",
474
+ correct_text: "Kaeb Municipality"
475
+ ),
476
+ Misspelling.new(
477
+ incorrect_text: "Sala Krao",
478
+ correct_text: "Sala Krau District"
479
+ ),
480
+ Misspelling.new(
481
+ incorrect_text: "Dombae",
482
+ correct_text: "Dambae District"
483
+ ),
484
+ Misspelling.new(
485
+ incorrect_text: "Krouch Chhma",
486
+ correct_text: "Krouch Chhmar District"
487
+ ),
488
+ Misspelling.new(incorrect_text: "Paoy Char", correct_text: "Poy Char"),
489
+ Misspelling.new(incorrect_text: "Phnom Dei", correct_text: "Phnum Dei"),
490
+ Misspelling.new(incorrect_text: "Spean Sraeng Rouk", correct_text: "Spean Sraeng"),
491
+ Misspelling.new(incorrect_text: "Chhnuor", correct_text: "Chnuor Mean Chey"),
492
+ Misspelling.new(incorrect_text: "Chob", correct_text: "Chob Vari"),
493
+ Misspelling.new(incorrect_text: "Prasat Char", correct_text: "Prasat"),
494
+ Misspelling.new(incorrect_text: "Preah Netr Preah", correct_text: "Preak Netr Preah"),
495
+ Misspelling.new(incorrect_text: "Rohal Rohal", correct_text: "Rohal"),
496
+ Misspelling.new(incorrect_text: "Tuek Chour Smach", correct_text: "Tuek Chour"),
497
+ Misspelling.new(incorrect_text: "Ou Bei Choan", correct_text: "Ou Beichoan"),
498
+ Misspelling.new(incorrect_text: "Ou Sampor", correct_text: "Ou Sampoar"),
499
+ Misspelling.new(incorrect_text: "Poipet", correct_text: "Paoy Paet"),
500
+ Misspelling.new(incorrect_text: "Tuol Ta Aek", correct_text: "Tuol Ta Ek"),
501
+ Misspelling.new(incorrect_text: "Preaek Preah Sdach", correct_text: "Tuol Ta Ek"),
502
+ Misspelling.new(incorrect_text: "Chamkar Samraong", correct_text: "Chomkar Somraong"),
503
+ Misspelling.new(incorrect_text: "Sla Kaet", correct_text: "Sla Ket"),
504
+ Misspelling.new(incorrect_text: "Ou Mal", correct_text: "OMal"),
505
+ Misspelling.new(incorrect_text: "Voat Kor", correct_text: "wat Kor"),
506
+ Misspelling.new(incorrect_text: "Svay Pao", correct_text: "Svay Por"),
507
+ Misspelling.new(incorrect_text: "Kdol Tahen", correct_text: "Kdol Ta Haen"),
508
+ Misspelling.new(incorrect_text: "Kaoh Chiveang Thvang", correct_text: "Kaoh Chiveang"),
509
+ Misspelling.new(incorrect_text: "Voat Ta Muem", correct_text: "Vaot Ta Muem"),
510
+ Misspelling.new(incorrect_text: "Ou Samrel", correct_text: "Ou Samril"),
511
+ Misspelling.new(incorrect_text: "Ta Krai", correct_text: "Ta Krei"),
512
+ Misspelling.new(incorrect_text: "Prek Chik", correct_text: "Preaek Chik"),
513
+ Misspelling.new(incorrect_text: "Prey Chor", correct_text: "Prey Chhor"),
514
+ Misspelling.new(incorrect_text: "Samraong Tong", correct_text: "Samrong Tong"),
515
+ Misspelling.new(incorrect_text: "Phnum Touch", correct_text: "Phnom Touch"),
516
+ Misspelling.new(incorrect_text: "Tonle Bassac", correct_text: "Tonle Basak"),
517
+ Misspelling.new(incorrect_text: "Chey Chumneas", correct_text: "Chey Chummeah"),
518
+ Misspelling.new(incorrect_text: "Boeung Prolit", correct_text: "Boeng Proluet"),
519
+ Misspelling.new(incorrect_text: "Tuol Svay Prey 2",
520
+ correct_text: "Tuol Svay Prey Ti Pir"),
521
+ Misspelling.new(incorrect_text: "Neak Leung", correct_text: "Neak Loeang"),
522
+ Misspelling.new(incorrect_text: "Kratie", correct_text: "Kracheh"),
523
+ Misspelling.new(incorrect_text: "Pate", correct_text: "Pa Te"),
524
+ Misspelling.new(incorrect_text: "Prek Phtoul Commune", correct_text: "Preaek Phtoul"),
525
+ Misspelling.new(incorrect_text: "Bourei Cholsar Commune", correct_text: "Borei Cholsar"),
526
+ Misspelling.new(
527
+ incorrect_text: "Kampeaeng Commune (Kiri Vong District)",
528
+ correct_text: "Kampeaeng"
529
+ ),
530
+ Misspelling.new(
531
+ incorrect_text: "Prey Rumdeng Commune (Kiri Vong District)",
532
+ correct_text: "Prey Rumdeng"
533
+ ),
534
+ Misspelling.new(incorrect_text: "Ta Our Commune", correct_text: "Ta Ou"),
535
+ Misspelling.new(incorrect_text: "Kompaeng Commune", correct_text: "Kampeaeng"),
536
+ Misspelling.new(incorrect_text: "Kompong Reab Commune", correct_text: "Kampong Reab"),
537
+ Misspelling.new(incorrect_text: "Our Saray Commune", correct_text: "Ou Saray"),
538
+ Misspelling.new(
539
+ incorrect_text: "Trapeang Kranhung Commune",
540
+ correct_text: "Trapeang Kranhoung"
541
+ ),
542
+ Misspelling.new(incorrect_text: "Angk Kev Commune", correct_text: "Angk Kaev"),
543
+ Misspelling.new(incorrect_text: "Sanlong Commune", correct_text: "Sanlung"),
544
+ Misspelling.new(incorrect_text: "O Smach", correct_text: "Ou Smach")
545
+ ].freeze
546
+
547
# Collects a ScraperResult for every commune link found under each district
# heading of the page.
#
# For each district the enclosing province section is located first, then the
# district heading inside it; districts whose heading cannot be resolved
# (find_section returned nothing) are skipped. Links listed in
# INVALID_COMMUNE_LINKS are ignored. A link text that matches no commune is
# retried with its MISSPELLINGS correction before the error is re-raised.
#
# @return [Array<ScraperResult>]
# @raise [CommuneNotFoundError] when a link matches no commune and has no correction
def scrape!
  District.all.each_with_object([]) do |district, scraped|
    province_link = find_section(
      text: district.province.address_en,
      section: scraper.page,
      xpath_pattern: "//h2//a[text()[contains(., \"%<text>s\")]]"
    )
    province_section = province_link.xpath("ancestor::h2/following-sibling::div").first

    district_title = find_section(
      text: [district.full_name_en, district.full_name_latin, district.name_latin],
      section: province_section,
      xpath_pattern: "child::table//tr/td//h3//*[text()[contains(., \"%<text>s\")]]"
    )
    next unless district_title

    district_section = district_title.xpath("ancestor::h3/following-sibling::*").first

    district_section.xpath("child::li//a[contains(@href, '/wiki/')]").each do |link|
      link_text = link.text
      next if find_invalid_commune_link(district: district, text: link_text)

      commune =
        begin
          find_commune(
            district: district,
            names: { name_latin: link_text, full_name_en: link_text, full_name_latin: link_text }
          )
        rescue CommuneNotFoundError => e
          correction = MISSPELLINGS.find { |entry| entry.incorrect_text == link_text }
          raise e unless correction

          find_commune(district: district, names: { name_latin: correction.correct_text })
        end

      scraped << ScraperResult.new(code: commune.id, wikipedia: URI.join(URL, link[:href]).to_s)
    end
  end
end
601
+
602
+ private
603
+
604
# Placeholder with no implementation: accepts the keywords and returns nil.
# NOTE(review): no caller is visible in this file - confirm whether it can be removed.
def build_commune_links(district:, pool:)
end
605
+
606
# Returns the INVALID_COMMUNE_LINKS entry registered for this district id and
# link text, or nil when the link is not listed.
def find_invalid_commune_link(district:, text:)
  INVALID_COMMUNE_LINKS.find { |entry| entry.district_code == district.id && entry.name == text }
end
611
+
612
# Finds the single element matching one of the candidate texts.
#
# Candidates are tried in order; a failed lookup is ignored unless the
# candidate equals the last one, in which case the error escapes to the
# fallback below. The fallback looks for a MISSPELLINGS entry whose
# correct_text equals the first candidate:
# * entry found       -> retry once with its incorrect_text
# * first candidate is in MISSING_LOCATIONS -> return nil
# * otherwise         -> re-raise the lookup error
#
# @param text [String, Array<String>] candidate text(s); the first is the default
# @param section node set / document the XPath is evaluated against
# @param xpath_pattern [String] format string with a %<text>s placeholder
def find_section(text:, section:, xpath_pattern:)
  candidates = Array(text)
  primary = candidates.first

  begin
    candidates.each do |candidate|
      begin
        return find_link(text: candidate, section: section, xpath_pattern: xpath_pattern)
      rescue WebScraper::ElementNotFoundError => e
        raise e if candidate == candidates.last
      end
    end
  rescue WebScraper::ElementNotFoundError => e
    known_misspelling = MISSPELLINGS.find { |entry| entry.correct_text == primary }

    if known_misspelling
      find_link(text: known_misspelling.incorrect_text, section: section, xpath_pattern: xpath_pattern)
    elsif MISSING_LOCATIONS.include?(primary)
      nil
    else
      raise e
    end
  end
end
630
+
631
# Evaluates the XPath built from the pattern and text against the section and
# returns the match when there is exactly one.
#
# @raise [WebScraper::ElementNotFoundError] when zero or several elements match
def find_link(text:, section:, xpath_pattern:)
  query = format(xpath_pattern, text: text)
  matches = section.xpath(query)

  unless matches.size == 1
    raise WebScraper::ElementNotFoundError,
          "No link or many links found on #{URL} (xpath: '#{query}') "
  end

  matches.first
end
640
+
641
# Finds the single commune of a district matching one of the given attributes.
#
# The attribute/value pairs are tried in order and the first lookup that
# returns any record decides the result.
#
# @param district object responding to `id`
# @param names [Hash] attribute name => value pairs passed to Commune.where
# @return the matching commune
# @raise [CommuneNotFoundError] when no pair matches any commune
# @raise [DuplicateCommuneError] when the deciding lookup matches several communes
def find_commune(district:, names:)
  results = []
  names.each do |attribute, value|
    results = Commune.where(district_id: district.id, attribute => value)

    break unless results.empty?
  end

  searched = names.values.uniq.join(", ")

  if results.empty?
    raise CommuneNotFoundError,
          "No commune matching '#{searched}' was found in district '#{district.id}'"
  end

  if results.size > 1
    # Built only from values in scope; the previous message interpolated
    # undefined names and raised NameError instead of this error.
    raise DuplicateCommuneError,
          "Commune '#{searched}' was found more than once in district '#{district.id}'"
  end

  results.first
end
658
+
659
# WebScraper for URL, created on first use and cached in @scraper.
def scraper
  return @scraper if @scraper

  @scraper = WebScraper.new(URL)
end
662
+ end
663
+ end
664
+ end
665
+ end
@@ -0,0 +1,7 @@
1
+ module Pumi
2
+ module DataSource
3
+ end
4
+ end
5
+
6
+ require_relative "data_source/ncdd"
7
+ require_relative "data_source/wikipedia"
data/lib/pumi/location.rb CHANGED
@@ -6,7 +6,9 @@ module Pumi
6
6
  :name_latin, :full_name_latin,
7
7
  :name_en, :full_name_en,
8
8
  :address_km, :address_latin, :address_en,
9
- :administrative_unit, keyword_init: true
9
+ :administrative_unit,
10
+ :links,
11
+ keyword_init: true
10
12
  ) do
11
13
  class << self
12
14
  attr_accessor :data_store_key