pumi 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +5 -0
- data/bin/parse_data +16 -3
- data/data/communes.yml +562 -0
- data/data/districts.yml +402 -0
- data/data/provinces.yml +52 -2
- data/lib/pumi/data_file.rb +32 -0
- data/lib/pumi/data_source/ncdd.rb +100 -0
- data/lib/pumi/data_source/wikipedia.rb +665 -0
- data/lib/pumi/data_source.rb +7 -0
- data/lib/pumi/location.rb +3 -1
- data/lib/pumi/parser.rb +20 -13
- data/lib/pumi/scraper/result.rb +5 -0
- data/lib/pumi/version.rb +1 -1
- data/lib/pumi.rb +2 -1
- data/pumi.gemspec +1 -0
- metadata +21 -3
- data/lib/pumi/data_parser.rb +0 -75
@@ -0,0 +1,665 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "open-uri"
|
3
|
+
|
4
|
+
module Pumi
|
5
|
+
module DataSource
|
6
|
+
class Wikipedia
|
7
|
+
attr_reader :data_file, :scraper
|
8
|
+
|
9
|
+
# Builds a data source that merges scraped Wikipedia links into a data file.
#
# @param data_file [#read, #write] loads the location data and writes it back out
# @param scraper [#scrape!] returns the scraped results
def initialize(data_file:, scraper:)
  @data_file = data_file
  @scraper = scraper
end
|
13
|
+
|
14
|
+
# Adds the scraped Wikipedia link to every location in the data file that has a
# scraped result with the same code, then writes the data back out.
#
# Locations without a matching scraped result are left untouched.
#
# @param output_dir [String] directory handed to the data file when writing
def load_data!(output_dir: "data")
  # Index the results once instead of scanning the whole list for every location.
  # `||=` keeps the first result per code, which is what a linear `find` returned.
  results_by_code = scraped_data.each_with_object({}) do |location, index|
    index[location.code] ||= location
  end

  data.each do |code, attributes|
    location_data = results_by_code[code]
    next unless location_data

    attributes["links"] ||= {}
    attributes["links"]["wikipedia"] = location_data.wikipedia
  end

  write_data!(output_dir)
end
|
25
|
+
|
26
|
+
private
|
27
|
+
|
28
|
+
# Results returned by the scraper; the scraper runs on first use only.
def scraped_data
  @scraped_data ||= scraper.scrape!
end
|
31
|
+
|
32
|
+
# Contents of the data file, read on first use only. Memoizing matters here:
# load_data! mutates this object and write_data! later writes the same object out.
def data
  @data ||= data_file.read
end
|
35
|
+
|
36
|
+
# Writes the (possibly updated) data back through the data file.
#
# @param data_directory [String] directory handed to the data file
def write_data!(data_directory)
  data_file.write(data, data_directory:)
end
|
39
|
+
|
40
|
+
# One scraped link: the location's code and the URL of its Wikipedia article.
ScraperResult = Struct.new(:code, :wikipedia, keyword_init: true)
|
41
|
+
|
42
|
+
# Fetches a web page and exposes it as a parsed document.
class WebScraper
  # Raised by the scrapers when an expected element cannot be located on a page.
  class ElementNotFoundError < StandardError; end

  attr_reader :url

  # @param url [String] address of the page to fetch
  def initialize(url)
    @url = url
  end

  # Downloads and parses the page on first use; later calls reuse the parsed document.
  def page
    # Block form of open-uri so the response body is closed as soon as it is parsed.
    @page ||= URI.parse(url).open { |response| Nokogiri::HTML(response) }
  end
end
|
55
|
+
|
56
|
+
# Scrapes the Wikipedia article link of every province from the provinces table.
class CambodianProvincesScraper
  URL = "https://en.wikipedia.org/wiki/Provinces_of_Cambodia".freeze

  # @return [Array<ScraperResult>] one result per province
  # @raise [WebScraper::ElementNotFoundError] if the table, a province's cell or its link is missing
  def scrape!
    Province.all.map do |province|
      ScraperResult.new(code: province.id, wikipedia: find_url(province))
    end
  end

  private

  def scraper
    @scraper ||= WebScraper.new(URL)
  end

  # Finds the table cell holding the province's Khmer name and returns the absolute
  # URL of the first link in a preceding cell of the same row.
  def find_url(province)
    td = province_table_rows.xpath("child::td[contains(., '#{province.name_km}')]").first
    if td.nil?
      raise WebScraper::ElementNotFoundError,
            "No cell containing '#{province.name_km}' was found in a table on #{URL}"
    end

    link = td.xpath("preceding-sibling::td/a").first
    if link.nil?
      raise WebScraper::ElementNotFoundError,
            "No link preceding the cell containing '#{province.name_km}' was found in a table on #{URL}"
    end

    URI.join(URL, link[:href]).to_s
  end

  # Rows of the table that lists the provinces. The table is identified by looking
  # for a row that mentions the first province by both its Khmer and English name.
  def province_table_rows
    @province_table_rows ||= begin
      sample_province = Province.all.first

      sample_row = scraper.page.xpath(
        "//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]"
      ).first
      if sample_row.nil?
        raise WebScraper::ElementNotFoundError,
              "No table row containing '#{sample_province.name_km}' was found on #{URL}"
      end

      # `.//a` keeps the search inside the row; a leading `//` would search the whole
      # document and accept a matching link anywhere on the page.
      if sample_row.xpath(".//a[text()[contains(., '#{sample_province.name_en}')]]").empty?
        raise WebScraper::ElementNotFoundError,
              "No link containing '#{sample_province.name_en}' was found in a table on #{URL}"
      end

      sample_row.parent.xpath("child::tr")
    end
  end
end
|
96
|
+
|
97
|
+
# Scrapes the Wikipedia article link of each district from the list of districts.
class CambodianDistrictsScraper
  URL = "https://en.wikipedia.org/wiki/List_of_districts,_municipalities_and_sections_in_Cambodia".freeze

  # Districts for which no link can be found are left out of the result.
  #
  # @return [Array<ScraperResult>]
  # @raise [WebScraper::ElementNotFoundError] if a district's identifier matches several list items
  def scrape!
    District.all.filter_map do |district|
      url = find_url(district)
      ScraperResult.new(code: district.id, wikipedia: url) if url
    end
  end

  private

  def scraper
    @scraper ||= WebScraper.new(URL)
  end

  # Looks the district up by its id written in dash-separated pairs of characters
  # (e.g. "0102" becomes "01-02") and returns the absolute URL of the article link
  # in the matching list item, or nil when there is no item or no article link.
  def find_url(district)
    identifier = district.id.chars.each_slice(2).map(&:join).join("-")
    list_items = scraper.page.xpath("//ol/li[text()[contains(., '#{identifier}')]]")

    return if list_items.empty?

    if list_items.size > 1
      raise WebScraper::ElementNotFoundError,
            "More than one element was found with the identifier '#{identifier}' on #{URL}"
    end

    link = list_items.first.xpath("child::a[contains(@href, '/wiki/')]").first
    return unless link

    URI.join(URL, link[:href]).to_s
  end
end
|
132
|
+
|
133
|
+
class CambodianCommunesScraper
|
134
|
+
# Raised when a link's text matches no commune of the district.
class CommuneNotFoundError < StandardError; end
# Raised when a link's text matches more than one commune of the district.
class DuplicateCommuneError < StandardError; end

URL = "https://en.wikipedia.org/wiki/List_of_communes_in_Cambodia".freeze
# A text as it appears on the page (incorrect_text) paired with the text used to
# look the location up in the data (correct_text).
Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
# A link, identified by district code and link text, that the scraper skips.
InvalidCommuneLink = Struct.new(:district_code, :name, keyword_init: true)
|
140
|
+
|
141
|
+
# Locations for which find_section returns nil instead of raising when no matching
# heading is found on the page (and no misspelling is registered for them).
MISSING_LOCATIONS = [
  "Taing Kouk District",
  "Bokor Municipality",
  "Ta Lou Senchey District",
  "Kaoh Rung Municipality",
  "Borei Ou Svay Senchey District"
].freeze
|
148
|
+
|
149
|
+
# Links that scrape! skips, as [district code, link text] pairs. A link is skipped
# only when both the district's id and the link's text match an entry.
INVALID_COMMUNE_LINKS = [
  ["0301", "Prasat"],
  ["0302", "Svay Teab"],
  ["0306", "Kokor"],
  ["0306", "Krala"],
  ["0307", "Angkor Ban"],
  ["0307", "Sdau"],
  ["0308", "Koh Sotin"],
  ["0601", "Treal"],
  ["0601", "Baray"],
  ["0313", "Mean"],
  ["0313", "Lvea"],
  ["0313", "Prey Chor"],
  ["0314", "Baray"],
  ["0314", "Mean Chey"],
  ["0401", "Ponley"],
  ["0405", "Longveaek"],
  ["0405", "Saeb"],
  ["0406", "Svay Chrum"],
  ["0501", "Basedth"],
  ["0503", "Srang"],
  ["0503", "Veal"],
  ["0507", "Samraong Tong"],
  ["0510", "Mean Chey"],
  ["0510", "Phnum Touch"],
  ["0606", "Chheu Teal"],
  ["0606", "Klaeng"],
  ["0606", "Mean Chey"],
  ["0608", "Trea"],
  ["0604", "Daung"],
  ["0606", "Sandaan"],
  ["0607", "Kokoh"],
  ["0701", "Angkor Chey"],
  ["0703", "Chhouk"],
  ["0703", "Meanchey"],
  ["0708", "Kampong Bay"],
  ["0801", "Siem Reap"],
  ["0801", "Trea"],
  ["0802", "Chheu Teal"],
  ["0802", "Kokir"],
  ["0804", "Leuk Daek"],
  ["0808", "Mkak"],
  ["0809", "Ponhea Leu"],
  ["0811", "Ta Khmau"],
  ["0813", "Svay Chrum"],
  ["0904", "Smach Meanchey"],
  ["0906", "Srae Ambel"],
  ["1001", "Chhloung"],
  ["1003", "Preaek Prasab"],
  ["1003", "Tamao"],
  ["1004", "Sombo"],
  ["1006", "Sambok"],
  ["1303", "Choam Khsant"],
  ["1304", "Phnom Penh"],
  ["1305", "Ratanak"],
  ["1308", "Pahal"],
  ["1403", "Kampong Trabaek"],
  ["1403", "Prey Chhor"],
  ["1404", "Kanhchriech"],
  ["1405", "Svay Chrum"],
  ["1409", "Lvea"],
  ["1409", "Preah Sdach"],
  ["1410", "Baray"],
  ["1410", "Kampong Leav"],
  ["1411", "Takor"],
  ["1502", "Anlong Vil"],
  ["1502", "Kandieng"],
  ["1502", "Sya"],
  ["1502", "Veal"],
  ["1604", "Teun"],
  ["1606", "Poy"],
  ["1607", "Sesan"],
  ["1607", "Yatung"],
  ["1609", "Pong"],
  ["1609", "Veun Sai"],
  ["1701", "Koal"],
  ["1702", "Svay Chek"],
  ["1704", "Chi Kraeng"],
  ["1704", "Kampong Kdei"],
  ["1706", "Kralanh"],
  ["1706", "Sen Sok"],
  ["1707", "Lvea"],
  ["1707", "Reul"],
  ["1709", "Bakong"],
  ["1709", "Meanchey"],
  ["1710", "Nokor Thom"],
  ["1711", "Popel"],
  ["1713", "Svay Leu"],
  ["1801", "Koh Rong"],
  ["1802", "Ou Chrov"],
  ["1802", "Prey Nob"],
  ["1802", "Ream"],
  ["1804", "Kampong Seila"],
  ["1901", "Sdau"],
  ["1902", "Siem Bouk"],
  ["1903", "Sekong"],
  ["2001", "Chantrea"],
  ["2002", "Preah Ponlea"],
  ["2003", "Svay Chek"],
  ["2004", "Ampel"],
  ["2004", "Daung"],
  ["2004", "Kampong Trach"],
  ["2004", "Kokir"],
  ["2004", "Krasang"],
  ["2005", "Bassak"],
  ["2005", "Chheu Teal"],
  ["2005", "Svay Chrum"],
  ["2008", "Bavet"],
  ["2008", "Prasat"],
  ["2201", "Anlong Veaeng"],
  ["2401", "Pailin"],
  ["2502", "Chhouk"],
  ["2502", "Trea"],
  ["2503", "Memot"],
  ["2503", "Kokir"],
  ["2504", "Chork"],
  ["2504", "Mean"],
  ["2505", "Popel"],
  ["2507", "Chikor"]
].map { |district_code, name| InvalidCommuneLink.new(district_code:, name:) }.freeze
|
269
|
+
|
270
|
+
# Known differences between the page and the data, as [text on the page, text used
# in the data] pairs. scrape! retries a failed commune lookup with the data spelling
# of the link text; find_section retries a failed heading search with the page
# spelling. Lookups take the first matching entry, so order matters where a text
# appears more than once.
MISSPELLINGS = [
  ["Kratié Province", "Kratie Province"],
  ["Mondulkiri Province", "Mondul Kiri Province"],
  ["Ratanakiri Province", "Ratanak Kiri Province"],
  ["Siem Reap Province", "Siemreap Province"],
  ["Serei Saophoan District", "Serei Saophoan Municipality"],
  ["Poipet Municipality", "Paoy Paet Municipality"],
  ["Battambang District", "Battambang Municipality"],
  ["Rotanak Mondol District", "Rotonak Mondol District"],
  ["Sampov Loun District", "Sampov Lun District"],
  ["Koh Kralor District", "Koas Krala District"],
  ["Rukhak Kiri District", "Rukh Kiri District"],
  ["Koh Sotin District", "Kaoh Soutin District"],
  ["Srey Santhor District", "Srei Santhor District"],
  ["Kong Pisey", "Kong Pisei District"],
  ["Phnom Sruoch District", "Phnum Sruoch District"],
  ["Stueng Saen District", "Stueng Saen Municipality"],
  ["Prasat Balangk District", "Prasat Ballangk District"],
  # This entry was listed twice; one copy is enough because lookups take the first match.
  ["Kampot District", "Kampot Municipality"],
  ["Koh Thum District", "Kaoh Thum District"],
  ["Mukh Kamphool District", "Mukh Kampul District"],
  ["Ponhea Leu District", "Ponhea Lueu District"],
  ["Kiri Sakor", "Kiri Sakor District"],
  ["Koh Kong", "Kaoh Kong District"],
  ["Khemara Phoumin", "Khemara Phoumin Municipality"],
  ["Mondol Seima", "Mondol Seima District"],
  ["Srae Ambel", "Srae Ambel District"],
  ["Thma Bang", "Thma Bang District"],
  ["Kratie Municipality", "Kracheh Municipality"],
  ["Preaek Prasab District", "Prek Prasab District"],
  ["Krong Saen Monorom", "Saen Monourom Municipality"],
  ["Khan Daun Penh", "Doun Penh Section"],
  ["Khan Prampir Makara", "Prampir Meakkakra Section"],
  ["Khan Meanchey", "Mean Chey Section"],
  ["Khan Sen Sok", "Saensokh Section"],
  ["Khan Por Sen Chey", "Pur SenChey Section"],
  ["Khan Chrouy Changvar", "Chraoy Chongvar Section"],
  ["Khan Prek Phnov", "Praek Pnov Section"],
  ["Choam Khsant", "Choam Ksant District"],
  ["Kulen", "Kuleaen District"],
  ["Sangkom Thmei", "Sangkum Thmei District"],
  ["Prey Veaeng", "Prey Veng Municipality"],
  ["Por Reang", "Pur Rieng District"],
  ["Veal Veng", "Veal Veaeng District"],
  ["Krong Banlung", "Ban Lung Municipality"],
  ["Angkor Thom", "Angkor Thum District"],
  ["Sout Nikom", "Soutr Nikom District"],
  ["Steung Hav", "Stueng Hav District"],
  ["Krong Stung Treng", "Stueng Traeng Municipality"],
  ["Bourei Cholsar District", "Borei Cholsar District"],
  ["Damnak Chang'Eur", "Damnak Chang'aeur District"],
  ["Krong Keb", "Kaeb Municipality"],
  ["Sala Krao", "Sala Krau District"],
  ["Dombae", "Dambae District"],
  ["Krouch Chhma", "Krouch Chhmar District"],
  ["Paoy Char", "Poy Char"],
  ["Phnom Dei", "Phnum Dei"],
  ["Spean Sraeng Rouk", "Spean Sraeng"],
  ["Chhnuor", "Chnuor Mean Chey"],
  ["Chob", "Chob Vari"],
  ["Prasat Char", "Prasat"],
  ["Preah Netr Preah", "Preak Netr Preah"],
  ["Rohal Rohal", "Rohal"],
  ["Tuek Chour Smach", "Tuek Chour"],
  ["Ou Bei Choan", "Ou Beichoan"],
  ["Ou Sampor", "Ou Sampoar"],
  ["Poipet", "Paoy Paet"],
  ["Tuol Ta Aek", "Tuol Ta Ek"],
  # NOTE(review): maps to the same name as the entry above, which looks like a
  # copy-paste slip. Left unchanged; confirm the intended commune name.
  ["Preaek Preah Sdach", "Tuol Ta Ek"],
  ["Chamkar Samraong", "Chomkar Somraong"],
  ["Sla Kaet", "Sla Ket"],
  ["Ou Mal", "OMal"],
  ["Voat Kor", "wat Kor"],
  ["Svay Pao", "Svay Por"],
  ["Kdol Tahen", "Kdol Ta Haen"],
  ["Kaoh Chiveang Thvang", "Kaoh Chiveang"],
  ["Voat Ta Muem", "Vaot Ta Muem"],
  ["Ou Samrel", "Ou Samril"],
  ["Ta Krai", "Ta Krei"],
  ["Prek Chik", "Preaek Chik"],
  ["Prey Chor", "Prey Chhor"],
  ["Samraong Tong", "Samrong Tong"],
  ["Phnum Touch", "Phnom Touch"],
  ["Tonle Bassac", "Tonle Basak"],
  ["Chey Chumneas", "Chey Chummeah"],
  ["Boeung Prolit", "Boeng Proluet"],
  ["Tuol Svay Prey 2", "Tuol Svay Prey Ti Pir"],
  ["Neak Leung", "Neak Loeang"],
  ["Kratie", "Kracheh"],
  ["Pate", "Pa Te"],
  ["Prek Phtoul Commune", "Preaek Phtoul"],
  ["Bourei Cholsar Commune", "Borei Cholsar"],
  ["Kampeaeng Commune (Kiri Vong District)", "Kampeaeng"],
  ["Prey Rumdeng Commune (Kiri Vong District)", "Prey Rumdeng"],
  ["Ta Our Commune", "Ta Ou"],
  ["Kompaeng Commune", "Kampeaeng"],
  ["Kompong Reab Commune", "Kampong Reab"],
  ["Our Saray Commune", "Ou Saray"],
  ["Trapeang Kranhung Commune", "Trapeang Kranhoung"],
  ["Angk Kev Commune", "Angk Kaev"],
  ["Sanlong Commune", "Sanlung"],
  ["O Smach", "Ou Smach"]
].map { |incorrect_text, correct_text| Misspelling.new(incorrect_text:, correct_text:) }.freeze
|
546
|
+
|
547
|
+
# Scrapes the Wikipedia article link of every commune listed under each district.
#
# Districts whose heading cannot be found (find_section returns nil) are skipped, as
# are links registered in INVALID_COMMUNE_LINKS.
#
# @return [Array<ScraperResult>]
# @raise [WebScraper::ElementNotFoundError] if an expected section is missing from the page
# @raise [CommuneNotFoundError] if a link matches no commune, even after applying MISSPELLINGS
def scrape!
  District.all.each_with_object([]) do |district, result|
    commune_links_for(district).each do |link|
      next if find_invalid_commune_link(district:, text: link.text)

      commune = commune_for_link(district:, text: link.text)
      result << ScraperResult.new(code: commune.id, wikipedia: URI.join(URL, link[:href]).to_s)
    end
  end
end

# Article links listed under the district's heading, or an empty list when the
# district has no heading on the page.
private def commune_links_for(district)
  province_heading = find_section(
    text: district.province.address_en,
    section: scraper.page,
    xpath_pattern: "//h2//a[text()[contains(., \"%<text>s\")]]"
  )
  province_section = province_heading&.xpath("ancestor::h2/following-sibling::div")&.first
  if province_section.nil?
    raise WebScraper::ElementNotFoundError,
          "No section found for province '#{district.province.address_en}' on #{URL}"
  end

  district_title = find_section(
    text: [district.full_name_en, district.full_name_latin, district.name_latin],
    section: province_section,
    xpath_pattern: "child::table//tr/td//h3//*[text()[contains(., \"%<text>s\")]]"
  )
  return [] unless district_title

  district_section = district_title.xpath("ancestor::h3/following-sibling::*").first
  if district_section.nil?
    raise WebScraper::ElementNotFoundError,
          "No list found after the heading of district '#{district.id}' on #{URL}"
  end

  district_section.xpath("child::li//a[contains(@href, '/wiki/')]")
end

# Commune matching the link text; when the text as written on the page matches
# nothing, the lookup is retried with the spelling registered in MISSPELLINGS.
private def commune_for_link(district:, text:)
  find_commune(district:, names: { name_latin: text, full_name_en: text, full_name_latin: text })
rescue CommuneNotFoundError
  misspelling = MISSPELLINGS.find { |m| m.incorrect_text == text }
  raise unless misspelling

  find_commune(district:, names: { name_latin: misspelling.correct_text })
end
|
601
|
+
|
602
|
+
private
|
603
|
+
|
604
|
+
# NOTE(review): empty method that nothing visible in this file calls. It looks like
# a leftover stub; confirm it is unused before removing it.
def build_commune_links(district:, pool:); end
|
605
|
+
|
606
|
+
# Returns the INVALID_COMMUNE_LINKS entry registered for this district and link
# text, or nil when the link is not registered as invalid.
def find_invalid_commune_link(district:, text:)
  INVALID_COMMUNE_LINKS.find do |invalid_link|
    invalid_link.district_code == district.id && invalid_link.name == text
  end
end
|
611
|
+
|
612
|
+
# Finds the single element in `section` that matches `xpath_pattern` for one of the
# given texts.
#
# The texts are tried in order. When none matches, the first text is looked up in
# MISSPELLINGS and the search is retried with the spelling used on the page. When no
# misspelling is registered and the first text is listed in MISSING_LOCATIONS, nil
# is returned.
#
# @param text [String, Array<String>] one or more candidate texts
# @param section [#xpath] node to search in
# @param xpath_pattern [String] format string containing a `%<text>s` placeholder
# @return [Object, nil] the matching node, or nil for a known missing location
# @raise [ArgumentError] if no text is given
# @raise [WebScraper::ElementNotFoundError] if nothing matches
def find_section(text:, section:, xpath_pattern:)
  candidates = Array(text)
  raise ArgumentError, "at least one text is required" if candidates.empty?

  not_found = nil
  candidates.each do |candidate|
    return find_link(text: candidate, section:, xpath_pattern:)
  rescue WebScraper::ElementNotFoundError => e
    # Keep going: every candidate is tried, even one that equals the last candidate.
    not_found = e
  end

  expected_text = candidates.first
  misspelling = MISSPELLINGS.find { |m| m.correct_text == expected_text }
  return find_link(text: misspelling.incorrect_text, section:, xpath_pattern:) if misspelling
  return if MISSING_LOCATIONS.include?(expected_text)

  raise not_found
end
|
630
|
+
|
631
|
+
# Returns the single node in `section` matching `xpath_pattern` once `text` has been
# substituted for its `%<text>s` placeholder.
#
# @raise [WebScraper::ElementNotFoundError] if zero or several nodes match
def find_link(text:, section:, xpath_pattern:)
  xpath = format(xpath_pattern, text:)
  matches = section.xpath(xpath)

  return matches.first if matches.size == 1

  raise WebScraper::ElementNotFoundError,
        "Expected exactly one element but found #{matches.size} on #{URL} (xpath: '#{xpath}')"
end
|
640
|
+
|
641
|
+
# Finds the commune of the district that matches one of the given attributes.
#
# The attributes are tried in order and the first one that matches anything decides
# the result.
#
# @param district [#id] district the commune must belong to
# @param names [Hash{Symbol => String}] attribute name => value to search for
# @return [Object] the matching commune
# @raise [CommuneNotFoundError] if no attribute matches a commune
# @raise [DuplicateCommuneError] if the deciding attribute matches several communes
def find_commune(district:, names:)
  results = []
  names.each do |attribute, value|
    results = Commune.where(district_id: district.id, attribute => value)

    break unless results.empty?
  end

  identifier = names.values.uniq.join("', '")

  if results.empty?
    raise CommuneNotFoundError,
          "Commune '#{identifier}' was not found for district: '#{district.id}'"
  end

  if results.size > 1
    raise DuplicateCommuneError,
          "Commune '#{identifier}' was found more than once for district: '#{district.id}'"
  end

  results.first
end
|
658
|
+
|
659
|
+
# Scraper for the communes list page, created on first use and then reused.
def scraper
  @scraper ||= WebScraper.new(URL)
end
|
662
|
+
end
|
663
|
+
end
|
664
|
+
end
|
665
|
+
end
|
data/lib/pumi/location.rb
CHANGED
@@ -6,7 +6,9 @@ module Pumi
|
|
6
6
|
:name_latin, :full_name_latin,
|
7
7
|
:name_en, :full_name_en,
|
8
8
|
:address_km, :address_latin, :address_en,
|
9
|
-
:administrative_unit,
|
9
|
+
:administrative_unit,
|
10
|
+
:links,
|
11
|
+
keyword_init: true
|
10
12
|
) do
|
11
13
|
class << self
|
12
14
|
attr_accessor :data_store_key
|