pumi 0.17.0 → 0.19.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +5 -0
- data/bin/parse_data +16 -3
- data/data/communes.yml +563 -1
- data/data/districts.yml +402 -0
- data/data/provinces.yml +52 -2
- data/lib/pumi/data_file.rb +32 -0
- data/lib/pumi/data_source/ncdd.rb +100 -0
- data/lib/pumi/data_source/wikipedia.rb +665 -0
- data/lib/pumi/data_source.rb +7 -0
- data/lib/pumi/location.rb +3 -1
- data/lib/pumi/parser.rb +20 -13
- data/lib/pumi/scraper/result.rb +5 -0
- data/lib/pumi/version.rb +1 -1
- data/lib/pumi.rb +2 -1
- data/pumi.gemspec +1 -0
- metadata +21 -3
- data/lib/pumi/data_parser.rb +0 -75
@@ -0,0 +1,665 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "open-uri"
|
3
|
+
|
4
|
+
module Pumi
|
5
|
+
module DataSource
|
6
|
+
class Wikipedia
|
7
|
+
attr_reader :data_file, :scraper
|
8
|
+
|
9
|
+
# Keeps hold of the two collaborators this data source works with.
#
# @param data_file object that is read from and written to when loading data
# @param scraper object whose #scrape! supplies the scraped results
def initialize(data_file:, scraper:)
  @scraper = scraper
  @data_file = data_file
end
|
13
|
+
|
14
|
+
# Merges the scraped Wikipedia URLs into the data read from the data file and
# writes the result out.
#
# For each code in the data that has a scraped result with the same code,
# attributes["links"]["wikipedia"] is set to that result's URL. Entries with
# no scraped result are left untouched.
#
# @param output_dir [String] directory passed on to #write_data!
# @return the value returned by #write_data!
def load_data!(output_dir: "data")
  # Index the scraped results once instead of scanning them for every entry.
  # `||=` keeps the first result per code, as the previous linear search did.
  locations_by_code = scraped_data.each_with_object({}) do |location, index|
    index[location.code] ||= location
  end

  data.each do |code, attributes|
    location_data = locations_by_code[code]
    next unless location_data

    attributes["links"] ||= {}
    attributes["links"]["wikipedia"] = location_data.wikipedia
  end

  write_data!(output_dir)
end
|
25
|
+
|
26
|
+
private
|
27
|
+
|
28
|
+
# Scraper output, fetched through scraper.scrape! on first use and reused afterwards.
def scraped_data
  return @scraped_data if @scraped_data

  @scraped_data = scraper.scrape!
end
|
31
|
+
|
32
|
+
# Contents of the data file, read through data_file.read on first use and reused afterwards.
def data
  return @data if @data

  @data = data_file.read
end
|
35
|
+
|
36
|
+
# Hands the (possibly updated) data back to the data file for writing.
#
# @param data_directory directory forwarded as the data_directory: keyword
# @return the value returned by data_file.write
def write_data!(data_directory)
  payload = data
  data_file.write(payload, data_directory: data_directory)
end
|
39
|
+
|
40
|
+
# One scraped entry: the location code together with the Wikipedia URL found for it.
ScraperResult = Struct.new(:code, :wikipedia, keyword_init: true)
|
41
|
+
|
42
|
+
# Downloads a single web page and exposes it as a parsed Nokogiri document.
class WebScraper
  # Raised by the scrapers when an expected element cannot be located on a page.
  class ElementNotFoundError < StandardError; end

  attr_reader :url

  # @param url [String] address of the page to download
  def initialize(url)
    @url = url
  end

  # Downloads and parses the page on first use; later calls return the same document.
  #
  # @return the document produced by Nokogiri::HTML
  def page
    # The block form of #open closes the downloaded stream as soon as parsing
    # is done; the non-block form left it open for the life of the process.
    @page ||= URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end
end
|
55
|
+
|
56
|
+
# Finds the Wikipedia URL of every province by locating its Khmer name in a
# table on the "Provinces of Cambodia" page.
class CambodianProvincesScraper
  URL = "https://en.wikipedia.org/wiki/Provinces_of_Cambodia".freeze

  # @return [Array<ScraperResult>] one result per entry of Province.all
  # @raise [WebScraper::ElementNotFoundError] when a province's table cell or
  #   link cannot be located
  def scrape!
    Province.all.map do |province|
      ScraperResult.new(code: province.id, wikipedia: find_url(province))
    end
  end

  private

  def scraper
    @scraper ||= WebScraper.new(URL)
  end

  # Resolves the absolute URL of the link found in a cell preceding the cell
  # that contains the province's Khmer name.
  def find_url(province)
    td = province_table_rows.xpath("child::td[contains(., '#{province.name_km}')]").first
    unless td
      raise WebScraper::ElementNotFoundError,
            "No cell containing '#{province.name_km}' was found in a table on #{URL}"
    end

    link = td.xpath("preceding-sibling::td/a").first
    # Without this guard a row lacking the link failed with NoMethodError on nil.
    unless link
      raise WebScraper::ElementNotFoundError,
            "No link preceding the cell containing '#{province.name_km}' was found in a table on #{URL}"
    end

    URI.join(URL, link[:href]).to_s
  end

  # Rows of the table holding the provinces. The table is identified through
  # the row that mentions the first province's Khmer name.
  def province_table_rows
    @province_table_rows ||= begin
      sample_province = Province.all.first

      sample_row = scraper.page.xpath("//table/tbody/tr[td//text()[contains(., '#{sample_province.name_km}')]]").first
      # Without this guard a page lacking the row failed with NoMethodError on nil.
      unless sample_row
        raise WebScraper::ElementNotFoundError,
              "No table row containing '#{sample_province.name_km}' was found on #{URL}"
      end

      # NOTE(review): a path starting with `//` is evaluated from the document
      # root, not from sample_row, so this check passes when the link exists
      # anywhere on the page. Presumably `.//a` was intended; confirm before changing.
      if sample_row.xpath("//a[text()[contains(., '#{sample_province.name_en}')]]").empty?
        raise WebScraper::ElementNotFoundError,
              "No link containing '#{sample_province.name_en}' was found in a table on #{URL}"
      end

      sample_row.parent.xpath("child::tr")
    end
  end
end
|
96
|
+
|
97
|
+
# Finds district Wikipedia URLs on the list page by matching each district's
# dash-separated code inside the page's ordered lists.
class CambodianDistrictsScraper
  URL = "https://en.wikipedia.org/wiki/List_of_districts,_municipalities_and_sections_in_Cambodia".freeze

  # @return [Array<ScraperResult>] results for the districts a URL was found for
  def scrape!
    District.all.filter_map do |district|
      wikipedia_url = find_url(district)
      ScraperResult.new(code: district.id, wikipedia: wikipedia_url) if wikipedia_url
    end
  end

  private

  def scraper
    @scraper ||= WebScraper.new(URL)
  end

  # Returns the absolute URL of the district's /wiki/ link, or nil when the
  # page has no list item for the district or the item carries no such link.
  def find_url(district)
    # The id is split into two-character groups, e.g. "0102" becomes "01-02".
    identifier = district.id.scan(/.{1,2}/m).join("-")
    matches = scraper.page.xpath("//ol/li[text()[contains(., '#{identifier}')]]")

    if matches.size > 1
      raise WebScraper::ElementNotFoundError,
            "More than one element was found with the identifier '#{identifier}' on #{URL}"
    end

    link = matches.first&.xpath("child::a[contains(@href, '/wiki/')]")&.first
    link && URI.join(URL, link[:href]).to_s
  end
end
|
132
|
+
|
133
|
+
class CambodianCommunesScraper
|
134
|
+
# Raised by #find_commune when no commune matches any of the given names.
class CommuneNotFoundError < StandardError; end
|
135
|
+
# Raised by #find_commune when a name lookup returns more than one commune.
class DuplicateCommuneError < StandardError; end
|
136
|
+
|
137
|
+
URL = "https://en.wikipedia.org/wiki/List_of_communes_in_Cambodia".freeze
|
138
|
+
# Pairs a text as it is searched for on the page (incorrect_text) with the
# text used for lookups in the data (correct_text).
Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
|
139
|
+
# Identifies a commune link, by district code and link text, that #scrape! skips.
InvalidCommuneLink = Struct.new(:district_code, :name, keyword_init: true)
|
140
|
+
|
141
|
+
MISSING_LOCATIONS = [
|
142
|
+
"Taing Kouk District",
|
143
|
+
"Bokor Municipality",
|
144
|
+
"Ta Lou Senchey District",
|
145
|
+
"Kaoh Rung Municipality",
|
146
|
+
"Borei Ou Svay Senchey District"
|
147
|
+
].freeze
|
148
|
+
|
149
|
+
INVALID_COMMUNE_LINKS = [
|
150
|
+
InvalidCommuneLink.new(district_code: "0301", name: "Prasat"),
|
151
|
+
InvalidCommuneLink.new(district_code: "0302", name: "Svay Teab"),
|
152
|
+
InvalidCommuneLink.new(district_code: "0306", name: "Kokor"),
|
153
|
+
InvalidCommuneLink.new(district_code: "0306", name: "Krala"),
|
154
|
+
InvalidCommuneLink.new(district_code: "0307", name: "Angkor Ban"),
|
155
|
+
InvalidCommuneLink.new(district_code: "0307", name: "Sdau"),
|
156
|
+
InvalidCommuneLink.new(district_code: "0308", name: "Koh Sotin"),
|
157
|
+
InvalidCommuneLink.new(district_code: "0601", name: "Treal"),
|
158
|
+
InvalidCommuneLink.new(district_code: "0601", name: "Baray"),
|
159
|
+
InvalidCommuneLink.new(district_code: "0313", name: "Mean"),
|
160
|
+
InvalidCommuneLink.new(district_code: "0313", name: "Lvea"),
|
161
|
+
InvalidCommuneLink.new(district_code: "0313", name: "Prey Chor"),
|
162
|
+
InvalidCommuneLink.new(district_code: "0314", name: "Baray"),
|
163
|
+
InvalidCommuneLink.new(district_code: "0314", name: "Mean Chey"),
|
164
|
+
InvalidCommuneLink.new(district_code: "0401", name: "Ponley"),
|
165
|
+
InvalidCommuneLink.new(district_code: "0405", name: "Longveaek"),
|
166
|
+
InvalidCommuneLink.new(district_code: "0405", name: "Saeb"),
|
167
|
+
InvalidCommuneLink.new(district_code: "0406", name: "Svay Chrum"),
|
168
|
+
InvalidCommuneLink.new(district_code: "0501", name: "Basedth"),
|
169
|
+
InvalidCommuneLink.new(district_code: "0503", name: "Srang"),
|
170
|
+
InvalidCommuneLink.new(district_code: "0503", name: "Veal"),
|
171
|
+
InvalidCommuneLink.new(district_code: "0507", name: "Samraong Tong"),
|
172
|
+
InvalidCommuneLink.new(district_code: "0510", name: "Mean Chey"),
|
173
|
+
InvalidCommuneLink.new(district_code: "0510", name: "Phnum Touch"),
|
174
|
+
InvalidCommuneLink.new(district_code: "0606", name: "Chheu Teal"),
|
175
|
+
InvalidCommuneLink.new(district_code: "0606", name: "Klaeng"),
|
176
|
+
InvalidCommuneLink.new(district_code: "0606", name: "Mean Chey"),
|
177
|
+
InvalidCommuneLink.new(district_code: "0608", name: "Trea"),
|
178
|
+
InvalidCommuneLink.new(district_code: "0604", name: "Daung"),
|
179
|
+
InvalidCommuneLink.new(district_code: "0606", name: "Sandaan"),
|
180
|
+
InvalidCommuneLink.new(district_code: "0607", name: "Kokoh"),
|
181
|
+
InvalidCommuneLink.new(district_code: "0701", name: "Angkor Chey"),
|
182
|
+
InvalidCommuneLink.new(district_code: "0703", name: "Chhouk"),
|
183
|
+
InvalidCommuneLink.new(district_code: "0703", name: "Meanchey"),
|
184
|
+
InvalidCommuneLink.new(district_code: "0708", name: "Kampong Bay"),
|
185
|
+
InvalidCommuneLink.new(district_code: "0801", name: "Siem Reap"),
|
186
|
+
InvalidCommuneLink.new(district_code: "0801", name: "Trea"),
|
187
|
+
InvalidCommuneLink.new(district_code: "0802", name: "Chheu Teal"),
|
188
|
+
InvalidCommuneLink.new(district_code: "0802", name: "Kokir"),
|
189
|
+
InvalidCommuneLink.new(district_code: "0804", name: "Leuk Daek"),
|
190
|
+
InvalidCommuneLink.new(district_code: "0808", name: "Mkak"),
|
191
|
+
InvalidCommuneLink.new(district_code: "0809", name: "Ponhea Leu"),
|
192
|
+
InvalidCommuneLink.new(district_code: "0811", name: "Ta Khmau"),
|
193
|
+
InvalidCommuneLink.new(district_code: "0813", name: "Svay Chrum"),
|
194
|
+
InvalidCommuneLink.new(district_code: "0904", name: "Smach Meanchey"),
|
195
|
+
InvalidCommuneLink.new(district_code: "0906", name: "Srae Ambel"),
|
196
|
+
InvalidCommuneLink.new(district_code: "1001", name: "Chhloung"),
|
197
|
+
InvalidCommuneLink.new(district_code: "1003", name: "Preaek Prasab"),
|
198
|
+
InvalidCommuneLink.new(district_code: "1003", name: "Tamao"),
|
199
|
+
InvalidCommuneLink.new(district_code: "1004", name: "Sombo"),
|
200
|
+
InvalidCommuneLink.new(district_code: "1006", name: "Sambok"),
|
201
|
+
InvalidCommuneLink.new(district_code: "1303", name: "Choam Khsant"),
|
202
|
+
InvalidCommuneLink.new(district_code: "1304", name: "Phnom Penh"),
|
203
|
+
InvalidCommuneLink.new(district_code: "1305", name: "Ratanak"),
|
204
|
+
InvalidCommuneLink.new(district_code: "1308", name: "Pahal"),
|
205
|
+
InvalidCommuneLink.new(district_code: "1403", name: "Kampong Trabaek"),
|
206
|
+
InvalidCommuneLink.new(district_code: "1403", name: "Prey Chhor"),
|
207
|
+
InvalidCommuneLink.new(district_code: "1404", name: "Kanhchriech"),
|
208
|
+
InvalidCommuneLink.new(district_code: "1405", name: "Svay Chrum"),
|
209
|
+
InvalidCommuneLink.new(district_code: "1409", name: "Lvea"),
|
210
|
+
InvalidCommuneLink.new(district_code: "1409", name: "Preah Sdach"),
|
211
|
+
InvalidCommuneLink.new(district_code: "1410", name: "Baray"),
|
212
|
+
InvalidCommuneLink.new(district_code: "1410", name: "Kampong Leav"),
|
213
|
+
InvalidCommuneLink.new(district_code: "1411", name: "Takor"),
|
214
|
+
InvalidCommuneLink.new(district_code: "1502", name: "Anlong Vil"),
|
215
|
+
InvalidCommuneLink.new(district_code: "1502", name: "Kandieng"),
|
216
|
+
InvalidCommuneLink.new(district_code: "1502", name: "Sya"),
|
217
|
+
InvalidCommuneLink.new(district_code: "1502", name: "Veal"),
|
218
|
+
InvalidCommuneLink.new(district_code: "1604", name: "Teun"),
|
219
|
+
InvalidCommuneLink.new(district_code: "1606", name: "Poy"),
|
220
|
+
InvalidCommuneLink.new(district_code: "1607", name: "Sesan"),
|
221
|
+
InvalidCommuneLink.new(district_code: "1607", name: "Yatung"),
|
222
|
+
InvalidCommuneLink.new(district_code: "1609", name: "Pong"),
|
223
|
+
InvalidCommuneLink.new(district_code: "1609", name: "Veun Sai"),
|
224
|
+
InvalidCommuneLink.new(district_code: "1701", name: "Koal"),
|
225
|
+
InvalidCommuneLink.new(district_code: "1702", name: "Svay Chek"),
|
226
|
+
InvalidCommuneLink.new(district_code: "1704", name: "Chi Kraeng"),
|
227
|
+
InvalidCommuneLink.new(district_code: "1704", name: "Kampong Kdei"),
|
228
|
+
InvalidCommuneLink.new(district_code: "1706", name: "Kralanh"),
|
229
|
+
InvalidCommuneLink.new(district_code: "1706", name: "Sen Sok"),
|
230
|
+
InvalidCommuneLink.new(district_code: "1707", name: "Lvea"),
|
231
|
+
InvalidCommuneLink.new(district_code: "1707", name: "Reul"),
|
232
|
+
InvalidCommuneLink.new(district_code: "1709", name: "Bakong"),
|
233
|
+
InvalidCommuneLink.new(district_code: "1709", name: "Meanchey"),
|
234
|
+
InvalidCommuneLink.new(district_code: "1710", name: "Nokor Thom"),
|
235
|
+
InvalidCommuneLink.new(district_code: "1711", name: "Popel"),
|
236
|
+
InvalidCommuneLink.new(district_code: "1713", name: "Svay Leu"),
|
237
|
+
InvalidCommuneLink.new(district_code: "1801", name: "Koh Rong"),
|
238
|
+
InvalidCommuneLink.new(district_code: "1802", name: "Ou Chrov"),
|
239
|
+
InvalidCommuneLink.new(district_code: "1802", name: "Prey Nob"),
|
240
|
+
InvalidCommuneLink.new(district_code: "1802", name: "Ream"),
|
241
|
+
InvalidCommuneLink.new(district_code: "1804", name: "Kampong Seila"),
|
242
|
+
InvalidCommuneLink.new(district_code: "1901", name: "Sdau"),
|
243
|
+
InvalidCommuneLink.new(district_code: "1902", name: "Siem Bouk"),
|
244
|
+
InvalidCommuneLink.new(district_code: "1903", name: "Sekong"),
|
245
|
+
InvalidCommuneLink.new(district_code: "2001", name: "Chantrea"),
|
246
|
+
InvalidCommuneLink.new(district_code: "2002", name: "Preah Ponlea"),
|
247
|
+
InvalidCommuneLink.new(district_code: "2003", name: "Svay Chek"),
|
248
|
+
InvalidCommuneLink.new(district_code: "2004", name: "Ampel"),
|
249
|
+
InvalidCommuneLink.new(district_code: "2004", name: "Daung"),
|
250
|
+
InvalidCommuneLink.new(district_code: "2004", name: "Kampong Trach"),
|
251
|
+
InvalidCommuneLink.new(district_code: "2004", name: "Kokir"),
|
252
|
+
InvalidCommuneLink.new(district_code: "2004", name: "Krasang"),
|
253
|
+
InvalidCommuneLink.new(district_code: "2005", name: "Bassak"),
|
254
|
+
InvalidCommuneLink.new(district_code: "2005", name: "Chheu Teal"),
|
255
|
+
InvalidCommuneLink.new(district_code: "2005", name: "Svay Chrum"),
|
256
|
+
InvalidCommuneLink.new(district_code: "2008", name: "Bavet"),
|
257
|
+
InvalidCommuneLink.new(district_code: "2008", name: "Prasat"),
|
258
|
+
InvalidCommuneLink.new(district_code: "2201", name: "Anlong Veaeng"),
|
259
|
+
InvalidCommuneLink.new(district_code: "2401", name: "Pailin"),
|
260
|
+
InvalidCommuneLink.new(district_code: "2502", name: "Chhouk"),
|
261
|
+
InvalidCommuneLink.new(district_code: "2502", name: "Trea"),
|
262
|
+
InvalidCommuneLink.new(district_code: "2503", name: "Memot"),
|
263
|
+
InvalidCommuneLink.new(district_code: "2503", name: "Kokir"),
|
264
|
+
InvalidCommuneLink.new(district_code: "2504", name: "Chork"),
|
265
|
+
InvalidCommuneLink.new(district_code: "2504", name: "Mean"),
|
266
|
+
InvalidCommuneLink.new(district_code: "2505", name: "Popel"),
|
267
|
+
InvalidCommuneLink.new(district_code: "2507", name: "Chikor")
|
268
|
+
].freeze
|
269
|
+
|
270
|
+
MISSPELLINGS = [
|
271
|
+
Misspelling.new(incorrect_text: "Kratié Province", correct_text: "Kratie Province"),
|
272
|
+
Misspelling.new(
|
273
|
+
incorrect_text: "Mondulkiri Province",
|
274
|
+
correct_text: "Mondul Kiri Province"
|
275
|
+
),
|
276
|
+
Misspelling.new(
|
277
|
+
incorrect_text: "Ratanakiri Province",
|
278
|
+
correct_text: "Ratanak Kiri Province"
|
279
|
+
),
|
280
|
+
Misspelling.new(
|
281
|
+
incorrect_text: "Siem Reap Province",
|
282
|
+
correct_text: "Siemreap Province"
|
283
|
+
),
|
284
|
+
Misspelling.new(
|
285
|
+
incorrect_text: "Serei Saophoan District",
|
286
|
+
correct_text: "Serei Saophoan Municipality"
|
287
|
+
),
|
288
|
+
Misspelling.new(
|
289
|
+
incorrect_text: "Poipet Municipality",
|
290
|
+
correct_text: "Paoy Paet Municipality"
|
291
|
+
),
|
292
|
+
Misspelling.new(
|
293
|
+
incorrect_text: "Battambang District",
|
294
|
+
correct_text: "Battambang Municipality"
|
295
|
+
),
|
296
|
+
Misspelling.new(
|
297
|
+
incorrect_text: "Rotanak Mondol District",
|
298
|
+
correct_text: "Rotonak Mondol District"
|
299
|
+
),
|
300
|
+
Misspelling.new(
|
301
|
+
incorrect_text: "Sampov Loun District",
|
302
|
+
correct_text: "Sampov Lun District"
|
303
|
+
),
|
304
|
+
Misspelling.new(
|
305
|
+
incorrect_text: "Koh Kralor District",
|
306
|
+
correct_text: "Koas Krala District"
|
307
|
+
),
|
308
|
+
Misspelling.new(
|
309
|
+
incorrect_text: "Rukhak Kiri District",
|
310
|
+
correct_text: "Rukh Kiri District"
|
311
|
+
),
|
312
|
+
Misspelling.new(
|
313
|
+
incorrect_text: "Koh Sotin District",
|
314
|
+
correct_text: "Kaoh Soutin District"
|
315
|
+
),
|
316
|
+
Misspelling.new(
|
317
|
+
incorrect_text: "Srey Santhor District",
|
318
|
+
correct_text: "Srei Santhor District"
|
319
|
+
),
|
320
|
+
Misspelling.new(
|
321
|
+
incorrect_text: "Kong Pisey",
|
322
|
+
correct_text: "Kong Pisei District"
|
323
|
+
),
|
324
|
+
Misspelling.new(
|
325
|
+
incorrect_text: "Phnom Sruoch District",
|
326
|
+
correct_text: "Phnum Sruoch District"
|
327
|
+
),
|
328
|
+
Misspelling.new(
|
329
|
+
incorrect_text: "Stueng Saen District",
|
330
|
+
correct_text: "Stueng Saen Municipality"
|
331
|
+
),
|
332
|
+
Misspelling.new(
|
333
|
+
incorrect_text: "Prasat Balangk District",
|
334
|
+
correct_text: "Prasat Ballangk District"
|
335
|
+
),
|
336
|
+
Misspelling.new(
|
337
|
+
incorrect_text: "Kampot District",
|
338
|
+
correct_text: "Kampot Municipality"
|
339
|
+
),
|
340
|
+
Misspelling.new(
|
341
|
+
incorrect_text: "Kampot District",
|
342
|
+
correct_text: "Kampot Municipality"
|
343
|
+
),
|
344
|
+
Misspelling.new(
|
345
|
+
incorrect_text: "Koh Thum District",
|
346
|
+
correct_text: "Kaoh Thum District"
|
347
|
+
),
|
348
|
+
Misspelling.new(
|
349
|
+
incorrect_text: "Mukh Kamphool District",
|
350
|
+
correct_text: "Mukh Kampul District"
|
351
|
+
),
|
352
|
+
Misspelling.new(
|
353
|
+
incorrect_text: "Ponhea Leu District",
|
354
|
+
correct_text: "Ponhea Lueu District"
|
355
|
+
),
|
356
|
+
Misspelling.new(
|
357
|
+
incorrect_text: "Kiri Sakor",
|
358
|
+
correct_text: "Kiri Sakor District"
|
359
|
+
),
|
360
|
+
Misspelling.new(
|
361
|
+
incorrect_text: "Koh Kong",
|
362
|
+
correct_text: "Kaoh Kong District"
|
363
|
+
),
|
364
|
+
Misspelling.new(
|
365
|
+
incorrect_text: "Khemara Phoumin",
|
366
|
+
correct_text: "Khemara Phoumin Municipality"
|
367
|
+
),
|
368
|
+
Misspelling.new(
|
369
|
+
incorrect_text: "Mondol Seima",
|
370
|
+
correct_text: "Mondol Seima District"
|
371
|
+
),
|
372
|
+
Misspelling.new(
|
373
|
+
incorrect_text: "Srae Ambel",
|
374
|
+
correct_text: "Srae Ambel District"
|
375
|
+
),
|
376
|
+
Misspelling.new(
|
377
|
+
incorrect_text: "Thma Bang",
|
378
|
+
correct_text: "Thma Bang District"
|
379
|
+
),
|
380
|
+
Misspelling.new(
|
381
|
+
incorrect_text: "Kratie Municipality",
|
382
|
+
correct_text: "Kracheh Municipality"
|
383
|
+
),
|
384
|
+
Misspelling.new(
|
385
|
+
incorrect_text: "Preaek Prasab District",
|
386
|
+
correct_text: "Prek Prasab District"
|
387
|
+
),
|
388
|
+
Misspelling.new(
|
389
|
+
incorrect_text: "Krong Saen Monorom",
|
390
|
+
correct_text: "Saen Monourom Municipality"
|
391
|
+
),
|
392
|
+
Misspelling.new(
|
393
|
+
incorrect_text: "Khan Daun Penh",
|
394
|
+
correct_text: "Doun Penh Section"
|
395
|
+
),
|
396
|
+
Misspelling.new(
|
397
|
+
incorrect_text: "Khan Prampir Makara",
|
398
|
+
correct_text: "Prampir Meakkakra Section"
|
399
|
+
),
|
400
|
+
Misspelling.new(
|
401
|
+
incorrect_text: "Khan Meanchey",
|
402
|
+
correct_text: "Mean Chey Section"
|
403
|
+
),
|
404
|
+
Misspelling.new(
|
405
|
+
incorrect_text: "Khan Sen Sok",
|
406
|
+
correct_text: "Saensokh Section"
|
407
|
+
),
|
408
|
+
Misspelling.new(
|
409
|
+
incorrect_text: "Khan Por Sen Chey",
|
410
|
+
correct_text: "Pur SenChey Section"
|
411
|
+
),
|
412
|
+
Misspelling.new(
|
413
|
+
incorrect_text: "Khan Chrouy Changvar",
|
414
|
+
correct_text: "Chraoy Chongvar Section"
|
415
|
+
),
|
416
|
+
Misspelling.new(
|
417
|
+
incorrect_text: "Khan Prek Phnov",
|
418
|
+
correct_text: "Praek Pnov Section"
|
419
|
+
),
|
420
|
+
Misspelling.new(
|
421
|
+
incorrect_text: "Choam Khsant",
|
422
|
+
correct_text: "Choam Ksant District"
|
423
|
+
),
|
424
|
+
Misspelling.new(
|
425
|
+
incorrect_text: "Kulen",
|
426
|
+
correct_text: "Kuleaen District"
|
427
|
+
),
|
428
|
+
Misspelling.new(
|
429
|
+
incorrect_text: "Sangkom Thmei",
|
430
|
+
correct_text: "Sangkum Thmei District"
|
431
|
+
),
|
432
|
+
Misspelling.new(
|
433
|
+
incorrect_text: "Prey Veaeng",
|
434
|
+
correct_text: "Prey Veng Municipality"
|
435
|
+
),
|
436
|
+
Misspelling.new(
|
437
|
+
incorrect_text: "Por Reang",
|
438
|
+
correct_text: "Pur Rieng District"
|
439
|
+
),
|
440
|
+
Misspelling.new(
|
441
|
+
incorrect_text: "Veal Veng",
|
442
|
+
correct_text: "Veal Veaeng District"
|
443
|
+
),
|
444
|
+
Misspelling.new(
|
445
|
+
incorrect_text: "Krong Banlung",
|
446
|
+
correct_text: "Ban Lung Municipality"
|
447
|
+
),
|
448
|
+
Misspelling.new(
|
449
|
+
incorrect_text: "Angkor Thom",
|
450
|
+
correct_text: "Angkor Thum District"
|
451
|
+
),
|
452
|
+
Misspelling.new(
|
453
|
+
incorrect_text: "Sout Nikom",
|
454
|
+
correct_text: "Soutr Nikom District"
|
455
|
+
),
|
456
|
+
Misspelling.new(
|
457
|
+
incorrect_text: "Steung Hav",
|
458
|
+
correct_text: "Stueng Hav District"
|
459
|
+
),
|
460
|
+
Misspelling.new(
|
461
|
+
incorrect_text: "Krong Stung Treng",
|
462
|
+
correct_text: "Stueng Traeng Municipality"
|
463
|
+
),
|
464
|
+
Misspelling.new(
|
465
|
+
incorrect_text: "Bourei Cholsar District",
|
466
|
+
correct_text: "Borei Cholsar District"
|
467
|
+
),
|
468
|
+
Misspelling.new(
|
469
|
+
incorrect_text: "Damnak Chang'Eur",
|
470
|
+
correct_text: "Damnak Chang'aeur District"
|
471
|
+
),
|
472
|
+
Misspelling.new(
|
473
|
+
incorrect_text: "Krong Keb",
|
474
|
+
correct_text: "Kaeb Municipality"
|
475
|
+
),
|
476
|
+
Misspelling.new(
|
477
|
+
incorrect_text: "Sala Krao",
|
478
|
+
correct_text: "Sala Krau District"
|
479
|
+
),
|
480
|
+
Misspelling.new(
|
481
|
+
incorrect_text: "Dombae",
|
482
|
+
correct_text: "Dambae District"
|
483
|
+
),
|
484
|
+
Misspelling.new(
|
485
|
+
incorrect_text: "Krouch Chhma",
|
486
|
+
correct_text: "Krouch Chhmar District"
|
487
|
+
),
|
488
|
+
Misspelling.new(incorrect_text: "Paoy Char", correct_text: "Poy Char"),
|
489
|
+
Misspelling.new(incorrect_text: "Phnom Dei", correct_text: "Phnum Dei"),
|
490
|
+
Misspelling.new(incorrect_text: "Spean Sraeng Rouk", correct_text: "Spean Sraeng"),
|
491
|
+
Misspelling.new(incorrect_text: "Chhnuor", correct_text: "Chnuor Mean Chey"),
|
492
|
+
Misspelling.new(incorrect_text: "Chob", correct_text: "Chob Vari"),
|
493
|
+
Misspelling.new(incorrect_text: "Prasat Char", correct_text: "Prasat"),
|
494
|
+
Misspelling.new(incorrect_text: "Preah Netr Preah", correct_text: "Preak Netr Preah"),
|
495
|
+
Misspelling.new(incorrect_text: "Rohal Rohal", correct_text: "Rohal"),
|
496
|
+
Misspelling.new(incorrect_text: "Tuek Chour Smach", correct_text: "Tuek Chour"),
|
497
|
+
Misspelling.new(incorrect_text: "Ou Bei Choan", correct_text: "Ou Beichoan"),
|
498
|
+
Misspelling.new(incorrect_text: "Ou Sampor", correct_text: "Ou Sampoar"),
|
499
|
+
Misspelling.new(incorrect_text: "Poipet", correct_text: "Paoy Paet"),
|
500
|
+
Misspelling.new(incorrect_text: "Tuol Ta Aek", correct_text: "Tuol Ta Ek"),
|
501
|
+
Misspelling.new(incorrect_text: "Preaek Preah Sdach", correct_text: "Tuol Ta Ek"),
|
502
|
+
Misspelling.new(incorrect_text: "Chamkar Samraong", correct_text: "Chomkar Somraong"),
|
503
|
+
Misspelling.new(incorrect_text: "Sla Kaet", correct_text: "Sla Ket"),
|
504
|
+
Misspelling.new(incorrect_text: "Ou Mal", correct_text: "OMal"),
|
505
|
+
Misspelling.new(incorrect_text: "Voat Kor", correct_text: "wat Kor"),
|
506
|
+
Misspelling.new(incorrect_text: "Svay Pao", correct_text: "Svay Por"),
|
507
|
+
Misspelling.new(incorrect_text: "Kdol Tahen", correct_text: "Kdol Ta Haen"),
|
508
|
+
Misspelling.new(incorrect_text: "Kaoh Chiveang Thvang", correct_text: "Kaoh Chiveang"),
|
509
|
+
Misspelling.new(incorrect_text: "Voat Ta Muem", correct_text: "Vaot Ta Muem"),
|
510
|
+
Misspelling.new(incorrect_text: "Ou Samrel", correct_text: "Ou Samril"),
|
511
|
+
Misspelling.new(incorrect_text: "Ta Krai", correct_text: "Ta Krei"),
|
512
|
+
Misspelling.new(incorrect_text: "Prek Chik", correct_text: "Preaek Chik"),
|
513
|
+
Misspelling.new(incorrect_text: "Prey Chor", correct_text: "Prey Chhor"),
|
514
|
+
Misspelling.new(incorrect_text: "Samraong Tong", correct_text: "Samrong Tong"),
|
515
|
+
Misspelling.new(incorrect_text: "Phnum Touch", correct_text: "Phnom Touch"),
|
516
|
+
Misspelling.new(incorrect_text: "Tonle Bassac", correct_text: "Tonle Basak"),
|
517
|
+
Misspelling.new(incorrect_text: "Chey Chumneas", correct_text: "Chey Chummeah"),
|
518
|
+
Misspelling.new(incorrect_text: "Boeung Prolit", correct_text: "Boeng Proluet"),
|
519
|
+
Misspelling.new(incorrect_text: "Tuol Svay Prey 2",
|
520
|
+
correct_text: "Tuol Svay Prey Ti Pir"),
|
521
|
+
Misspelling.new(incorrect_text: "Neak Leung", correct_text: "Neak Loeang"),
|
522
|
+
Misspelling.new(incorrect_text: "Kratie", correct_text: "Kracheh"),
|
523
|
+
Misspelling.new(incorrect_text: "Pate", correct_text: "Pa Te"),
|
524
|
+
Misspelling.new(incorrect_text: "Prek Phtoul Commune", correct_text: "Preaek Phtoul"),
|
525
|
+
Misspelling.new(incorrect_text: "Bourei Cholsar Commune", correct_text: "Borei Cholsar"),
|
526
|
+
Misspelling.new(
|
527
|
+
incorrect_text: "Kampeaeng Commune (Kiri Vong District)",
|
528
|
+
correct_text: "Kampeaeng"
|
529
|
+
),
|
530
|
+
Misspelling.new(
|
531
|
+
incorrect_text: "Prey Rumdeng Commune (Kiri Vong District)",
|
532
|
+
correct_text: "Prey Rumdeng"
|
533
|
+
),
|
534
|
+
Misspelling.new(incorrect_text: "Ta Our Commune", correct_text: "Ta Ou"),
|
535
|
+
Misspelling.new(incorrect_text: "Kompaeng Commune", correct_text: "Kampeaeng"),
|
536
|
+
Misspelling.new(incorrect_text: "Kompong Reab Commune", correct_text: "Kampong Reab"),
|
537
|
+
Misspelling.new(incorrect_text: "Our Saray Commune", correct_text: "Ou Saray"),
|
538
|
+
Misspelling.new(
|
539
|
+
incorrect_text: "Trapeang Kranhung Commune",
|
540
|
+
correct_text: "Trapeang Kranhoung"
|
541
|
+
),
|
542
|
+
Misspelling.new(incorrect_text: "Angk Kev Commune", correct_text: "Angk Kaev"),
|
543
|
+
Misspelling.new(incorrect_text: "Sanlong Commune", correct_text: "Sanlung"),
|
544
|
+
Misspelling.new(incorrect_text: "O Smach", correct_text: "Ou Smach")
|
545
|
+
].freeze
|
546
|
+
|
547
|
+
# Collects the Wikipedia URL of every commune link listed under each
# district's heading on the communes page.
#
# Districts whose heading cannot be found (find_section returns nil) are
# skipped, as are links that find_invalid_commune_link reports as invalid.
#
# @return [Array<ScraperResult>] one result per resolved commune link
# @raise [WebScraper::ElementNotFoundError] when a province or district section
#   is missing from the page
# @raise [CommuneNotFoundError] when a link matches no commune, not even
#   through MISSPELLINGS
def scrape!
  District.all.each_with_object([]) do |district, result|
    commune_links_for(district).each do |link|
      next if find_invalid_commune_link(district:, text: link.text)

      commune = commune_for_link(district:, link_text: link.text)
      result << ScraperResult.new(code: commune.id, wikipedia: URI.join(URL, link[:href]).to_s)
    end
  end
end

# Returns the /wiki/ links listed under the district's heading, or an empty
# array when the district has no heading on the page.
private def commune_links_for(district)
  province_heading = find_section(
    text: district.province.address_en,
    section: scraper.page,
    xpath_pattern: "//h2//a[text()[contains(., \"%<text>s\")]]"
  )
  province_section = province_heading&.xpath("ancestor::h2/following-sibling::div")&.first
  # Previously a missing section surfaced later as NoMethodError on nil.
  unless province_section
    raise WebScraper::ElementNotFoundError,
          "No section found for '#{district.province.address_en}' on #{URL}"
  end

  district_title = find_section(
    text: [district.full_name_en, district.full_name_latin, district.name_latin],
    section: province_section,
    xpath_pattern: "child::table//tr/td//h3//*[text()[contains(., \"%<text>s\")]]"
  )
  return [] unless district_title

  district_section = district_title.xpath("ancestor::h3/following-sibling::*").first
  unless district_section
    raise WebScraper::ElementNotFoundError,
          "No commune list found after the heading for '#{district.full_name_en}' on #{URL}"
  end

  district_section.xpath("child::li//a[contains(@href, '/wiki/')]")
end

# Looks the commune up by the link text; when that fails, retries with the
# corrected spelling from MISSPELLINGS, re-raising if there is none.
private def commune_for_link(district:, link_text:)
  find_commune(
    district:,
    names: { name_latin: link_text, full_name_en: link_text, full_name_latin: link_text }
  )
rescue CommuneNotFoundError
  misspelling = MISSPELLINGS.find { |m| m.incorrect_text == link_text }
  raise unless misspelling

  find_commune(district:, names: { name_latin: misspelling.correct_text })
end
|
601
|
+
|
602
|
+
private
|
603
|
+
|
604
|
+
# NOTE(review): empty stub that returns nil; no caller is visible in this file.
# Presumably a leftover from a refactoring; confirm before removing.
def build_commune_links(district:, pool:); end
|
605
|
+
|
606
|
+
# Looks up the INVALID_COMMUNE_LINKS entry for the given district and link text.
#
# @param district object whose #id is compared with each entry's district_code
# @param text link text compared with each entry's name
# @return the matching entry, or nil when the link is not listed
def find_invalid_commune_link(district:, text:)
  district_code = district.id

  INVALID_COMMUNE_LINKS.find do |invalid_link|
    invalid_link.name == text && invalid_link.district_code == district_code
  end
end
|
611
|
+
|
612
|
+
# Finds the single element matching one of the candidate texts.
#
# Candidates are tried in order. When none matches, the first candidate is
# looked up in MISSPELLINGS (by correct_text) and the search is repeated with
# its incorrect_text. When there is no such entry and the first candidate is
# listed in MISSING_LOCATIONS, nil is returned.
#
# @param text [String, Array<String>] candidate texts, most preferred first
# @param section node the XPath is evaluated against
# @param xpath_pattern [String] format string containing a %<text>s placeholder
# @return the matching element, or nil for a known missing location
# @raise [ArgumentError] when no candidate text is given
# @raise [WebScraper::ElementNotFoundError] when nothing matches
def find_section(text:, section:, xpath_pattern:)
  texts = Array(text)
  # Previously an empty candidate list silently returned an empty array.
  raise ArgumentError, "text must contain at least one candidate" if texts.empty?

  default_text = texts.first
  not_found = nil

  # Every candidate is tried. The earlier `t == texts.last` check compared by
  # value, so a candidate equal to the last one stopped the search early.
  texts.each do |candidate|
    return find_link(text: candidate, section:, xpath_pattern:)
  rescue WebScraper::ElementNotFoundError => e
    not_found = e
  end

  misspelling = MISSPELLINGS.find { |m| m.correct_text == default_text }
  return find_link(text: misspelling.incorrect_text, section:, xpath_pattern:) if misspelling
  return if MISSING_LOCATIONS.include?(default_text)

  raise not_found
end
|
630
|
+
|
631
|
+
# Evaluates the XPath built from the pattern and text and returns its only match.
#
# @param text [String] value substituted for %<text>s in the pattern
# @param section node the XPath is evaluated against
# @param xpath_pattern [String] format string containing a %<text>s placeholder
# @return the single matching element
# @raise [WebScraper::ElementNotFoundError] when the XPath matches no element
#   or more than one
def find_link(text:, section:, xpath_pattern:)
  # NOTE(review): text is inserted into the XPath unescaped; a value containing
  # the quote character used by the pattern would yield an invalid expression.
  xpath = format(xpath_pattern, text:)
  matches = section.xpath(xpath)

  return matches.first if matches.size == 1

  # The count tells "nothing found" apart from "ambiguous".
  raise WebScraper::ElementNotFoundError,
        "Expected exactly one link but found #{matches.size} on #{URL} (xpath: '#{xpath}')"
end
|
640
|
+
|
641
|
+
# Finds the one commune of a district matching the given names.
#
# The name attributes are tried in the order given; the first attribute that
# yields any result decides the outcome.
#
# @param district object whose #id scopes the lookup
# @param names [Hash] commune attribute name => value to search for
# @return the matching commune
# @raise [CommuneNotFoundError] when no attribute yields a result
# @raise [DuplicateCommuneError] when the deciding lookup yields several communes
def find_commune(district:, names:)
  results = []
  names.each do |attribute, value|
    results = Commune.where(district_id: district.id, attribute => value)

    break unless results.empty?
  end

  searched_names = names.values.uniq.inspect

  if results.empty?
    raise CommuneNotFoundError,
          "No commune matching #{searched_names} was found in district '#{district.id}'"
  end

  if results.size > 1
    # The previous message interpolated `identifier` and `province`, neither
    # of which exists here, so this branch raised NameError instead.
    raise DuplicateCommuneError,
          "Commune #{searched_names} was found more than once in district '#{district.id}'"
  end

  results.first
end
|
658
|
+
|
659
|
+
# WebScraper for URL, created on first use and reused afterwards.
def scraper
  return @scraper if @scraper

  @scraper = WebScraper.new(URL)
end
|
662
|
+
end
|
663
|
+
end
|
664
|
+
end
|
665
|
+
end
|
data/lib/pumi/location.rb
CHANGED
@@ -6,7 +6,9 @@ module Pumi
|
|
6
6
|
:name_latin, :full_name_latin,
|
7
7
|
:name_en, :full_name_en,
|
8
8
|
:address_km, :address_latin, :address_en,
|
9
|
-
:administrative_unit,
|
9
|
+
:administrative_unit,
|
10
|
+
:links,
|
11
|
+
keyword_init: true
|
10
12
|
) do
|
11
13
|
class << self
|
12
14
|
attr_accessor :data_store_key
|