pumi 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +8 -0
- data/bin/parse_data +0 -5
- data/data/communes.yml +9904 -0
- data/data/districts.yml +1576 -0
- data/data/provinces.yml +225 -0
- data/lib/pumi/bot/wikipedia/article.rb +13 -0
- data/lib/pumi/bot/wikipedia/communes_in_cambodia_article.rb +157 -0
- data/lib/pumi/bot/wikipedia/districts_in_cambodia_article.rb +122 -0
- data/lib/pumi/bot/wikipedia/templates/commune_list.wikitext.erb +46 -0
- data/lib/pumi/bot/wikipedia/templates/district_list.wikitext.erb +27 -0
- data/lib/pumi/bot/wikipedia.rb +10 -0
- data/lib/pumi/bot.rb +6 -0
- data/lib/pumi/data_source/geocoder.rb +251 -0
- data/lib/pumi/data_source/iso31662.rb +29 -0
- data/lib/pumi/data_source/wikipedia.rb +19 -524
- data/lib/pumi/data_source.rb +2 -0
- data/lib/pumi/geodata.rb +3 -0
- data/lib/pumi/location.rb +2 -0
- data/lib/pumi/parser.rb +7 -0
- data/lib/pumi/version.rb +1 -1
- data/lib/pumi/wikipedia/client.rb +68 -0
- data/lib/pumi/wikipedia/response.rb +15 -0
- data/lib/pumi/wikipedia.rb +7 -0
- data/lib/pumi.rb +3 -0
- data/pumi.gemspec +4 -1
- metadata +58 -4
- data/lib/pumi/scraper/result.rb +0 -5
@@ -0,0 +1,46 @@
|
|
1
|
+
<div id="communes-list">
|
2
|
+
<% provinces.each do |province| %>
|
3
|
+
<% province_page = URI.parse(province.links[:wikipedia]).path.split("/").last %>
|
4
|
+
==[[<%= province_page %>|<%= province.full_name_en %>]]==
|
5
|
+
<div id=province-communes-<%= province.id %>>
|
6
|
+
<%= province.name_en %> contains <%= province.communes_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
|
7
|
+
|
8
|
+
<% province.districts.each do |district| %>
|
9
|
+
<% if district.links[:wikipedia] %>
|
10
|
+
<% district_page = URI.parse(district.links[:wikipedia]).path.split("/").last %>
|
11
|
+
===[[<%= district_page %>|<%= district.full_name_en %>]]===
|
12
|
+
<% else %>
|
13
|
+
===<%= district.full_name_en %>===
|
14
|
+
<% end %>
|
15
|
+
|
16
|
+
<div id=district-communes-<%= district.id %>>
|
17
|
+
<%= district.name_en %> contains <%= district.communes_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/district.castle?ds=<%= district.id %> |title=<%= district.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
|
18
|
+
|
19
|
+
{| class="wikitable sortable"
|
20
|
+
|-
|
21
|
+
! #
|
22
|
+
! Name
|
23
|
+
! Khmer
|
24
|
+
! Administrative Unit
|
25
|
+
! Geocode
|
26
|
+
|-
|
27
|
+
|
28
|
+
<% district.communes.each_with_index do |commune, index| %>
|
29
|
+
| <%= index + 1 %>
|
30
|
+
<% if commune.links[:wikipedia] %>
|
31
|
+
<% commune_page = URI.parse(commune.links[:wikipedia]).path.split("/").last.gsub("_", " ") %>
|
32
|
+
| [[<%= commune_page %>|<%= commune.name_en %>]]
|
33
|
+
<% else %>
|
34
|
+
| <%= commune.name_en %>
|
35
|
+
<% end %>
|
36
|
+
| <%= commune.name_km %>
|
37
|
+
| <%= "#{commune.administrative_unit.name_en} (#{commune.administrative_unit.name_km} #{commune.administrative_unit.name_latin})" %>
|
38
|
+
| <%= commune.id %>
|
39
|
+
|-
|
40
|
+
<% end %>
|
41
|
+
|}
|
42
|
+
</div>
|
43
|
+
<% end %>
|
44
|
+
</div>
|
45
|
+
<% end %>
|
46
|
+
</div>
|
@@ -0,0 +1,27 @@
|
|
1
|
+
<div id=province-districts-<%= province.id %>>
|
2
|
+
<%= province.name_en %> contains <%= districts_summary %>. <ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
|
3
|
+
|
4
|
+
{| class="wikitable sortable"
|
5
|
+
|-
|
6
|
+
! #
|
7
|
+
! Name
|
8
|
+
! Khmer
|
9
|
+
! Administrative Unit
|
10
|
+
! Geocode
|
11
|
+
|-
|
12
|
+
|
13
|
+
<% districts.each_with_index do |district, index| %>
|
14
|
+
| <%= index + 1 %>
|
15
|
+
<% if district.links[:wikipedia] %>
|
16
|
+
<% district_page = URI.parse(district.links[:wikipedia]).path.split("/").last.gsub("_", " ") %>
|
17
|
+
| [[<%= district_page %>|<%= district.name_en %>]]
|
18
|
+
<% else %>
|
19
|
+
| <%= district.name_en %>
|
20
|
+
<% end %>
|
21
|
+
| <%= district.name_km %>
|
22
|
+
| <%= "#{district.administrative_unit.name_en} (#{district.administrative_unit.name_km} #{district.administrative_unit.name_latin})" %>
|
23
|
+
| <%= district.id %>
|
24
|
+
|-
|
25
|
+
<% end %>
|
26
|
+
|}
|
27
|
+
</div>
|
data/lib/pumi/bot.rb
ADDED
@@ -0,0 +1,251 @@
|
|
1
|
+
require "geocoder"
|
2
|
+
|
3
|
+
module Pumi
  module DataSource
    # Copies geocoding results (latitude, longitude and bounding box) into the
    # "geodata" entry of each record read from a data file, then writes the
    # data file back out.
    #
    # The actual lookups are delegated to the object passed as +geocoder+,
    # which must respond to +geocode_all+ and return objects responding to
    # +code+, +lat+, +long+ and +bounding_box+ (see Geocoder::Result).
    class Geocoder
      # One geocoded location, keyed by the location's code.
      Result = Struct.new(:code, :lat, :long, :bounding_box, keyword_init: true)

      # Template for geocoding a collection of locations through one or more
      # lookup providers. Subclasses supply +locations+ and +filter+, and may
      # override +build_search_term+.
      class AbstractGeocoder
        # Provider-independent view of a single raw lookup result.
        Result = Struct.new(
          :lat, :long, :bounding_box, :country_code,
          :types, :iso3166_2, :district_name_en,
          :name,
          keyword_init: true
        )

        # Maps a correctly spelled search term to the misspelled text that
        # should be sent to the lookup service instead.
        Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)

        MISSPELLINGS = []

        # Wraps one lookup of the geocoding library and normalizes its raw
        # results. Subclasses implement +build_result(data)+.
        class AbstractProvider
          attr_reader :geocoder, :name

          # @param geocoder [#search] geocoding library entry point
          # @param name [Symbol] lookup name handed to +geocoder.search+
          def initialize(geocoder:, name:)
            @geocoder = geocoder
            @name = name
          end

          # Searches for +term+ with this provider's lookup.
          #
          # @param term [String]
          # @return [Array<AbstractGeocoder::Result>]
          def search(term)
            geocoder.search(term, lookup: name).map do |result|
              build_result(result.data)
            end
          end
        end

        # Normalizes Google-style payloads ("address_components", "geometry",
        # "types").
        class Google < AbstractProvider
          private

          def build_result(data)
            province_name_en = find_address_component(
              data,
              "administrative_area_level_1"
            )&.fetch("long_name")
            province = Pumi::Province.where(full_name_en: province_name_en).first

            Result.new(
              name: data.dig("address_components", 0, "long_name"),
              lat: data.dig("geometry", "location", "lat"),
              long: data.dig("geometry", "location", "lng"),
              # NOTE(review): assembled as [NE lat, NE lng, SW lat, SW lng],
              # whereas Nominatim#build_result passes "boundingbox" through
              # untouched. Confirm consumers cope with both layouts.
              bounding_box: [
                data.dig("geometry", "bounds", "northeast", "lat"),
                data.dig("geometry", "bounds", "northeast", "lng"),
                data.dig("geometry", "bounds", "southwest", "lat"),
                data.dig("geometry", "bounds", "southwest", "lng")
              ],
              # A result without a "country" component yields a nil country
              # code (and is then rejected by the filters) instead of raising
              # NoMethodError and aborting the whole run.
              country_code: find_address_component(data, "country")&.fetch("short_name")&.upcase,
              district_name_en: find_address_component(
                data,
                "administrative_area_level_2"
              )&.fetch("long_name"),
              types: data["types"],
              iso3166_2: province&.iso3166_2
            )
          end

          # Returns the first address component tagged with +type+, or nil.
          def find_address_component(data, type)
            data.fetch("address_components").find do |component|
              component.fetch("types").include?(type)
            end
          end
        end

        # Normalizes Nominatim-style payloads ("lat", "lon", "boundingbox",
        # "type", "address").
        class Nominatim < AbstractProvider
          private

          def build_result(data)
            Result.new(
              name: nil,
              lat: data["lat"],
              long: data["lon"],
              bounding_box: data["boundingbox"],
              types: Array(data["type"]),
              iso3166_2: data.dig("address", "ISO3166-2-lvl4"),
              country_code: data.dig("address", "country_code")&.upcase,
              district_name_en: data.dig("address", "county")
            )
          end
        end

        PROVIDERS = {
          nominatim: Nominatim,
          google: Google
        }.freeze

        attr_reader :providers, :options

        # @param geocoder [#configure, #search] geocoding library entry point
        # @param providers [Symbol, Array<Symbol>] keys of PROVIDERS, tried in
        #   the given order
        # @param options [Hash] +:regeocode+ forces locations that already
        #   have geodata to be geocoded again
        # @raise [KeyError] if a provider name is not a key of PROVIDERS
        def initialize(geocoder: ::Geocoder, providers: PROVIDERS.keys, **options)
          @options = options

          geocoder.configure(
            google: {
              api_key: ENV["GOOGLE_API_KEY"]
            }
          )

          @providers = Array(providers).map do |name|
            PROVIDERS.fetch(name).new(geocoder:, name:)
          end
        end

        # Geocodes every location that still needs geodata.
        #
        # Locations that already have geodata are skipped unless the
        # +:regeocode+ option is set. Locations for which no provider returns
        # an acceptable result are remembered in +ungeocoded_locations+ and
        # omitted from the return value.
        #
        # @return [Array<Geocoder::Result>]
        def geocode_all
          locations.each_with_object([]) do |location, results|
            next unless options[:regeocode] || location.geodata.nil?

            geocoder_result = geocode(location)

            if geocoder_result.nil?
              ungeocoded_locations << location
              next
            end

            results << build_result(code: location.id, geocoder_result:)
          end
        end

        private

        # Tries each provider with each search term, in order, and returns the
        # first result accepted by +filter+ (nil when none is).
        def geocode(location)
          providers.each do |provider|
            Array(build_search_term(location)).each do |search_term|
              all_results = provider.search(search_term)
              geocoder_result = filter(location, all_results)

              return geocoder_result unless geocoder_result.nil?
            end
          end

          nil
        end

        def build_result(code:, geocoder_result:)
          Geocoder::Result.new(
            code:,
            lat: geocoder_result.lat,
            long: geocoder_result.long,
            bounding_box: geocoder_result.bounding_box
          )
        end

        # Default search terms: the full Khmer name, then the short Khmer
        # name. A term listed in MISSPELLINGS as +correct_text+ is swapped for
        # its +incorrect_text+.
        def build_search_term(location)
          [location.full_name_km, location.name_km].map do |term|
            MISSPELLINGS.find { |m| m.correct_text == term }&.incorrect_text || term
          end
        end

        def ungeocoded_locations
          @ungeocoded_locations ||= []
        end
      end

      # Geocodes provinces by searching for their ISO 3166-2 code.
      class CambodianProvinces < AbstractGeocoder
        private

        def locations
          @locations ||= Pumi::Province.all
        end

        def build_search_term(province)
          province.iso3166_2
        end

        def filter(province, geocoder_results)
          geocoder_results.find do |r|
            r.iso3166_2 == province.iso3166_2 && r.types.include?("administrative")
          end
        end
      end

      # Geocodes districts; a result must be in Cambodia, in the district's
      # province, and carry a district-like type.
      class CambodianDistricts < AbstractGeocoder
        private

        def locations
          @locations ||= Pumi::District.all
        end

        def filter(district, geocoder_results)
          geocoder_results.find do |r|
            r.country_code == "KH" &&
              r.iso3166_2 == district.province.iso3166_2 && (
                %w[administrative_area_level_2 town city administrative].any? do |type|
                  r.types.include?(type)
                end || (%w[locality political].sort == r.types.sort)
              )
          end
        end
      end

      # Geocodes communes; a result must be in Cambodia, match either the
      # commune's province or its district name, and carry a commune-like type.
      class CambodianCommunes < AbstractGeocoder
        private

        def locations
          @locations ||= Pumi::Commune.all
        end

        def filter(commune, geocoder_results)
          geocoder_results.find do |r|
            r.country_code == "KH" &&
              (r.iso3166_2 == commune.province.iso3166_2 || r.district_name_en.to_s.downcase.include?(commune.district.name_en.downcase)) &&
              %w[administrative_area_level_3 village suburb neighbourhood].any? do |type|
                r.types.include?(type)
              end
          end
        end
      end

      attr_reader :data_file, :geocoder

      # @param data_file [#read, #write] source and destination of the records
      # @param geocoder [#geocode_all] produces the results to merge in
      def initialize(data_file:, geocoder:)
        @data_file = data_file
        @geocoder = geocoder
      end

      # Merges geocoding results into the records and writes the data file.
      #
      # Records without a matching result are left untouched. When several
      # results share a code, the first one wins.
      #
      # @param output_dir [String] directory handed to +data_file.write+
      # @return whatever +data_file.write+ returns
      def load_data!(output_dir: "data")
        data.each do |code, attributes|
          geocoded_result = geocoded_results_by_code[code]

          next if geocoded_result.nil?

          geodata = (attributes["geodata"] ||= {})
          geodata["lat"] = geocoded_result.lat
          geodata["long"] = geocoded_result.long
          geodata["bounding_box"] = geocoded_result.bounding_box
        end

        write_data!(output_dir)
      end

      private

      def data
        @data ||= data_file.read
      end

      def write_data!(data_directory)
        data_file.write(data, data_directory:)
      end

      def geocoded_results
        @geocoded_results ||= geocoder.geocode_all
      end

      # Results indexed by code so each record is matched with one hash
      # lookup instead of a scan over every result. Built on first use, so
      # geocoding is not triggered when there are no records to update.
      def geocoded_results_by_code
        @geocoded_results_by_code ||= geocoded_results.each_with_object({}) do |result, index|
          index[result.code] ||= result
        end
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Pumi
  module DataSource
    # Stamps every record of a data file with an "iso3166_2" attribute of the
    # form "KH-<n>", where <n> is the record's code without leading zeros.
    class ISO31662
      attr_reader :data_file

      # @param data_file [#read, #write] source and destination of the records
      def initialize(data_file: Pumi::DataFile.new(:provinces))
        @data_file = data_file
      end

      # Sets "iso3166_2" on each record and writes the data file.
      #
      # @param output_dir [String] directory handed to +data_file.write+
      # @return whatever +data_file.write+ returns
      # @raise [ArgumentError] if a code is not a base-10 integer
      def load_data!(output_dir: "data")
        data.each do |code, attributes|
          attributes["iso3166_2"] = "KH-#{province_number(code)}"
        end

        write_data!(output_dir)
      end

      private

      # Parses the code strictly in base 10. Leading zeros ("08") are stripped
      # without being read as octal, and a malformed code raises instead of
      # silently becoming 0 and producing a bogus "KH-0".
      def province_number(code)
        Integer(code.to_s, 10)
      end

      def data
        @data ||= data_file.read
      end

      def write_data!(data_directory)
        data_file.write(data, data_directory:)
      end
    end
  end
end
|