pumi 0.25.0 → 0.26.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
1
  module Pumi
2
- AdministrativeUnit = Struct.new(:name_km, :name_latin, :name_en, keyword_init: true)
2
+ AdministrativeUnit = Struct.new(:name_km, :name_latin, :name_en, :name_ungegn, keyword_init: true)
3
3
  end
@@ -1,4 +1,5 @@
1
1
  require "ostruct"
2
+ require "nokogiri"
2
3
 
3
4
  module Pumi
4
5
  module Bot
@@ -105,7 +106,7 @@ module Pumi
105
106
  def replace_communes_list
106
107
  provinces = Pumi::Province.all.map { |province| Province.new(province) }
107
108
  data = OpenStruct.new(provinces:)
108
- communes_list = ERB.new(TEMPLATE).result(data.instance_eval { binding })
109
+ communes_list = ERB.new(TEMPLATE, trim_mode: "-").result(data.instance_eval { binding })
109
110
  source.sub!(communes_list_section.to_html, communes_list)
110
111
  end
111
112
 
@@ -1,4 +1,5 @@
1
1
  require "ostruct"
2
+ require "erb"
2
3
 
3
4
  module Pumi
4
5
  module Bot
@@ -12,15 +13,10 @@ module Pumi
12
13
  Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
13
14
 
14
15
  MISSPELLINGS = [
15
- Misspelling.new(incorrect_text: "Kratié", correct_text: "Kratie"),
16
- Misspelling.new(incorrect_text: "Mondulkiri", correct_text: "Mondul Kiri"),
17
16
  Misspelling.new(
18
17
  incorrect_text: "Phnom Penh (autonomous municipality)",
19
18
  correct_text: "Phnom Penh"
20
- ),
21
- Misspelling.new(incorrect_text: "Ratanakiri", correct_text: "Ratanak Kiri"),
22
- Misspelling.new(incorrect_text: "Siem Reap", correct_text: "Siemreap"),
23
- Misspelling.new(incorrect_text: "Takéo", correct_text: "Takeo")
19
+ )
24
20
  ].freeze
25
21
 
26
22
  def publish
@@ -49,7 +45,7 @@ module Pumi
49
45
  districts:,
50
46
  districts_summary: generate_districts_summary(districts:)
51
47
  )
52
- result = ERB.new(DISTRICTS_TEMPLATE).result(data.instance_eval { binding })
48
+ result = ERB.new(DISTRICTS_TEMPLATE, trim_mode: "-").result(data.instance_eval { binding })
53
49
  "\n\n#{result}\n"
54
50
  end
55
51
 
@@ -2,7 +2,7 @@
2
2
  <% provinces.each do |province| %>
3
3
  <% province_page = URI.parse(province.links[:wikipedia]).path.split("/").last %>
4
4
  ==[[<%= province_page %>|<%= province.full_name_en %>]]==
5
- <div id=province-communes-<%= province.id %>>
5
+ <div id="province-communes-<%= province.id %>">
6
6
  <%= province.name_en %> contains <%= province.communes_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
7
7
 
8
8
  <% province.districts.each do |district| %>
@@ -13,7 +13,7 @@
13
13
  ===<%= district.full_name_en %>===
14
14
  <% end %>
15
15
 
16
- <div id=district-communes-<%= district.id %>>
16
+ <div id="district-communes-<%= district.id %>">
17
17
  <%= district.name_en %> contains <%= district.communes_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/district.castle?ds=<%= district.id %> |title=<%= district.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
18
18
 
19
19
  {| class="wikitable sortable"
@@ -21,26 +21,35 @@
21
21
  ! #
22
22
  ! Name
23
23
  ! Khmer
24
+ ! [[Romanization of Khmer#UNGEGN|UNGEGN]]
24
25
  ! Administrative Unit
25
26
  ! Geocode
26
27
  |-
27
-
28
28
  <% district.communes.each_with_index do |commune, index| %>
29
29
  | <%= index + 1 %>
30
- <% if commune.links[:wikipedia] %>
31
- <% commune_page = URI.parse(commune.links[:wikipedia]).path.split("/").last.gsub("_", " ") %>
30
+ <% if commune.links[:wikipedia] -%>
31
+ <% commune_page = URI.parse(commune.links[:wikipedia]).path.split("/").last -%>
32
+ <% if commune_page.gsub("_", " ") == commune.name_en -%>
33
+ | [[<%= commune.name_en %>]]
34
+ <% else -%>
32
35
  | [[<%= commune_page %>|<%= commune.name_en %>]]
33
- <% else %>
36
+ <% end -%>
37
+ <% else -%>
34
38
  | <%= commune.name_en %>
35
- <% end %>
39
+ <% end -%>
36
40
  | <%= commune.name_km %>
41
+ <% if commune.name_ungegn -%>
42
+ | {{transliteration|km|<%= commune.name_ungegn %>}}
43
+ <% else -%>
44
+ |
45
+ <% end -%>
37
46
  | <%= "#{commune.administrative_unit.name_en} (#{commune.administrative_unit.name_km} #{commune.administrative_unit.name_latin})" %>
38
47
  | <%= commune.id %>
39
48
  |-
40
- <% end %>
49
+ <% end -%>
41
50
  |}
42
51
  </div>
43
- <% end %>
52
+ <% end -%>
44
53
  </div>
45
- <% end %>
54
+ <% end -%>
46
55
  </div>
@@ -1,27 +1,36 @@
1
- <div id=province-districts-<%= province.id %>>
2
- <%= province.name_en %> contains <%= districts_summary %>. <ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
1
+ <div id="province-districts-<%= province.id %>">
2
+ <%= province.name_en %> contains <%= districts_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
3
3
 
4
4
  {| class="wikitable sortable"
5
5
  |-
6
6
  ! #
7
7
  ! Name
8
8
  ! Khmer
9
+ ! [[Romanization of Khmer#UNGEGN|UNGEGN]]
9
10
  ! Administrative Unit
10
11
  ! Geocode
11
12
  |-
12
-
13
13
  <% districts.each_with_index do |district, index| %>
14
14
  | <%= index + 1 %>
15
- <% if district.links[:wikipedia] %>
16
- <% district_page = URI.parse(district.links[:wikipedia]).path.split("/").last.gsub("_", " ") %>
15
+ <% if district.links[:wikipedia] -%>
16
+ <% district_page = URI.parse(district.links[:wikipedia]).path.split("/").last -%>
17
+ <% if district_page.gsub("_", " ") == district.name_en -%>
18
+ | [[<%= district.name_en %>]]
19
+ <% else -%>
17
20
  | [[<%= district_page %>|<%= district.name_en %>]]
18
- <% else %>
21
+ <% end -%>
22
+ <% else -%>
19
23
  | <%= district.name_en %>
20
- <% end %>
24
+ <% end -%>
21
25
  | <%= district.name_km %>
26
+ <% if district.name_ungegn -%>
27
+ | {{transliteration|km|<%= district.name_ungegn %>}}
28
+ <% else -%>
29
+ |
30
+ <% end -%>
22
31
  | <%= "#{district.administrative_unit.name_en} (#{district.administrative_unit.name_km} #{district.administrative_unit.name_latin})" %>
23
32
  | <%= district.id %>
24
33
  |-
25
- <% end %>
34
+ <% end -%>
26
35
  |}
27
36
  </div>
@@ -14,7 +14,14 @@ module Pumi
14
14
  "1715" => { type: "ក្រុង" }
15
15
  }.freeze
16
16
 
17
- AdministrativeUnit = Struct.new(:en, :km, :latin, :code_length, :group, :type, keyword_init: true)
17
+ Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
18
+
19
+ MISSPELLINGS = [
20
+ Misspelling.new(incorrect_text: "Siem Reab", correct_text: "Siem Reap"),
21
+ Misspelling.new(incorrect_text: "Aoral", correct_text: "Aural")
22
+ ].freeze
23
+
24
+ AdministrativeUnit = Struct.new(:en, :km, :latin, :ungegn, :code_length, :group, :type, keyword_init: true)
18
25
  Row = Struct.new(:code, :name_km, :name_latin, :type, keyword_init: true) do
19
26
  def administrative_unit
20
27
  ADMINISTRATIVE_UNITS.fetch(type)
@@ -22,12 +29,54 @@ module Pumi
22
29
  end
23
30
 
24
31
  ADMINISTRATIVE_UNITS = {
25
- "ស្រុក" => AdministrativeUnit.new(en: "District", km: "ស្រុក", latin: "Srok", code_length: 4, group: "districts"),
26
- "ខណ្ឌ" => AdministrativeUnit.new(en: "Section", km: "ខណ្ឌ", latin: "Khan", code_length: 4, group: "districts"),
27
- "ក្រុង" => AdministrativeUnit.new(en: "Municipality", km: "ក្រុង", latin: "Krong", code_length: 4, group: "districts"),
28
- "ឃុំ" => AdministrativeUnit.new(en: "Commune", km: "ឃុំ", latin: "Khum", code_length: 6, group: "communes"),
29
- "សង្កាត់" => AdministrativeUnit.new(en: "Quarter", km: "សង្កាត់", latin: "Sangkat", code_length: 6, group: "communes"),
30
- "ភូមិ" => AdministrativeUnit.new(en: "Village", km: "ភូមិ", latin: "Phum", code_length: 8, group: "villages")
32
+ "ស្រុក" => AdministrativeUnit.new(
33
+ en: "District",
34
+ km: "ស្រុក",
35
+ latin: "Srok",
36
+ ungegn: "Srŏk",
37
+ code_length: 4,
38
+ group: "districts"
39
+ ),
40
+ "ខណ្ឌ" => AdministrativeUnit.new(
41
+ en: "Section",
42
+ km: "ខណ្ឌ",
43
+ latin: "Khan",
44
+ ungegn: "Khând",
45
+ code_length: 4,
46
+ group: "districts"
47
+ ),
48
+ "ក្រុង" => AdministrativeUnit.new(
49
+ en: "Municipality",
50
+ km: "ក្រុង",
51
+ latin: "Krong",
52
+ ungegn: "Krŏng",
53
+ code_length: 4,
54
+ group: "districts"
55
+ ),
56
+ "ឃុំ" => AdministrativeUnit.new(
57
+ en: "Commune",
58
+ km: "ឃុំ",
59
+ latin: "Khum",
60
+ ungegn: "Khŭm",
61
+ code_length: 6,
62
+ group: "communes"
63
+ ),
64
+ "សង្កាត់" => AdministrativeUnit.new(
65
+ en: "Quarter",
66
+ km: "សង្កាត់",
67
+ latin: "Sangkat",
68
+ ungegn: "Sângkéat",
69
+ code_length: 6,
70
+ group: "communes"
71
+ ),
72
+ "ភូមិ" => AdministrativeUnit.new(
73
+ en: "Village",
74
+ km: "ភូមិ",
75
+ latin: "Phum",
76
+ ungegn: "Phum",
77
+ code_length: 8,
78
+ group: "villages"
79
+ )
31
80
  }.freeze
32
81
 
33
82
  attr_accessor :existing_data
@@ -66,10 +115,13 @@ module Pumi
66
115
  def build_row(row)
67
116
  code = parse_location_code(row)
68
117
 
118
+ name_latin = row.fetch("name_latin")
119
+ name_latin = MISSPELLINGS.find { |m| m.incorrect_text == name_latin }&.correct_text || name_latin
120
+
69
121
  Row.new(
70
122
  code:,
71
123
  name_km: row.fetch("name_km"),
72
- name_latin: row.fetch("name_latin"),
124
+ name_latin:,
73
125
  type: row.fetch("type") || MISSING_DATA.dig(code, :type)
74
126
  )
75
127
  end
@@ -85,15 +137,17 @@ module Pumi
85
137
  def add_data(row)
86
138
  data[row.administrative_unit.group] ||= {}
87
139
  data[row.administrative_unit.group][row.code] = existing_data.dig(row.administrative_unit.group, row.code) || {}
140
+ data[row.administrative_unit.group][row.code]["name"] = existing_data.dig(row.administrative_unit.group, row.code, "name") || {}
141
+ data[row.administrative_unit.group][row.code]["name"].merge!(
142
+ "km" => row.name_km,
143
+ "latin" => row.name_latin
144
+ )
88
145
  data[row.administrative_unit.group][row.code].merge!(
89
- "name" => {
90
- "km" => row.name_km,
91
- "latin" => row.name_latin
92
- },
93
146
  "administrative_unit" => {
94
147
  "km" => row.administrative_unit.km,
95
148
  "latin" => row.administrative_unit.latin,
96
- "en" => row.administrative_unit.en
149
+ "en" => row.administrative_unit.en,
150
+ "ungegn" => row.administrative_unit.ungegn
97
151
  }
98
152
  )
99
153
  end
@@ -16,8 +16,12 @@ module Pumi
16
16
  location_data = scraped_data.find { |location| location.code == code }
17
17
  next unless location_data
18
18
 
19
- attributes["links"] ||= {}
20
- attributes["links"]["wikipedia"] = location_data.wikipedia
19
+ if location_data.wikipedia
20
+ attributes["links"] ||= {}
21
+ attributes["links"]["wikipedia"] = location_data.wikipedia
22
+ end
23
+
24
+ attributes["name"]["ungegn"] = location_data.name_ungegn if location_data.name_ungegn
21
25
  end
22
26
 
23
27
  write_data!(output_dir)
@@ -37,7 +41,7 @@ module Pumi
37
41
  data_file.write(data, data_directory:)
38
42
  end
39
43
 
40
- ScraperResult = Struct.new(:code, :wikipedia, keyword_init: true)
44
+ ScraperResult = Struct.new(:code, :wikipedia, :name_ungegn, keyword_init: true)
41
45
 
42
46
  class WebScraper
43
47
  class ElementNotFoundError < StandardError; end
@@ -58,7 +62,11 @@ module Pumi
58
62
 
59
63
  def scrape!
60
64
  Province.all.each_with_object([]) do |province, result|
61
- result << ScraperResult.new(code: province.id, wikipedia: find_url(province))
65
+ result << ScraperResult.new(
66
+ code: province.id,
67
+ wikipedia: find_url(province),
68
+ name_ungegn: find_ungegn(province)
69
+ )
62
70
  end
63
71
  end
64
72
 
@@ -69,14 +77,25 @@ module Pumi
69
77
  end
70
78
 
71
79
  def find_url(province)
80
+ td = find_khmer_name_td(province)
81
+ link = td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
82
+ URI.join(URL, link[:href]).to_s
83
+ end
84
+
85
+ def find_ungegn(province)
86
+ td = find_khmer_name_td(province)
87
+ td.at_xpath("following-sibling::td/span[contains(@title, 'Khmer-language romanization')]")&.text
88
+ end
89
+
90
+ def find_khmer_name_td(province)
72
91
  td = province_table_rows.at_xpath("child::td[contains(., '#{province.name_km}')]")
92
+
73
93
  if td.nil?
74
94
  raise WebScraper::ElementNotFoundError,
75
95
  "No cell containing '#{province.name_km}' was found in a table on #{URL}"
76
96
  end
77
97
 
78
- link = td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
79
- URI.join(URL, link[:href]).to_s
98
+ td
80
99
  end
81
100
 
82
101
  def province_table_rows
@@ -99,10 +118,11 @@ module Pumi
99
118
 
100
119
  def scrape!
101
120
  District.all.each_with_object([]) do |district, result|
102
- url = find_url(district)
103
- next unless url
104
-
105
- result << ScraperResult.new(code: district.id, wikipedia: url)
121
+ result << ScraperResult.new(
122
+ code: district.id,
123
+ wikipedia: find_url(district),
124
+ name_ungegn: find_ungegn(district)
125
+ )
106
126
  end
107
127
  end
108
128
 
@@ -113,16 +133,28 @@ module Pumi
113
133
  end
114
134
 
115
135
  def find_url(district)
116
- geocode = scraper.page.at_xpath("//td[text()[contains(., '#{district.id}')]]")
136
+ geocode_td = find_geocode_td(district)
117
137
 
118
- return if geocode.nil?
138
+ return if geocode_td.nil?
119
139
 
120
- link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
140
+ link = geocode_td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
121
141
 
122
142
  return if link.nil?
123
143
 
124
144
  URI.join(URL, link[:href]).to_s
125
145
  end
146
+
147
+ def find_ungegn(district)
148
+ geocode_td = find_geocode_td(district)
149
+
150
+ return if geocode_td.nil?
151
+
152
+ geocode_td.at_xpath("preceding-sibling::td/span[contains(@title, 'Khmer-language romanization')]")&.text
153
+ end
154
+
155
+ def find_geocode_td(district)
156
+ scraper.page.at_xpath("//td[text()[contains(., '#{district.id}')]]")
157
+ end
126
158
  end
127
159
 
128
160
  class CambodianCommunesScraper
@@ -130,27 +162,40 @@ module Pumi
130
162
 
131
163
  def scrape!
132
164
  Commune.all.each_with_object([]) do |commune, result|
133
- url = find_url(commune)
134
- next if url.nil?
135
-
136
- result << ScraperResult.new(code: commune.id, wikipedia: url)
165
+ result << ScraperResult.new(
166
+ code: commune.id,
167
+ wikipedia: find_url(commune),
168
+ name_ungegn: find_ungegn(commune)
169
+ )
137
170
  end
138
171
  end
139
172
 
140
173
  private
141
174
 
142
175
  def find_url(commune)
143
- geocode = scraper.page.at_xpath("//td[text()[contains(., '#{commune.id}')]]")
176
+ geocode_td = find_geocode_td(commune)
144
177
 
145
- return if geocode.nil?
178
+ return if geocode_td.nil?
146
179
 
147
- link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
180
+ link = geocode_td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
148
181
 
149
182
  return if link.nil?
150
183
 
151
184
  URI.join(URL, link[:href]).to_s
152
185
  end
153
186
 
187
+ def find_ungegn(commune)
188
+ geocode_td = find_geocode_td(commune)
189
+
190
+ return if geocode_td.nil?
191
+
192
+ geocode_td.at_xpath("preceding-sibling::td/span[contains(@title, 'Khmer-language romanization')]")&.text
193
+ end
194
+
195
+ def find_geocode_td(commune)
196
+ scraper.page.at_xpath("//td[text()[contains(., '#{commune.id}')]]")
197
+ end
198
+
154
199
  def scraper
155
200
  @scraper ||= WebScraper.new(URL)
156
201
  end
data/lib/pumi/location.rb CHANGED
@@ -5,6 +5,7 @@ module Pumi
5
5
  :name_km, :full_name_km,
6
6
  :name_latin, :full_name_latin,
7
7
  :name_en, :full_name_en,
8
+ :name_ungegn, :full_name_ungegn,
8
9
  :address_km, :address_latin, :address_en,
9
10
  :administrative_unit,
10
11
  :links,
data/lib/pumi/parser.rb CHANGED
@@ -82,6 +82,7 @@ module Pumi
82
82
  name = attributes.fetch("name")
83
83
  name_km = name.fetch("km")
84
84
  name_latin = name.fetch("latin")
85
+ name_ungegn = name["ungegn"]
85
86
  administrative_unit = build_administrative_unit(
86
87
  attributes.fetch("administrative_unit")
87
88
  )
@@ -94,20 +95,25 @@ module Pumi
94
95
  id:,
95
96
  administrative_unit:,
96
97
  name_km:,
98
+ name_en: name_latin,
97
99
  name_latin:,
100
+ name_ungegn:,
98
101
  geodata:,
99
102
  iso3166_2: attributes["iso3166_2"],
100
103
  links: attributes.fetch("links", {}).transform_keys(&:to_sym),
101
- name_en: name_latin,
102
104
  full_name_km: [
103
105
  administrative_unit_name(name_km, administrative_unit.name_km),
104
106
  name_km
105
107
  ].compact.join,
108
+ full_name_en: [name_latin, administrative_unit.name_en].join(" "),
106
109
  full_name_latin: [
107
110
  administrative_unit_name(name_latin, administrative_unit.name_latin),
108
111
  name_latin
109
112
  ].compact.join(" "),
110
- full_name_en: [name_latin, administrative_unit.name_en].join(" ")
113
+ full_name_ungegn: name_ungegn && [
114
+ administrative_unit_name(name_ungegn, administrative_unit.name_ungegn),
115
+ name_ungegn
116
+ ].compact.join(" ")
111
117
  }
112
118
  end
113
119
 
@@ -138,7 +144,8 @@ module Pumi
138
144
  AdministrativeUnit.new(
139
145
  name_km: attributes.fetch("km"),
140
146
  name_latin: attributes.fetch("latin"),
141
- name_en: attributes.fetch("en")
147
+ name_en: attributes.fetch("en"),
148
+ name_ungegn: attributes.fetch("ungegn")
142
149
  )
143
150
  end
144
151
 
data/lib/pumi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pumi
2
- VERSION = "0.25.0".freeze
2
+ VERSION = "0.26.0".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pumi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.25.0
4
+ version: 0.26.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Wilkie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-19 00:00:00.000000000 Z
11
+ date: 2024-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler