pumi 0.25.0 → 0.26.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,3 @@
1
1
  module Pumi
2
- AdministrativeUnit = Struct.new(:name_km, :name_latin, :name_en, keyword_init: true)
2
+ AdministrativeUnit = Struct.new(:name_km, :name_latin, :name_en, :name_ungegn, keyword_init: true)
3
3
  end
@@ -1,4 +1,5 @@
1
1
  require "ostruct"
2
+ require "nokogiri"
2
3
 
3
4
  module Pumi
4
5
  module Bot
@@ -105,7 +106,7 @@ module Pumi
105
106
  def replace_communes_list
106
107
  provinces = Pumi::Province.all.map { |province| Province.new(province) }
107
108
  data = OpenStruct.new(provinces:)
108
- communes_list = ERB.new(TEMPLATE).result(data.instance_eval { binding })
109
+ communes_list = ERB.new(TEMPLATE, trim_mode: "-").result(data.instance_eval { binding })
109
110
  source.sub!(communes_list_section.to_html, communes_list)
110
111
  end
111
112
 
@@ -1,4 +1,5 @@
1
1
  require "ostruct"
2
+ require "erb"
2
3
 
3
4
  module Pumi
4
5
  module Bot
@@ -12,15 +13,10 @@ module Pumi
12
13
  Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
13
14
 
14
15
  MISSPELLINGS = [
15
- Misspelling.new(incorrect_text: "Kratié", correct_text: "Kratie"),
16
- Misspelling.new(incorrect_text: "Mondulkiri", correct_text: "Mondul Kiri"),
17
16
  Misspelling.new(
18
17
  incorrect_text: "Phnom Penh (autonomous municipality)",
19
18
  correct_text: "Phnom Penh"
20
- ),
21
- Misspelling.new(incorrect_text: "Ratanakiri", correct_text: "Ratanak Kiri"),
22
- Misspelling.new(incorrect_text: "Siem Reap", correct_text: "Siemreap"),
23
- Misspelling.new(incorrect_text: "Takéo", correct_text: "Takeo")
19
+ )
24
20
  ].freeze
25
21
 
26
22
  def publish
@@ -49,7 +45,7 @@ module Pumi
49
45
  districts:,
50
46
  districts_summary: generate_districts_summary(districts:)
51
47
  )
52
- result = ERB.new(DISTRICTS_TEMPLATE).result(data.instance_eval { binding })
48
+ result = ERB.new(DISTRICTS_TEMPLATE, trim_mode: "-").result(data.instance_eval { binding })
53
49
  "\n\n#{result}\n"
54
50
  end
55
51
 
@@ -2,7 +2,7 @@
2
2
  <% provinces.each do |province| %>
3
3
  <% province_page = URI.parse(province.links[:wikipedia]).path.split("/").last %>
4
4
  ==[[<%= province_page %>|<%= province.full_name_en %>]]==
5
- <div id=province-communes-<%= province.id %>>
5
+ <div id="province-communes-<%= province.id %>">
6
6
  <%= province.name_en %> contains <%= province.communes_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
7
7
 
8
8
  <% province.districts.each do |district| %>
@@ -13,7 +13,7 @@
13
13
  ===<%= district.full_name_en %>===
14
14
  <% end %>
15
15
 
16
- <div id=district-communes-<%= district.id %>>
16
+ <div id="district-communes-<%= district.id %>">
17
17
  <%= district.name_en %> contains <%= district.communes_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/district.castle?ds=<%= district.id %> |title=<%= district.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
18
18
 
19
19
  {| class="wikitable sortable"
@@ -21,26 +21,35 @@
21
21
  ! #
22
22
  ! Name
23
23
  ! Khmer
24
+ ! [[Romanization of Khmer#UNGEGN|UNGEGN]]
24
25
  ! Administrative Unit
25
26
  ! Geocode
26
27
  |-
27
-
28
28
  <% district.communes.each_with_index do |commune, index| %>
29
29
  | <%= index + 1 %>
30
- <% if commune.links[:wikipedia] %>
31
- <% commune_page = URI.parse(commune.links[:wikipedia]).path.split("/").last.gsub("_", " ") %>
30
+ <% if commune.links[:wikipedia] -%>
31
+ <% commune_page = URI.parse(commune.links[:wikipedia]).path.split("/").last -%>
32
+ <% if commune_page.gsub("_", " ") == commune.name_en -%>
33
+ | [[<%= commune.name_en %>]]
34
+ <% else -%>
32
35
  | [[<%= commune_page %>|<%= commune.name_en %>]]
33
- <% else %>
36
+ <% end -%>
37
+ <% else -%>
34
38
  | <%= commune.name_en %>
35
- <% end %>
39
+ <% end -%>
36
40
  | <%= commune.name_km %>
41
+ <% if commune.name_ungegn -%>
42
+ | {{transliteration|km|<%= commune.name_ungegn %>}}
43
+ <% else -%>
44
+ |
45
+ <% end -%>
37
46
  | <%= "#{commune.administrative_unit.name_en} (#{commune.administrative_unit.name_km} #{commune.administrative_unit.name_latin})" %>
38
47
  | <%= commune.id %>
39
48
  |-
40
- <% end %>
49
+ <% end -%>
41
50
  |}
42
51
  </div>
43
- <% end %>
52
+ <% end -%>
44
53
  </div>
45
- <% end %>
54
+ <% end -%>
46
55
  </div>
@@ -1,27 +1,36 @@
1
- <div id=province-districts-<%= province.id %>>
2
- <%= province.name_en %> contains <%= districts_summary %>. <ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
1
+ <div id="province-districts-<%= province.id %>">
2
+ <%= province.name_en %> contains <%= districts_summary %>.<ref>{{cite web|url=http://db.ncdd.gov.kh/gazetteer/view/province.castle?pv=<%= province.id %> |title=<%= province.name_en %> |publisher=National Committee for Sub-National Democratic Development }}</ref>
3
3
 
4
4
  {| class="wikitable sortable"
5
5
  |-
6
6
  ! #
7
7
  ! Name
8
8
  ! Khmer
9
+ ! [[Romanization of Khmer#UNGEGN|UNGEGN]]
9
10
  ! Administrative Unit
10
11
  ! Geocode
11
12
  |-
12
-
13
13
  <% districts.each_with_index do |district, index| %>
14
14
  | <%= index + 1 %>
15
- <% if district.links[:wikipedia] %>
16
- <% district_page = URI.parse(district.links[:wikipedia]).path.split("/").last.gsub("_", " ") %>
15
+ <% if district.links[:wikipedia] -%>
16
+ <% district_page = URI.parse(district.links[:wikipedia]).path.split("/").last -%>
17
+ <% if district_page.gsub("_", " ") == district.name_en -%>
18
+ | [[<%= district.name_en %>]]
19
+ <% else -%>
17
20
  | [[<%= district_page %>|<%= district.name_en %>]]
18
- <% else %>
21
+ <% end -%>
22
+ <% else -%>
19
23
  | <%= district.name_en %>
20
- <% end %>
24
+ <% end -%>
21
25
  | <%= district.name_km %>
26
+ <% if district.name_ungegn -%>
27
+ | {{transliteration|km|<%= district.name_ungegn %>}}
28
+ <% else -%>
29
+ |
30
+ <% end -%>
22
31
  | <%= "#{district.administrative_unit.name_en} (#{district.administrative_unit.name_km} #{district.administrative_unit.name_latin})" %>
23
32
  | <%= district.id %>
24
33
  |-
25
- <% end %>
34
+ <% end -%>
26
35
  |}
27
36
  </div>
@@ -14,7 +14,14 @@ module Pumi
14
14
  "1715" => { type: "ក្រុង" }
15
15
  }.freeze
16
16
 
17
- AdministrativeUnit = Struct.new(:en, :km, :latin, :code_length, :group, :type, keyword_init: true)
17
+ Misspelling = Struct.new(:incorrect_text, :correct_text, keyword_init: true)
18
+
19
+ MISSPELLINGS = [
20
+ Misspelling.new(incorrect_text: "Siem Reab", correct_text: "Siem Reap"),
21
+ Misspelling.new(incorrect_text: "Aoral", correct_text: "Aural")
22
+ ].freeze
23
+
24
+ AdministrativeUnit = Struct.new(:en, :km, :latin, :ungegn, :code_length, :group, :type, keyword_init: true)
18
25
  Row = Struct.new(:code, :name_km, :name_latin, :type, keyword_init: true) do
19
26
  def administrative_unit
20
27
  ADMINISTRATIVE_UNITS.fetch(type)
@@ -22,12 +29,54 @@ module Pumi
22
29
  end
23
30
 
24
31
  ADMINISTRATIVE_UNITS = {
25
- "ស្រុក" => AdministrativeUnit.new(en: "District", km: "ស្រុក", latin: "Srok", code_length: 4, group: "districts"),
26
- "ខណ្ឌ" => AdministrativeUnit.new(en: "Section", km: "ខណ្ឌ", latin: "Khan", code_length: 4, group: "districts"),
27
- "ក្រុង" => AdministrativeUnit.new(en: "Municipality", km: "ក្រុង", latin: "Krong", code_length: 4, group: "districts"),
28
- "ឃុំ" => AdministrativeUnit.new(en: "Commune", km: "ឃុំ", latin: "Khum", code_length: 6, group: "communes"),
29
- "សង្កាត់" => AdministrativeUnit.new(en: "Quarter", km: "សង្កាត់", latin: "Sangkat", code_length: 6, group: "communes"),
30
- "ភូមិ" => AdministrativeUnit.new(en: "Village", km: "ភូមិ", latin: "Phum", code_length: 8, group: "villages")
32
+ "ស្រុក" => AdministrativeUnit.new(
33
+ en: "District",
34
+ km: "ស្រុក",
35
+ latin: "Srok",
36
+ ungegn: "Srŏk",
37
+ code_length: 4,
38
+ group: "districts"
39
+ ),
40
+ "ខណ្ឌ" => AdministrativeUnit.new(
41
+ en: "Section",
42
+ km: "ខណ្ឌ",
43
+ latin: "Khan",
44
+ ungegn: "Khând",
45
+ code_length: 4,
46
+ group: "districts"
47
+ ),
48
+ "ក្រុង" => AdministrativeUnit.new(
49
+ en: "Municipality",
50
+ km: "ក្រុង",
51
+ latin: "Krong",
52
+ ungegn: "Krŏng",
53
+ code_length: 4,
54
+ group: "districts"
55
+ ),
56
+ "ឃុំ" => AdministrativeUnit.new(
57
+ en: "Commune",
58
+ km: "ឃុំ",
59
+ latin: "Khum",
60
+ ungegn: "Khŭm",
61
+ code_length: 6,
62
+ group: "communes"
63
+ ),
64
+ "សង្កាត់" => AdministrativeUnit.new(
65
+ en: "Quarter",
66
+ km: "សង្កាត់",
67
+ latin: "Sangkat",
68
+ ungegn: "Sângkéat",
69
+ code_length: 6,
70
+ group: "communes"
71
+ ),
72
+ "ភូមិ" => AdministrativeUnit.new(
73
+ en: "Village",
74
+ km: "ភូមិ",
75
+ latin: "Phum",
76
+ ungegn: "Phum",
77
+ code_length: 8,
78
+ group: "villages"
79
+ )
31
80
  }.freeze
32
81
 
33
82
  attr_accessor :existing_data
@@ -66,10 +115,13 @@ module Pumi
66
115
  def build_row(row)
67
116
  code = parse_location_code(row)
68
117
 
118
+ name_latin = row.fetch("name_latin")
119
+ name_latin = MISSPELLINGS.find { |m| m.incorrect_text == name_latin }&.correct_text || name_latin
120
+
69
121
  Row.new(
70
122
  code:,
71
123
  name_km: row.fetch("name_km"),
72
- name_latin: row.fetch("name_latin"),
124
+ name_latin:,
73
125
  type: row.fetch("type") || MISSING_DATA.dig(code, :type)
74
126
  )
75
127
  end
@@ -85,15 +137,17 @@ module Pumi
85
137
  def add_data(row)
86
138
  data[row.administrative_unit.group] ||= {}
87
139
  data[row.administrative_unit.group][row.code] = existing_data.dig(row.administrative_unit.group, row.code) || {}
140
+ data[row.administrative_unit.group][row.code]["name"] = existing_data.dig(row.administrative_unit.group, row.code, "name") || {}
141
+ data[row.administrative_unit.group][row.code]["name"].merge!(
142
+ "km" => row.name_km,
143
+ "latin" => row.name_latin
144
+ )
88
145
  data[row.administrative_unit.group][row.code].merge!(
89
- "name" => {
90
- "km" => row.name_km,
91
- "latin" => row.name_latin
92
- },
93
146
  "administrative_unit" => {
94
147
  "km" => row.administrative_unit.km,
95
148
  "latin" => row.administrative_unit.latin,
96
- "en" => row.administrative_unit.en
149
+ "en" => row.administrative_unit.en,
150
+ "ungegn" => row.administrative_unit.ungegn
97
151
  }
98
152
  )
99
153
  end
@@ -16,8 +16,12 @@ module Pumi
16
16
  location_data = scraped_data.find { |location| location.code == code }
17
17
  next unless location_data
18
18
 
19
- attributes["links"] ||= {}
20
- attributes["links"]["wikipedia"] = location_data.wikipedia
19
+ if location_data.wikipedia
20
+ attributes["links"] ||= {}
21
+ attributes["links"]["wikipedia"] = location_data.wikipedia
22
+ end
23
+
24
+ attributes["name"]["ungegn"] = location_data.name_ungegn if location_data.name_ungegn
21
25
  end
22
26
 
23
27
  write_data!(output_dir)
@@ -37,7 +41,7 @@ module Pumi
37
41
  data_file.write(data, data_directory:)
38
42
  end
39
43
 
40
- ScraperResult = Struct.new(:code, :wikipedia, keyword_init: true)
44
+ ScraperResult = Struct.new(:code, :wikipedia, :name_ungegn, keyword_init: true)
41
45
 
42
46
  class WebScraper
43
47
  class ElementNotFoundError < StandardError; end
@@ -58,7 +62,11 @@ module Pumi
58
62
 
59
63
  def scrape!
60
64
  Province.all.each_with_object([]) do |province, result|
61
- result << ScraperResult.new(code: province.id, wikipedia: find_url(province))
65
+ result << ScraperResult.new(
66
+ code: province.id,
67
+ wikipedia: find_url(province),
68
+ name_ungegn: find_ungegn(province)
69
+ )
62
70
  end
63
71
  end
64
72
 
@@ -69,14 +77,25 @@ module Pumi
69
77
  end
70
78
 
71
79
  def find_url(province)
80
+ td = find_khmer_name_td(province)
81
+ link = td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
82
+ URI.join(URL, link[:href]).to_s
83
+ end
84
+
85
+ def find_ungegn(province)
86
+ td = find_khmer_name_td(province)
87
+ td.at_xpath("following-sibling::td/span[contains(@title, 'Khmer-language romanization')]")&.text
88
+ end
89
+
90
+ def find_khmer_name_td(province)
72
91
  td = province_table_rows.at_xpath("child::td[contains(., '#{province.name_km}')]")
92
+
73
93
  if td.nil?
74
94
  raise WebScraper::ElementNotFoundError,
75
95
  "No cell containing '#{province.name_km}' was found in a table on #{URL}"
76
96
  end
77
97
 
78
- link = td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
79
- URI.join(URL, link[:href]).to_s
98
+ td
80
99
  end
81
100
 
82
101
  def province_table_rows
@@ -99,10 +118,11 @@ module Pumi
99
118
 
100
119
  def scrape!
101
120
  District.all.each_with_object([]) do |district, result|
102
- url = find_url(district)
103
- next unless url
104
-
105
- result << ScraperResult.new(code: district.id, wikipedia: url)
121
+ result << ScraperResult.new(
122
+ code: district.id,
123
+ wikipedia: find_url(district),
124
+ name_ungegn: find_ungegn(district)
125
+ )
106
126
  end
107
127
  end
108
128
 
@@ -113,16 +133,28 @@ module Pumi
113
133
  end
114
134
 
115
135
  def find_url(district)
116
- geocode = scraper.page.at_xpath("//td[text()[contains(., '#{district.id}')]]")
136
+ geocode_td = find_geocode_td(district)
117
137
 
118
- return if geocode.nil?
138
+ return if geocode_td.nil?
119
139
 
120
- link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
140
+ link = geocode_td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
121
141
 
122
142
  return if link.nil?
123
143
 
124
144
  URI.join(URL, link[:href]).to_s
125
145
  end
146
+
147
+ def find_ungegn(district)
148
+ geocode_td = find_geocode_td(district)
149
+
150
+ return if geocode_td.nil?
151
+
152
+ geocode_td.at_xpath("preceding-sibling::td/span[contains(@title, 'Khmer-language romanization')]")&.text
153
+ end
154
+
155
+ def find_geocode_td(district)
156
+ scraper.page.at_xpath("//td[text()[contains(., '#{district.id}')]]")
157
+ end
126
158
  end
127
159
 
128
160
  class CambodianCommunesScraper
@@ -130,27 +162,40 @@ module Pumi
130
162
 
131
163
  def scrape!
132
164
  Commune.all.each_with_object([]) do |commune, result|
133
- url = find_url(commune)
134
- next if url.nil?
135
-
136
- result << ScraperResult.new(code: commune.id, wikipedia: url)
165
+ result << ScraperResult.new(
166
+ code: commune.id,
167
+ wikipedia: find_url(commune),
168
+ name_ungegn: find_ungegn(commune)
169
+ )
137
170
  end
138
171
  end
139
172
 
140
173
  private
141
174
 
142
175
  def find_url(commune)
143
- geocode = scraper.page.at_xpath("//td[text()[contains(., '#{commune.id}')]]")
176
+ geocode_td = find_geocode_td(commune)
144
177
 
145
- return if geocode.nil?
178
+ return if geocode_td.nil?
146
179
 
147
- link = geocode.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
180
+ link = geocode_td.at_xpath("preceding-sibling::td/a[contains(@href, '/wiki/')]")
148
181
 
149
182
  return if link.nil?
150
183
 
151
184
  URI.join(URL, link[:href]).to_s
152
185
  end
153
186
 
187
+ def find_ungegn(commune)
188
+ geocode_td = find_geocode_td(commune)
189
+
190
+ return if geocode_td.nil?
191
+
192
+ geocode_td.at_xpath("preceding-sibling::td/span[contains(@title, 'Khmer-language romanization')]")&.text
193
+ end
194
+
195
+ def find_geocode_td(commune)
196
+ scraper.page.at_xpath("//td[text()[contains(., '#{commune.id}')]]")
197
+ end
198
+
154
199
  def scraper
155
200
  @scraper ||= WebScraper.new(URL)
156
201
  end
data/lib/pumi/location.rb CHANGED
@@ -5,6 +5,7 @@ module Pumi
5
5
  :name_km, :full_name_km,
6
6
  :name_latin, :full_name_latin,
7
7
  :name_en, :full_name_en,
8
+ :name_ungegn, :full_name_ungegn,
8
9
  :address_km, :address_latin, :address_en,
9
10
  :administrative_unit,
10
11
  :links,
data/lib/pumi/parser.rb CHANGED
@@ -82,6 +82,7 @@ module Pumi
82
82
  name = attributes.fetch("name")
83
83
  name_km = name.fetch("km")
84
84
  name_latin = name.fetch("latin")
85
+ name_ungegn = name["ungegn"]
85
86
  administrative_unit = build_administrative_unit(
86
87
  attributes.fetch("administrative_unit")
87
88
  )
@@ -94,20 +95,25 @@ module Pumi
94
95
  id:,
95
96
  administrative_unit:,
96
97
  name_km:,
98
+ name_en: name_latin,
97
99
  name_latin:,
100
+ name_ungegn:,
98
101
  geodata:,
99
102
  iso3166_2: attributes["iso3166_2"],
100
103
  links: attributes.fetch("links", {}).transform_keys(&:to_sym),
101
- name_en: name_latin,
102
104
  full_name_km: [
103
105
  administrative_unit_name(name_km, administrative_unit.name_km),
104
106
  name_km
105
107
  ].compact.join,
108
+ full_name_en: [name_latin, administrative_unit.name_en].join(" "),
106
109
  full_name_latin: [
107
110
  administrative_unit_name(name_latin, administrative_unit.name_latin),
108
111
  name_latin
109
112
  ].compact.join(" "),
110
- full_name_en: [name_latin, administrative_unit.name_en].join(" ")
113
+ full_name_ungegn: name_ungegn && [
114
+ administrative_unit_name(name_ungegn, administrative_unit.name_ungegn),
115
+ name_ungegn
116
+ ].compact.join(" ")
111
117
  }
112
118
  end
113
119
 
@@ -138,7 +144,8 @@ module Pumi
138
144
  AdministrativeUnit.new(
139
145
  name_km: attributes.fetch("km"),
140
146
  name_latin: attributes.fetch("latin"),
141
- name_en: attributes.fetch("en")
147
+ name_en: attributes.fetch("en"),
148
+ name_ungegn: attributes.fetch("ungegn")
142
149
  )
143
150
  end
144
151
 
data/lib/pumi/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Pumi
2
- VERSION = "0.25.0".freeze
2
+ VERSION = "0.26.0".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pumi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.25.0
4
+ version: 0.26.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Wilkie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-19 00:00:00.000000000 Z
11
+ date: 2024-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler