iev 0.4.4 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/iev/exporter.rb CHANGED
@@ -55,7 +55,9 @@ module Iev
55
55
  dataset = load_dataset
56
56
  collection = build_collection(dataset)
57
57
  add_subject_area_concepts(collection) if @include_areas
58
+ build_section_narrower_relations(collection) if @include_areas
58
59
  save_collection(collection)
60
+ save_register
59
61
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
60
62
 
61
63
  @stats = {
@@ -85,7 +87,7 @@ module Iev
85
87
 
86
88
  exts = (XLSX_EXTENSIONS + SQLITE_EXTENSIONS).join(", ")
87
89
  raise ArgumentError,
88
- "Unsupported format: #{input_path.extname}. Supported: #{exts}"
90
+ "Unsupported format: #{input_path.extname}. Supported: #{exts}"
89
91
  end
90
92
 
91
93
  def input_format
@@ -139,13 +141,23 @@ module Iev
139
141
  term = TermBuilder.build_from(row)
140
142
  next unless term
141
143
 
144
+ # Parse IevCode once per concept — used by all helpers below.
145
+ code = IevCode.new(term.id)
146
+
142
147
  concept = concept_index[term.id] ||= begin
143
148
  c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
144
- c.data.domains = domain_references_for(term.id)
149
+ c.uuid = term.id
150
+ c.schema_version = "3"
151
+ c.data.domains = domain_references_for(code)
152
+ c.data.tags = tags_for(code)
153
+ add_section_broader(c, code)
145
154
  collection.store(c)
146
155
  c
147
156
  end
148
157
  concept.add_l10n(term)
158
+
159
+ promote_supersession(concept, term)
160
+ set_managed_status(concept, term)
149
161
  end
150
162
 
151
163
  collection
@@ -160,21 +172,181 @@ module Iev
160
172
  def save_collection(collection)
161
173
  concepts_dir = output_dir.expand_path.join("concepts")
162
174
  FileUtils.mkdir_p(concepts_dir)
163
- collection.save_to_files(concepts_dir.to_s)
175
+ collection.save_grouped_concepts_to_files(concepts_dir.to_s)
176
+ end
177
+
178
+ def save_register
179
+ areas = SubjectAreas.all
180
+ sections = build_section_tree(areas)
181
+
182
+ register = Glossarist::DatasetRegister.new(
183
+ schema_type: "glossarist",
184
+ schema_version: "3",
185
+ id: "iev",
186
+ ref: "IEC 60050:2011",
187
+ year: 2011,
188
+ urn: IEV_SOURCE,
189
+ urn_aliases: ["#{IEV_SOURCE}*"],
190
+ status: "current",
191
+ owner: "IEC",
192
+ source_repo: "https://github.com/glossarist/iev-data",
193
+ tags: %w[electrotechnical vocabulary iec],
194
+ languages: %w[eng fra],
195
+ language_order: %w[eng fra],
196
+ ordering: "systematic",
197
+ sections: sections,
198
+ )
199
+
200
+ register_path = output_dir.expand_path.join("register.yaml")
201
+ FileUtils.mkdir_p(register_path.dirname)
202
+ File.write(register_path, register.to_yaml, encoding: "utf-8")
203
+ puts "Written register.yaml with #{sections.length} areas" if $stdout.tty?
204
+ end
205
+
206
+ def build_section_tree(areas)
207
+ areas.sort_by { |a| a.code.to_i }.map do |area|
208
+ children = area.sections.sort_by do |s|
209
+ s.code.split("-").map(&:to_i)
210
+ end.map do |sec|
211
+ Glossarist::Section.new(
212
+ id: sec.code,
213
+ names: { "eng" => sec.title },
214
+ )
215
+ end
216
+
217
+ Glossarist::Section.new(
218
+ id: area.code,
219
+ names: { "eng" => area.title },
220
+ children: children.empty? ? nil : children,
221
+ )
222
+ end
164
223
  end
165
224
 
166
225
  def localized_count(collection)
167
226
  collection.sum { |c| c.localized_concepts.count }
168
227
  end
169
228
 
170
- def domain_references_for(ievref)
171
- parts = ievref.to_s.split("-")
172
- return [] unless parts.length >= 2
229
+ # Build domain ConceptReferences for a concept.
230
+ #
231
+ # Per the concept model, ConceptReferenceType distinguishes:
232
+ # - "domain" → thematic/subject-area classification (area level)
233
+ # - "section" → structural section membership (section level)
234
+ #
235
+ # Every concept gets both: a "domain" ref to its area and a "section"
236
+ # ref to its section. Concepts with only an area code (no section)
237
+ # get only a "domain" ref.
238
+ #
239
+ # @param code [IevCode] pre-parsed IEV code
240
+ # @return [Array<Glossarist::ConceptReference>]
241
+ def domain_references_for(code)
242
+ refs = []
243
+
244
+ # Domain reference: thematic classification at the area level
245
+ refs << domain_ref(code.area_uri)
246
+
247
+ # Section reference: structural membership in the section
248
+ if code.section_code
249
+ refs << section_ref(code.section_uri)
250
+ end
251
+
252
+ refs
253
+ end
254
+
255
+ # @param code [IevCode] pre-parsed IEV code
256
+ # @return [Array<String>]
257
+ def tags_for(code)
258
+ tags = []
259
+ area = SubjectAreas.find_area(code.area_code)
260
+ tags << area.title if area
261
+ section = code.section_code && SubjectAreas.find_section(code.section_code)
262
+ tags << section.title if section
263
+ tags
264
+ end
265
+
266
+ # @param concept [Glossarist::ManagedConcept]
267
+ # @param code [IevCode] pre-parsed IEV code
268
+ def add_section_broader(concept, code)
269
+ return unless code.section_uri
270
+
271
+ concept.related ||= []
272
+ return if concept.related.any? do |r|
273
+ r.type == "broader" && r.ref&.id == code.section_uri
274
+ end
275
+
276
+ concept.related << Glossarist::RelatedConcept.new(
277
+ type: "broader",
278
+ content: code.section_uri,
279
+ ref: Glossarist::ConceptRef.new(source: "IEV", id: code.section_uri),
280
+ )
281
+ end
282
+
283
+ def build_section_narrower_relations(collection)
284
+ mc_index = collection.each_with_object({}) do |c, h|
285
+ h[c.data&.id] = c if c.data&.id
286
+ end
287
+
288
+ section_children = {}
289
+ mc_index.each_key do |concept_id|
290
+ code = IevCode.new(concept_id)
291
+ next unless code.section_uri
292
+
293
+ (section_children[code.section_uri] ||= []) << concept_id
294
+ end
295
+
296
+ section_children.each do |section_uri, child_ids|
297
+ section_mc = mc_index[section_uri]
298
+ next unless section_mc
299
+
300
+ narrower = child_ids.sort.map do |child_id|
301
+ Glossarist::RelatedConcept.new(
302
+ type: "narrower",
303
+ content: child_id,
304
+ ref: Glossarist::ConceptRef.new(source: "IEV", id: child_id),
305
+ )
306
+ end
307
+
308
+ section_mc.related ||= []
309
+ section_mc.related.concat(narrower)
310
+ end
311
+ end
312
+
313
+ # Promote supersedes relations from localized ConceptData to managed level.
314
+ # Supersession is language-independent (REPLACES column is per-concept).
315
+ def promote_supersession(concept, term)
316
+ related = term.data&.related
317
+ return unless related&.any?
318
+
319
+ concept.related ||= []
320
+ related.each do |r|
321
+ next if concept.related.any? do |er|
322
+ er.type == r.type && er.ref&.id == r.ref&.id
323
+ end
324
+
325
+ concept.related << r
326
+ end
327
+ term.data.related = nil
328
+ end
329
+
330
+ # Derive managed concept status from the localization's entry_status.
331
+ def set_managed_status(concept, term)
332
+ return if concept.status
333
+
334
+ status = term.entry_status
335
+ concept.status = status if status && !status.empty?
336
+ end
337
+
338
+ # --- ConceptReference factory helpers ---
339
+
340
+ def domain_ref(concept_id)
341
+ ref = Glossarist::ConceptReference.domain(concept_id)
342
+ ref.source = IEV_SOURCE
343
+ ref
344
+ end
173
345
 
174
- [
175
- SubjectAreas.area_uri(parts[0]),
176
- SubjectAreas.section_uri(parts[0..1].join("-")),
177
- ].map { |id| Glossarist::ConceptReference.domain(id) }
346
+ def section_ref(concept_id)
347
+ ref = Glossarist::ConceptReference.section(concept_id)
348
+ ref.source = IEV_SOURCE
349
+ ref
178
350
  end
179
351
  end
180
352
  end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Iev
4
+ # Immutable value object that decomposes an IEV concept code
5
+ # into its structural parts: area code, section code, and number.
6
+ #
7
+ # The IEV code format is AAA-BB-CC where:
8
+ # AAA = area code (e.g. "103")
9
+ # BB = section sub-code (e.g. "01")
10
+ # CC = concept number (e.g. "02")
11
+ #
12
+ # @example Full concept code
13
+ # code = Iev::IevCode.new("103-01-02")
14
+ # code.area_code #=> "103"
15
+ # code.section_code #=> "103-01"
16
+ # code.number #=> "02"
17
+ # code.area_uri #=> "area-103"
18
+ # code.section_uri #=> "section-103-01"
19
+ #
20
+ # @example Section code (no concept number)
21
+ # code = Iev::IevCode.new("103-01")
22
+ # code.area_code #=> "103"
23
+ # code.section_code #=> "103-01"
24
+ # code.number #=> nil
25
+ # code.section_uri #=> "section-103-01"
26
+ #
27
+ class IevCode
28
+ include Comparable
29
+
30
+ attr_reader :raw, :area_code, :section_code, :number
31
+
32
+ # @param code [#to_s] IEV reference, e.g. "103-01-02"
33
+ def initialize(code)
34
+ @raw = code.to_s
35
+ parts = @raw.split("-")
36
+ @area_code = parts[0]
37
+ @section_code = parts.length >= 2 ? "#{parts[0]}-#{parts[1]}" : nil
38
+ @number = parts.length >= 3 ? parts[2] : nil
39
+ freeze
40
+ end
41
+
42
+ def area_uri
43
+ "area-#{area_code}"
44
+ end
45
+
46
+ def section_uri
47
+ "section-#{section_code}" if section_code
48
+ end
49
+
50
+ def to_s
51
+ @raw
52
+ end
53
+
54
+ def to_str
55
+ @raw
56
+ end
57
+
58
+ def ==(other)
59
+ other.is_a?(self.class) && raw == other.raw
60
+ end
61
+ alias_method :eql?, :==
62
+
63
+ def hash
64
+ raw.hash
65
+ end
66
+
67
+ def <=>(other)
68
+ to_s <=> other.to_s
69
+ end
70
+
71
+ # Safe constructor that returns nil for codes that don't parse.
72
+ # @param code [#to_s]
73
+ # @return [IevCode, nil]
74
+ def self.parse(code)
75
+ new(code)
76
+ rescue ArgumentError
77
+ nil
78
+ end
79
+ end
80
+ end
@@ -6,7 +6,8 @@
6
6
  module Iev
7
7
  # @todo This needs to be rewritten.
8
8
  class Iso639Code
9
- COUNTRY_CODES = YAML.load(IO.read(File.join(__dir__, "iso_639_2.yaml")))
9
+ COUNTRY_CODES = YAML.safe_load_file(File.join(__dir__,
10
+ "iso_639_2.yaml"), permitted_classes: [Symbol]).freeze
10
11
  # rubocop:disable Style/MutableConstant
11
12
  THREE_CHAR_MEMO = {} # Memoization cache, must be mutable
12
13
  # rubocop:enable Style/MutableConstant
@@ -36,7 +36,7 @@ module Iev
36
36
  rescue StandardError
37
37
  raise unless curr_attempt <= attempts
38
38
 
39
- sleep(2**curr_attempt * 0.1)
39
+ sleep((2**curr_attempt) * 0.1)
40
40
  curr_attempt += 1
41
41
  retry
42
42
  end
@@ -3,100 +3,102 @@
3
3
  require "ferrum"
4
4
 
5
5
  module Iev
6
- # Shared headless browser utilities for fetching pages behind AWS WAF.
7
- module ScraperBrowser
8
- USER_AGENT_PROFILES = [
9
- {
10
- user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
11
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
12
- "Chrome/131.0.0.0 Safari/537.36",
13
- platform: '"macOS"',
14
- chrome_version: "131",
15
- },
16
- {
17
- user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
18
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
19
- "Chrome/130.0.0.0 Safari/537.36",
20
- platform: '"Windows"',
21
- chrome_version: "130",
22
- },
23
- {
24
- user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
25
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
26
- "Chrome/131.0.0.0 Safari/537.36",
27
- platform: '"Linux"',
28
- chrome_version: "131",
29
- },
30
- {
31
- user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
32
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
33
- "Chrome/129.0.0.0 Safari/537.36",
34
- platform: '"macOS"',
35
- chrome_version: "129",
36
- },
37
- {
38
- user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
39
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
40
- "Chrome/131.0.0.0 Safari/537.36",
41
- platform: '"Windows"',
42
- chrome_version: "131",
43
- },
44
- ].freeze
45
-
46
- # Fetch a URL using headless Chrome, returning the page HTML.
47
- # Handles AWS WAF challenge pages by waiting for JS execution.
48
- def self.fetch(url, browser_opts: {})
49
- browser = Ferrum::Browser.new(
50
- headless: "new",
51
- timeout: 30,
52
- window_size: [1366, 768],
53
- browser_options: {
54
- "disable-blink-features" => "AutomationControlled",
6
+ class Scraper
7
+ # Shared headless browser utilities for fetching pages behind AWS WAF.
8
+ module Browser
9
+ USER_AGENT_PROFILES = [
10
+ {
11
+ user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
12
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
13
+ "Chrome/131.0.0.0 Safari/537.36",
14
+ platform: '"macOS"',
15
+ chrome_version: "131",
16
+ },
17
+ {
18
+ user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
19
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
20
+ "Chrome/130.0.0.0 Safari/537.36",
21
+ platform: '"Windows"',
22
+ chrome_version: "130",
23
+ },
24
+ {
25
+ user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
26
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
27
+ "Chrome/131.0.0.0 Safari/537.36",
28
+ platform: '"Linux"',
29
+ chrome_version: "131",
30
+ },
31
+ {
32
+ user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
33
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
34
+ "Chrome/129.0.0.0 Safari/537.36",
35
+ platform: '"macOS"',
36
+ chrome_version: "129",
37
+ },
38
+ {
39
+ user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
40
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
41
+ "Chrome/131.0.0.0 Safari/537.36",
42
+ platform: '"Windows"',
43
+ chrome_version: "131",
55
44
  },
56
- **browser_opts,
57
- )
45
+ ].freeze
58
46
 
59
- browser.headers.set(random_headers)
60
- browser.go_to(url)
61
- browser.network.wait_for_idle(timeout: 15)
62
- html = browser.body
47
+ # Fetch a URL using headless Chrome, returning the page HTML.
48
+ # Handles AWS WAF challenge pages by waiting for JS execution.
49
+ def self.fetch(url, browser_opts: {})
50
+ browser = Ferrum::Browser.new(
51
+ headless: "new",
52
+ timeout: 30,
53
+ window_size: [1366, 768],
54
+ browser_options: {
55
+ "disable-blink-features" => "AutomationControlled",
56
+ },
57
+ **browser_opts,
58
+ )
63
59
 
64
- if html.include?("403 ERROR") || html.include?("Request blocked")
65
- warn "IEV: AWS WAF blocked request for #{url}"
66
- return nil
67
- end
60
+ browser.headers.set(random_headers)
61
+ browser.go_to(url)
62
+ browser.network.wait_for_idle(timeout: 15)
63
+ html = browser.body
68
64
 
69
- html
70
- rescue Ferrum::Error, Ferrum::BrowserError => e
71
- warn "IEV: Browser error fetching #{url}: #{e.message}"
72
- nil
73
- ensure
74
- browser&.quit
75
- end
65
+ if html.include?("403 ERROR") || html.include?("Request blocked")
66
+ warn "IEV: AWS WAF blocked request for #{url}"
67
+ return nil
68
+ end
76
69
 
77
- def self.random_headers
78
- profile = USER_AGENT_PROFILES.sample
79
- sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
80
- "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
81
- "\"Not_A Brand\";v=\"24\""
70
+ html
71
+ rescue Ferrum::Error, Ferrum::BrowserError => e
72
+ warn "IEV: Browser error fetching #{url}: #{e.message}"
73
+ nil
74
+ ensure
75
+ browser&.quit
76
+ end
82
77
 
83
- {
84
- "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
85
- "image/avif,image/webp,image/apng,*/*;q=0.8," \
86
- "application/signed-exchange;v=b3;q=0.7",
87
- "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
88
- "Cache-Control" => "no-cache",
89
- "Pragma" => "no-cache",
90
- "Sec-Ch-Ua" => sec_ch_ua,
91
- "Sec-Ch-Ua-Mobile" => "?0",
92
- "Sec-Ch-Ua-Platform" => profile[:platform],
93
- "Sec-Fetch-Dest" => "document",
94
- "Sec-Fetch-Mode" => "navigate",
95
- "Sec-Fetch-Site" => "cross-site",
96
- "Sec-Fetch-User" => "?1",
97
- "Upgrade-Insecure-Requests" => "1",
98
- "User-Agent" => profile[:user_agent],
99
- }
78
+ def self.random_headers
79
+ profile = USER_AGENT_PROFILES.sample
80
+ sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
81
+ "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
82
+ "\"Not_A Brand\";v=\"24\""
83
+
84
+ {
85
+ "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
86
+ "image/avif,image/webp,image/apng,*/*;q=0.8," \
87
+ "application/signed-exchange;v=b3;q=0.7",
88
+ "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
89
+ "Cache-Control" => "no-cache",
90
+ "Pragma" => "no-cache",
91
+ "Sec-Ch-Ua" => sec_ch_ua,
92
+ "Sec-Ch-Ua-Mobile" => "?0",
93
+ "Sec-Ch-Ua-Platform" => profile[:platform],
94
+ "Sec-Fetch-Dest" => "document",
95
+ "Sec-Fetch-Mode" => "navigate",
96
+ "Sec-Fetch-Site" => "cross-site",
97
+ "Sec-Fetch-User" => "?1",
98
+ "Upgrade-Insecure-Requests" => "1",
99
+ "User-Agent" => profile[:user_agent],
100
+ }
101
+ end
100
102
  end
101
103
  end
102
104
  end
data/lib/iev/scraper.rb CHANGED
@@ -4,6 +4,9 @@ require "nokogiri"
4
4
 
5
5
  module Iev
6
6
  class Scraper
7
+ autoload :Browser, "iev/scraper/browser"
8
+ autoload :PageParser, "iev/scraper/page_parser"
9
+
7
10
  BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
8
11
  "display?openform&ievref="
9
12
 
@@ -14,7 +17,8 @@ module Iev
14
17
  # Fetch the Electropedia page HTML for a given IEV code.
15
18
  # Returns a Nokogiri document.
16
19
  def fetch_page(code)
17
- html = ScraperBrowser.fetch("#{BASE_URL}#{code}", browser_opts: @browser_opts)
20
+ html = Browser.fetch("#{BASE_URL}#{code}",
21
+ browser_opts: @browser_opts)
18
22
  return nil unless html
19
23
 
20
24
  Nokogiri::HTML(html)
@@ -30,6 +34,3 @@ module Iev
30
34
  end
31
35
  end
32
36
  end
33
-
34
- require_relative "scraper/browser"
35
- require_relative "scraper/page_parser"
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Iev
4
+ # Immutable value object representing an IEV section (e.g. "103-01").
5
+ #
6
+ # A section belongs to exactly one area, identified by +area_code+.
7
+ class Section
8
+ attr_reader :code, :title, :area_code
9
+
10
+ # @param code [#to_s] section code, e.g. "103-01"
11
+ # @param title [#to_s] section title, e.g. "General concepts on functions"
12
+ # @param area_code [#to_s] parent area code, e.g. "103"
13
+ def initialize(code:, title:, area_code:)
14
+ @code = code.to_s
15
+ @title = title.to_s
16
+ @area_code = area_code.to_s
17
+ freeze
18
+ end
19
+
20
+ def uri
21
+ "section-#{code}"
22
+ end
23
+
24
+ def to_h
25
+ { "code" => code, "title" => title }
26
+ end
27
+
28
+ def ==(other)
29
+ other.is_a?(self.class) && code == other.code
30
+ end
31
+ alias_method :eql?, :==
32
+
33
+ def hash
34
+ code.hash
35
+ end
36
+ end
37
+ end