iev 0.4.5 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,100 +3,102 @@
3
3
  require "ferrum"
4
4
 
5
5
  module Iev
6
- # Shared headless browser utilities for fetching pages behind AWS WAF.
7
- module ScraperBrowser
8
- USER_AGENT_PROFILES = [
9
- {
10
- user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
11
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
12
- "Chrome/131.0.0.0 Safari/537.36",
13
- platform: '"macOS"',
14
- chrome_version: "131",
15
- },
16
- {
17
- user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
18
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
19
- "Chrome/130.0.0.0 Safari/537.36",
20
- platform: '"Windows"',
21
- chrome_version: "130",
22
- },
23
- {
24
- user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
25
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
26
- "Chrome/131.0.0.0 Safari/537.36",
27
- platform: '"Linux"',
28
- chrome_version: "131",
29
- },
30
- {
31
- user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
32
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
33
- "Chrome/129.0.0.0 Safari/537.36",
34
- platform: '"macOS"',
35
- chrome_version: "129",
36
- },
37
- {
38
- user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
39
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
40
- "Chrome/131.0.0.0 Safari/537.36",
41
- platform: '"Windows"',
42
- chrome_version: "131",
43
- },
44
- ].freeze
45
-
46
- # Fetch a URL using headless Chrome, returning the page HTML.
47
- # Handles AWS WAF challenge pages by waiting for JS execution.
48
- def self.fetch(url, browser_opts: {})
49
- browser = Ferrum::Browser.new(
50
- headless: "new",
51
- timeout: 30,
52
- window_size: [1366, 768],
53
- browser_options: {
54
- "disable-blink-features" => "AutomationControlled",
6
+ class Scraper
7
+ # Shared headless browser utilities for fetching pages behind AWS WAF.
8
+ module Browser
9
+ USER_AGENT_PROFILES = [
10
+ {
11
+ user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
12
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
13
+ "Chrome/131.0.0.0 Safari/537.36",
14
+ platform: '"macOS"',
15
+ chrome_version: "131",
16
+ },
17
+ {
18
+ user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
19
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
20
+ "Chrome/130.0.0.0 Safari/537.36",
21
+ platform: '"Windows"',
22
+ chrome_version: "130",
23
+ },
24
+ {
25
+ user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
26
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
27
+ "Chrome/131.0.0.0 Safari/537.36",
28
+ platform: '"Linux"',
29
+ chrome_version: "131",
30
+ },
31
+ {
32
+ user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
33
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
34
+ "Chrome/129.0.0.0 Safari/537.36",
35
+ platform: '"macOS"',
36
+ chrome_version: "129",
37
+ },
38
+ {
39
+ user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
40
+ "AppleWebKit/537.36 (KHTML, like Gecko) " \
41
+ "Chrome/131.0.0.0 Safari/537.36",
42
+ platform: '"Windows"',
43
+ chrome_version: "131",
55
44
  },
56
- **browser_opts,
57
- )
45
+ ].freeze
58
46
 
59
- browser.headers.set(random_headers)
60
- browser.go_to(url)
61
- browser.network.wait_for_idle(timeout: 15)
62
- html = browser.body
47
+ # Fetch a URL using headless Chrome, returning the page HTML.
48
+ # Handles AWS WAF challenge pages by waiting for JS execution.
49
+ def self.fetch(url, browser_opts: {})
50
+ browser = Ferrum::Browser.new(
51
+ headless: "new",
52
+ timeout: 30,
53
+ window_size: [1366, 768],
54
+ browser_options: {
55
+ "disable-blink-features" => "AutomationControlled",
56
+ },
57
+ **browser_opts,
58
+ )
63
59
 
64
- if html.include?("403 ERROR") || html.include?("Request blocked")
65
- warn "IEV: AWS WAF blocked request for #{url}"
66
- return nil
67
- end
60
+ browser.headers.set(random_headers)
61
+ browser.go_to(url)
62
+ browser.network.wait_for_idle(timeout: 15)
63
+ html = browser.body
68
64
 
69
- html
70
- rescue Ferrum::Error, Ferrum::BrowserError => e
71
- warn "IEV: Browser error fetching #{url}: #{e.message}"
72
- nil
73
- ensure
74
- browser&.quit
75
- end
65
+ if html.include?("403 ERROR") || html.include?("Request blocked")
66
+ warn "IEV: AWS WAF blocked request for #{url}"
67
+ return nil
68
+ end
76
69
 
77
- def self.random_headers
78
- profile = USER_AGENT_PROFILES.sample
79
- sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
80
- "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
81
- "\"Not_A Brand\";v=\"24\""
70
+ html
71
+ rescue Ferrum::Error, Ferrum::BrowserError => e
72
+ warn "IEV: Browser error fetching #{url}: #{e.message}"
73
+ nil
74
+ ensure
75
+ browser&.quit
76
+ end
82
77
 
83
- {
84
- "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
85
- "image/avif,image/webp,image/apng,*/*;q=0.8," \
86
- "application/signed-exchange;v=b3;q=0.7",
87
- "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
88
- "Cache-Control" => "no-cache",
89
- "Pragma" => "no-cache",
90
- "Sec-Ch-Ua" => sec_ch_ua,
91
- "Sec-Ch-Ua-Mobile" => "?0",
92
- "Sec-Ch-Ua-Platform" => profile[:platform],
93
- "Sec-Fetch-Dest" => "document",
94
- "Sec-Fetch-Mode" => "navigate",
95
- "Sec-Fetch-Site" => "cross-site",
96
- "Sec-Fetch-User" => "?1",
97
- "Upgrade-Insecure-Requests" => "1",
98
- "User-Agent" => profile[:user_agent],
99
- }
78
+ def self.random_headers
79
+ profile = USER_AGENT_PROFILES.sample
80
+ sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
81
+ "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
82
+ "\"Not_A Brand\";v=\"24\""
83
+
84
+ {
85
+ "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
86
+ "image/avif,image/webp,image/apng,*/*;q=0.8," \
87
+ "application/signed-exchange;v=b3;q=0.7",
88
+ "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
89
+ "Cache-Control" => "no-cache",
90
+ "Pragma" => "no-cache",
91
+ "Sec-Ch-Ua" => sec_ch_ua,
92
+ "Sec-Ch-Ua-Mobile" => "?0",
93
+ "Sec-Ch-Ua-Platform" => profile[:platform],
94
+ "Sec-Fetch-Dest" => "document",
95
+ "Sec-Fetch-Mode" => "navigate",
96
+ "Sec-Fetch-Site" => "cross-site",
97
+ "Sec-Fetch-User" => "?1",
98
+ "Upgrade-Insecure-Requests" => "1",
99
+ "User-Agent" => profile[:user_agent],
100
+ }
101
+ end
100
102
  end
101
103
  end
102
104
  end
data/lib/iev/scraper.rb CHANGED
@@ -4,6 +4,9 @@ require "nokogiri"
4
4
 
5
5
  module Iev
6
6
  class Scraper
7
+ autoload :Browser, "iev/scraper/browser"
8
+ autoload :PageParser, "iev/scraper/page_parser"
9
+
7
10
  BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
8
11
  "display?openform&ievref="
9
12
 
@@ -14,7 +17,8 @@ module Iev
14
17
  # Fetch the Electropedia page HTML for a given IEV code.
15
18
  # Returns a Nokogiri document.
16
19
  def fetch_page(code)
17
- html = ScraperBrowser.fetch("#{BASE_URL}#{code}", browser_opts: @browser_opts)
20
+ html = Browser.fetch("#{BASE_URL}#{code}",
21
+ browser_opts: @browser_opts)
18
22
  return nil unless html
19
23
 
20
24
  Nokogiri::HTML(html)
@@ -30,6 +34,3 @@ module Iev
30
34
  end
31
35
  end
32
36
  end
33
-
34
- require_relative "scraper/browser"
35
- require_relative "scraper/page_parser"
@@ -12,6 +12,7 @@ module Iev
12
12
  # SourceParser.new(cell_data_string).parsed_sources
13
13
  class SourceParser
14
14
  include Utilities
15
+
15
16
  using DataConversions
16
17
 
17
18
  # When false, obtain_source_link skips Relaton network calls.
@@ -112,13 +113,13 @@ module Iev
112
113
  # IEC 62313:2009, 3.6, modifié
113
114
 
114
115
  str
115
- .gsub(/CEI/, "IEC")
116
- .gsub(/Guide IEC/, "IEC Guide")
116
+ .gsub("CEI", "IEC")
117
+ .gsub("Guide IEC", "IEC Guide")
117
118
  .gsub(%r{Guide ISO/IEC}, "ISO/IEC Guide")
118
- .gsub(/VEI/, "IEV")
119
- .gsub(/UIT/, "ITU")
120
- .gsub(/IUT-R/, "ITU-R")
121
- .gsub(/UTI-R/, "ITU-R")
119
+ .gsub("VEI", "IEV")
120
+ .gsub("UIT", "ITU")
121
+ .gsub("IUT-R", "ITU-R")
122
+ .gsub("UTI-R", "ITU-R")
122
123
  .gsub(/Recomm[ea]ndation ITU-T/, "ITU-T Recommendation")
123
124
  .gsub(/ITU-T (\w.\d{3}):(\d{4})/, 'ITU-T Recommendation \1 (\2)')
124
125
  .gsub(/ITU-R Rec. (\d+)/, 'ITU-R Recommendation \1')
@@ -290,17 +291,15 @@ module Iev
290
291
  ].map do |regex, _rule|
291
292
  # TODO: Rubocop complains about unused rule -- need to make sure
292
293
  # that no one forgot about something.
293
- res = []
294
294
  # puts "str is '#{str}'"
295
295
  # puts "regex is '#{regex.to_s}'"
296
- str.scan(regex).each do |result|
296
+ str.scan(regex).map do |result|
297
297
  # puts "result is #{result.first}"
298
- res << {
298
+ {
299
299
  index: $LAST_MATCH_INFO.offset(0)[0],
300
300
  clause: result.first.strip,
301
301
  }
302
302
  end
303
- res
304
303
  # sort by index and also the length of match
305
304
  end.flatten.sort_by { |hash| [hash[:index], -hash[:clause].length] }
306
305
 
@@ -15,13 +15,11 @@ module Iev
15
15
  # (added by Exporter)
16
16
  #
17
17
  # Classification (separate from hierarchy):
18
- # - Each concept's ManagedConceptData#domains includes area and
19
- # section ConceptReferences
20
- # - Each concept's ConceptData#domain references its section URI
18
+ # - Each concept's ManagedConceptData#domains includes domain and
19
+ # section ConceptReferences (per ConceptReferenceType)
21
20
  # - Each section concept's ConceptData#domain references parent area
21
+ # title text (a LocalizedString, not a URI)
22
22
  module SubjectAreaConcepts
23
- IEV_SOURCE = "urn:iec:std:iec:60050"
24
-
25
23
  class << self
26
24
  # Build all area and section concepts and add them to the collection.
27
25
  #
@@ -41,14 +39,6 @@ module Iev
41
39
 
42
40
  private
43
41
 
44
- def domain_ref(concept_id)
45
- Glossarist::ConceptReference.new(
46
- concept_id: concept_id,
47
- source: IEV_SOURCE,
48
- ref_type: "domain",
49
- )
50
- end
51
-
52
42
  def build_area_concept(area)
53
43
  id = area.uri
54
44
 
@@ -56,12 +46,16 @@ module Iev
56
46
  data: Glossarist::ManagedConceptData.new(
57
47
  id: id,
58
48
  domains: [domain_ref(id)],
49
+ tags: [area.title],
59
50
  ),
60
51
  )
61
52
  mc.uuid = id
53
+ mc.schema_version = "3"
62
54
 
63
- mc.add_localization(build_localization(id, area.title, "eng"))
64
- mc.related = area.sections.map { |s| build_narrower_relation(s.uri) }
55
+ mc.add_localization(
56
+ build_localization(id, build_concept_data(id, area.title, "eng")),
57
+ )
58
+ mc.related = area.sections.map { |s| narrower_relation(s.uri) }
65
59
  mc.related = nil if mc.related.empty?
66
60
 
67
61
  mc
@@ -75,18 +69,22 @@ module Iev
75
69
  id: id,
76
70
  domains: [
77
71
  domain_ref(area.uri),
78
- domain_ref(id),
72
+ section_ref(id),
79
73
  ],
74
+ tags: [area.title, section.title],
80
75
  ),
81
76
  )
82
77
  mc.uuid = id
78
+ mc.schema_version = "3"
83
79
 
84
80
  cd = build_concept_data(id, section.title, "eng")
85
- cd.domain = area.uri
86
-
87
- mc.add_localization(build_localization_from_data(id, cd))
81
+ # ConceptData#domain is a LocalizedString — use the area title text,
82
+ # not a URI. The structural relationship is expressed via domains[]
83
+ # and related[].
84
+ cd.domain = area.title
88
85
 
89
- mc.related = [build_broader_relation(area.uri)]
86
+ mc.add_localization(build_localization(id, cd))
87
+ mc.related = [broader_relation(area.uri)]
90
88
 
91
89
  mc
92
90
  end
@@ -105,27 +103,32 @@ module Iev
105
103
  )
106
104
  end
107
105
 
108
- def build_localization(id, title, lang_code)
109
- cd = build_concept_data(id, title, lang_code)
110
-
106
+ def build_localization(id, concept_data)
111
107
  l10n = Glossarist::LocalizedConcept.new
112
- l10n.data = cd
108
+ l10n.data = concept_data
113
109
  l10n.id = id
114
110
  l10n.entry_status = "valid"
115
111
  l10n.data.review_decision_event = "published"
116
112
  l10n
117
113
  end
118
114
 
119
- def build_localization_from_data(id, concept_data)
120
- l10n = Glossarist::LocalizedConcept.new
121
- l10n.data = concept_data
122
- l10n.id = id
123
- l10n.entry_status = "valid"
124
- l10n.data.review_decision_event = "published"
125
- l10n
115
+ # --- ConceptReference factory methods ---
116
+
117
+ def domain_ref(concept_id)
118
+ ref = Glossarist::ConceptReference.domain(concept_id)
119
+ ref.source = IEV_SOURCE
120
+ ref
126
121
  end
127
122
 
128
- def build_broader_relation(target_uri)
123
+ def section_ref(concept_id)
124
+ ref = Glossarist::ConceptReference.section(concept_id)
125
+ ref.source = IEV_SOURCE
126
+ ref
127
+ end
128
+
129
+ # --- RelatedConcept factory methods ---
130
+
131
+ def broader_relation(target_uri)
129
132
  Glossarist::RelatedConcept.new(
130
133
  type: "broader",
131
134
  content: target_uri,
@@ -133,7 +136,7 @@ module Iev
133
136
  )
134
137
  end
135
138
 
136
- def build_narrower_relation(target_uri)
139
+ def narrower_relation(target_uri)
137
140
  Glossarist::RelatedConcept.new(
138
141
  type: "narrower",
139
142
  content: target_uri,
@@ -3,7 +3,6 @@
3
3
  require "yaml"
4
4
  require "nokogiri"
5
5
  require "fileutils"
6
- require "iev/config"
7
6
 
8
7
  module Iev
9
8
  module SubjectAreas
@@ -44,7 +43,7 @@ module Iev
44
43
  # Return all subject areas with their sections.
45
44
  # @return [Array<SubjectArea>]
46
45
  def all
47
- @typed_areas ||= raw_data["areas"].map { |h| build_area(h) }
46
+ @all ||= raw_data["areas"].map { |h| build_area(h) }
48
47
  end
49
48
 
50
49
  # Find a single subject area by its numeric code. O(1) indexed.
@@ -105,7 +104,7 @@ module Iev
105
104
  puts "Found #{fresh_areas.length} areas (#{areas.length} cached)" if $stdout.tty?
106
105
 
107
106
  # Merge: keep existing sections, add new areas
108
- existing = areas.each_with_object({}) { |a, h| h[a["code"]] = a }
107
+ existing = areas.to_h { |a| [a["code"], a] }
109
108
  fresh_areas.each do |fa|
110
109
  existing[fa["code"]] ||= fa
111
110
  end
@@ -119,13 +118,13 @@ module Iev
119
118
  area["fetched"] = true
120
119
  rescue FetchError
121
120
  area["sections"] ||= []
122
- warn "IEV: Skipping area #{area["code"]} due to WAF"
121
+ warn "IEV: Skipping area #{area['code']} due to WAF"
123
122
  end
124
123
 
125
- puts "[#{i + 1}/#{areas.length}] #{area["code"]}: #{area["title"]} — #{area["sections"].length} sections" if $stdout.tty?
124
+ puts "[#{i + 1}/#{areas.length}] #{area['code']}: #{area['title']} — #{area['sections'].length} sections" if $stdout.tty?
126
125
 
127
126
  # Save progress every 10 areas so partial results survive WAF failures
128
- if (i + 1) % 10 == 0
127
+ if ((i + 1) % 10).zero?
129
128
  write_cache("subject_areas.yaml", { "areas" => areas })
130
129
  end
131
130
 
@@ -201,7 +200,8 @@ module Iev
201
200
  @raw_data ||= begin
202
201
  path = File.exist?(DATA_FILE) ? DATA_FILE : nil
203
202
  if path
204
- YAML.safe_load(File.read(path, encoding: "utf-8")) || { "areas" => [] }
203
+ YAML.safe_load(File.read(path,
204
+ encoding: "utf-8")) || { "areas" => [] }
205
205
  else
206
206
  { "areas" => [] }
207
207
  end
@@ -209,7 +209,7 @@ module Iev
209
209
  end
210
210
 
211
211
  def area_index
212
- @area_index ||= all.each_with_object({}) { |a, h| h[a.code] = a }
212
+ @area_index ||= all.to_h { |a| [a.code, a] }
213
213
  end
214
214
 
215
215
  def section_index
@@ -233,10 +233,8 @@ module Iev
233
233
  end
234
234
 
235
235
  def fetch_page_with_retry(url, retries: MAX_RETRIES)
236
- require "iev/scraper/browser"
237
-
238
236
  retries.times do |attempt|
239
- html = ScraperBrowser.fetch(url)
237
+ html = Scraper::Browser.fetch(url)
240
238
  raise FetchError, "Failed to fetch #{url}" unless html
241
239
 
242
240
  unless captcha_page?(html)
@@ -68,7 +68,7 @@ module Iev
68
68
  extract_usage_info(curr_str)
69
69
  extract_prefix(curr_str)
70
70
 
71
- return unless /\p{Word}/.match?(curr_str)
71
+ nil unless /\p{Word}/.match?(curr_str)
72
72
 
73
73
  # Term attributes could not be parsed completely
74
74
  end
@@ -7,6 +7,7 @@ module Iev
7
7
  class TermBuilder
8
8
  include Cli::Ui
9
9
  include Utilities
10
+
10
11
  using DataConversions
11
12
 
12
13
  def initialize(data)
@@ -102,18 +103,22 @@ module Iev
102
103
  end
103
104
 
104
105
  # Derives the domain (subject area section) from the IEVREF identifier.
105
- # IEVREF format: "AAA-BB-CC" where AAA = area, AAA-BB = section.
106
- # Returns a URI reference to the section concept (e.g. "section-103-01").
106
+ #
107
+ # Returns the section or area title text as a localized string.
108
+ # Per the concept model, ConceptData#domain is a LocalizedString
109
+ # (the domain name), not a URI. Structural membership is expressed
110
+ # via ManagedConceptData#domains[] with ConceptReference objects.
107
111
  def extract_domain
108
112
  return nil unless term_id
109
113
 
110
- section_code = term_id.split("-")[0..1].join("-")
111
- section = Iev.find_section(section_code)
112
- return SubjectAreas.section_uri(section_code) if section
114
+ code = IevCode.new(term_id)
115
+ section = Iev.find_section(code.section_code) if code.section_code
116
+ return section.title if section
113
117
 
114
- area_code = term_id.split("-")[0]
115
- SubjectAreas.area_uri(area_code)
116
- rescue StandardError
118
+ area = Iev.find_subject_area(code.area_code)
119
+ area&.title
120
+ rescue StandardError => e
121
+ warn "IEV: extract_domain failed for #{term_id}: #{e.message}"
117
122
  nil
118
123
  end
119
124
 
@@ -142,7 +147,7 @@ module Iev
142
147
  Note&nbsp;\d+\sto\sentry: |
143
148
  Note\s*\d+\sto\sthe\sentry: |
144
149
  Note\sto\sentry\s*\d+: |
145
- Note\s*\d+?\sà\sl['']article: |
150
+ Note\s*\d+?\sà\sl'article: |
146
151
  <NOTE/?>?\s*\d?\s+[–-]\s* |
147
152
  NOTE(?:\s+-)?\s* |
148
153
  Note\s+\d+\s[–-]\s* |
data/lib/iev/utilities.rb CHANGED
@@ -5,6 +5,10 @@ module Iev
5
5
  IMAGE_PATH_PREFIX = "image::/assets/images/parts"
6
6
  IEV_CODE_RE = /\A(IEV)?\s*(\d{2,3}-\d{2,3}-\d{2,3})\z/
7
7
 
8
+ # Pattern matching an anchor's inner text that is just an IEV code
9
+ # (not a meaningful term designation).
10
+ IEV_CODE_TEXT_RE = /\A\s*IEV\s*\d{2,3}-\d{2,3}-\d{2,3}\s*\z/
11
+
8
12
  # SIMG/Figure patterns — custom IEV XML, pre-processed before Nokogiri.
9
13
  # Uses [^>] and [^<] instead of . to avoid polynomial backtracking.
10
14
  SIMG_PATH_REGEX = /<simg [^>]*\/\$file\/([\d\-\w.]+)>/
@@ -134,12 +138,36 @@ module Iev
134
138
 
135
139
  if href.match?(IEV_CODE_RE)
136
140
  iev_code = href.sub(/\AIEV\s*/, "")
137
- "{{#{inner}, urn:iec:std:iec:60050-#{iev_code}}}"
141
+ display = render_term_for(inner, iev_code)
142
+ "{{urn:iec:std:iec:60050-#{iev_code}, #{display}}}"
138
143
  elsif !href.empty?
139
144
  "#{href}[#{inner}]"
140
145
  else
141
146
  inner
142
147
  end
143
148
  end
149
+
150
+ # Resolve the display (render) text for an IEV cross-reference.
151
+ #
152
+ # When the anchor text is already a meaningful term (e.g. "adjective"),
153
+ # use it directly. When it's just a bare IEV code (e.g. "IEV 102-01-10"),
154
+ # try to look up the actual term designation via DataSource.
155
+ #
156
+ # @param inner_text [String] the anchor element's inner text
157
+ # @param iev_code [String] the extracted numeric IEV code (e.g. "102-01-10")
158
+ # @return [String] the term designation to use as render text
159
+ def render_term_for(inner_text, iev_code)
160
+ stripped = inner_text.strip
161
+ return stripped unless iev_code_only?(stripped)
162
+
163
+ Iev.get(iev_code, "en") || stripped
164
+ rescue StandardError
165
+ stripped
166
+ end
167
+
168
+ # True when the anchor text is just a raw IEV code, not a term designation.
169
+ def iev_code_only?(text)
170
+ text.match?(IEV_CODE_TEXT_RE)
171
+ end
144
172
  end
145
173
  end
data/lib/iev/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iev
4
- VERSION = "0.4.5"
4
+ VERSION = "0.4.6"
5
5
  end
data/lib/iev.rb CHANGED
@@ -1,14 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "iev/version"
4
- require "iev/config"
5
- require "iev/data_source"
6
-
7
- require "yaml"
8
4
 
9
5
  # plurimath and unitsml both depend on mml, which has a transitive
10
6
  # dependency version mismatch with lutaml-model in some environments.
11
- # Load them when available; the DataSource APIs work without them.
7
+ # Load them when available; the DataSource/Db APIs work without them.
12
8
  begin
13
9
  require "plurimath"
14
10
  rescue LoadError
@@ -22,6 +18,9 @@ rescue LoadError
22
18
  end
23
19
 
24
20
  module Iev
21
+ # IEV dataset URN — single source of truth for all concept references.
22
+ IEV_SOURCE = "urn:iec:std:iec:60050"
23
+
25
24
  autoload :Cli, "iev/cli"
26
25
  autoload :Config, "iev/config"
27
26
  autoload :Converter, "iev/converter"
@@ -44,6 +43,26 @@ module Iev
44
43
  autoload :TermBuilder, "iev/term_builder"
45
44
  autoload :Utilities, "iev/utilities"
46
45
 
46
+ # --- Configuration ---
47
+
48
+ # @return [Config]
49
+ def self.config
50
+ @config ||= Config.new
51
+ end
52
+
53
+ # Yield the config object for initialization.
54
+ # @yield [Config]
55
+ def self.configure
56
+ yield(config) if block_given?
57
+ end
58
+
59
+ # Reset config (useful in tests).
60
+ def self.reset_config!
61
+ @config = nil
62
+ end
63
+
64
+ # --- Data fetching ---
65
+
47
66
  # Fetch term designation from IEV data.
48
67
  #
49
68
  # @param [String] code for example "103-01-02"
@@ -51,7 +70,6 @@ module Iev
51
70
  #
52
71
  # @return [String, nil] if found then term,
53
72
  # if code or language not found then nil.
54
- #
55
73
  def self.get(code, lang)
56
74
  DataSource.fetch_term_designation(code, lang)
57
75
  rescue DataSource::NotFoundError
@@ -77,6 +95,8 @@ module Iev
77
95
  DataSource.fetch_term(code, lang)
78
96
  end
79
97
 
98
+ # --- Scraping ---
99
+
80
100
  # Scrape concept data from Electropedia for a given IEV code.
81
101
  # Uses Ferrum (headless Chrome) to handle AWS WAF challenge.
82
102
  #
@@ -86,6 +106,8 @@ module Iev
86
106
  Scraper.new.fetch_concept(code)
87
107
  end
88
108
 
109
+ # --- Subject area / section queries ---
110
+
89
111
  # Return all IEV subject areas with their sections (from bundled data).
90
112
  # @return [Array<SubjectArea>]
91
113
  def self.subject_areas