iev 0.4.5 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +98 -21
- data/CLAUDE.md +17 -5
- data/Gemfile +8 -0
- data/README.adoc +10 -3
- data/iev.gemspec +3 -2
- data/lib/iev/cli/command.rb +3 -2
- data/lib/iev/cli/ui.rb +5 -5
- data/lib/iev/config.rb +1 -15
- data/lib/iev/data_source.rb +4 -2
- data/lib/iev/db_writer.rb +1 -0
- data/lib/iev/exporter.rb +106 -21
- data/lib/iev/iso_639_code.rb +2 -1
- data/lib/iev/relaton_db.rb +1 -1
- data/lib/iev/scraper/browser.rb +90 -88
- data/lib/iev/scraper.rb +5 -4
- data/lib/iev/source_parser.rb +9 -10
- data/lib/iev/subject_area_concepts.rb +36 -33
- data/lib/iev/subject_areas.rb +9 -11
- data/lib/iev/term_attrs_parser.rb +1 -1
- data/lib/iev/term_builder.rb +14 -9
- data/lib/iev/utilities.rb +29 -1
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +28 -6
- metadata +22 -27
data/lib/iev/scraper/browser.rb
CHANGED
|
@@ -3,100 +3,102 @@
|
|
|
3
3
|
require "ferrum"
|
|
4
4
|
|
|
5
5
|
module Iev
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
].freeze
|
|
45
|
-
|
|
46
|
-
# Fetch a URL using headless Chrome, returning the page HTML.
|
|
47
|
-
# Handles AWS WAF challenge pages by waiting for JS execution.
|
|
48
|
-
def self.fetch(url, browser_opts: {})
|
|
49
|
-
browser = Ferrum::Browser.new(
|
|
50
|
-
headless: "new",
|
|
51
|
-
timeout: 30,
|
|
52
|
-
window_size: [1366, 768],
|
|
53
|
-
browser_options: {
|
|
54
|
-
"disable-blink-features" => "AutomationControlled",
|
|
6
|
+
class Scraper
|
|
7
|
+
# Shared headless browser utilities for fetching pages behind AWS WAF.
|
|
8
|
+
module Browser
|
|
9
|
+
USER_AGENT_PROFILES = [
|
|
10
|
+
{
|
|
11
|
+
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
|
|
12
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
13
|
+
"Chrome/131.0.0.0 Safari/537.36",
|
|
14
|
+
platform: '"macOS"',
|
|
15
|
+
chrome_version: "131",
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
|
|
19
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
20
|
+
"Chrome/130.0.0.0 Safari/537.36",
|
|
21
|
+
platform: '"Windows"',
|
|
22
|
+
chrome_version: "130",
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
|
|
26
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
27
|
+
"Chrome/131.0.0.0 Safari/537.36",
|
|
28
|
+
platform: '"Linux"',
|
|
29
|
+
chrome_version: "131",
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
|
|
33
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
34
|
+
"Chrome/129.0.0.0 Safari/537.36",
|
|
35
|
+
platform: '"macOS"',
|
|
36
|
+
chrome_version: "129",
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
|
|
40
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
41
|
+
"Chrome/131.0.0.0 Safari/537.36",
|
|
42
|
+
platform: '"Windows"',
|
|
43
|
+
chrome_version: "131",
|
|
55
44
|
},
|
|
56
|
-
|
|
57
|
-
)
|
|
45
|
+
].freeze
|
|
58
46
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
47
|
+
# Fetch a URL using headless Chrome, returning the page HTML.
|
|
48
|
+
# Handles AWS WAF challenge pages by waiting for JS execution.
|
|
49
|
+
def self.fetch(url, browser_opts: {})
|
|
50
|
+
browser = Ferrum::Browser.new(
|
|
51
|
+
headless: "new",
|
|
52
|
+
timeout: 30,
|
|
53
|
+
window_size: [1366, 768],
|
|
54
|
+
browser_options: {
|
|
55
|
+
"disable-blink-features" => "AutomationControlled",
|
|
56
|
+
},
|
|
57
|
+
**browser_opts,
|
|
58
|
+
)
|
|
63
59
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
60
|
+
browser.headers.set(random_headers)
|
|
61
|
+
browser.go_to(url)
|
|
62
|
+
browser.network.wait_for_idle(timeout: 15)
|
|
63
|
+
html = browser.body
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
ensure
|
|
74
|
-
browser&.quit
|
|
75
|
-
end
|
|
65
|
+
if html.include?("403 ERROR") || html.include?("Request blocked")
|
|
66
|
+
warn "IEV: AWS WAF blocked request for #{url}"
|
|
67
|
+
return nil
|
|
68
|
+
end
|
|
76
69
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
70
|
+
html
|
|
71
|
+
rescue Ferrum::Error, Ferrum::BrowserError => e
|
|
72
|
+
warn "IEV: Browser error fetching #{url}: #{e.message}"
|
|
73
|
+
nil
|
|
74
|
+
ensure
|
|
75
|
+
browser&.quit
|
|
76
|
+
end
|
|
82
77
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
"
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
78
|
+
def self.random_headers
|
|
79
|
+
profile = USER_AGENT_PROFILES.sample
|
|
80
|
+
sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
|
|
81
|
+
"\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
|
|
82
|
+
"\"Not_A Brand\";v=\"24\""
|
|
83
|
+
|
|
84
|
+
{
|
|
85
|
+
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
|
|
86
|
+
"image/avif,image/webp,image/apng,*/*;q=0.8," \
|
|
87
|
+
"application/signed-exchange;v=b3;q=0.7",
|
|
88
|
+
"Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
|
|
89
|
+
"Cache-Control" => "no-cache",
|
|
90
|
+
"Pragma" => "no-cache",
|
|
91
|
+
"Sec-Ch-Ua" => sec_ch_ua,
|
|
92
|
+
"Sec-Ch-Ua-Mobile" => "?0",
|
|
93
|
+
"Sec-Ch-Ua-Platform" => profile[:platform],
|
|
94
|
+
"Sec-Fetch-Dest" => "document",
|
|
95
|
+
"Sec-Fetch-Mode" => "navigate",
|
|
96
|
+
"Sec-Fetch-Site" => "cross-site",
|
|
97
|
+
"Sec-Fetch-User" => "?1",
|
|
98
|
+
"Upgrade-Insecure-Requests" => "1",
|
|
99
|
+
"User-Agent" => profile[:user_agent],
|
|
100
|
+
}
|
|
101
|
+
end
|
|
100
102
|
end
|
|
101
103
|
end
|
|
102
104
|
end
|
data/lib/iev/scraper.rb
CHANGED
|
@@ -4,6 +4,9 @@ require "nokogiri"
|
|
|
4
4
|
|
|
5
5
|
module Iev
|
|
6
6
|
class Scraper
|
|
7
|
+
autoload :Browser, "iev/scraper/browser"
|
|
8
|
+
autoload :PageParser, "iev/scraper/page_parser"
|
|
9
|
+
|
|
7
10
|
BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
|
|
8
11
|
"display?openform&ievref="
|
|
9
12
|
|
|
@@ -14,7 +17,8 @@ module Iev
|
|
|
14
17
|
# Fetch the Electropedia page HTML for a given IEV code.
|
|
15
18
|
# Returns a Nokogiri document.
|
|
16
19
|
def fetch_page(code)
|
|
17
|
-
html =
|
|
20
|
+
html = Browser.fetch("#{BASE_URL}#{code}",
|
|
21
|
+
browser_opts: @browser_opts)
|
|
18
22
|
return nil unless html
|
|
19
23
|
|
|
20
24
|
Nokogiri::HTML(html)
|
|
@@ -30,6 +34,3 @@ module Iev
|
|
|
30
34
|
end
|
|
31
35
|
end
|
|
32
36
|
end
|
|
33
|
-
|
|
34
|
-
require_relative "scraper/browser"
|
|
35
|
-
require_relative "scraper/page_parser"
|
data/lib/iev/source_parser.rb
CHANGED
|
@@ -12,6 +12,7 @@ module Iev
|
|
|
12
12
|
# SourceParser.new(cell_data_string).parsed_sources
|
|
13
13
|
class SourceParser
|
|
14
14
|
include Utilities
|
|
15
|
+
|
|
15
16
|
using DataConversions
|
|
16
17
|
|
|
17
18
|
# When false, obtain_source_link skips Relaton network calls.
|
|
@@ -112,13 +113,13 @@ module Iev
|
|
|
112
113
|
# IEC 62313:2009, 3.6, modifié
|
|
113
114
|
|
|
114
115
|
str
|
|
115
|
-
.gsub(
|
|
116
|
-
.gsub(
|
|
116
|
+
.gsub("CEI", "IEC")
|
|
117
|
+
.gsub("Guide IEC", "IEC Guide")
|
|
117
118
|
.gsub(%r{Guide ISO/IEC}, "ISO/IEC Guide")
|
|
118
|
-
.gsub(
|
|
119
|
-
.gsub(
|
|
120
|
-
.gsub(
|
|
121
|
-
.gsub(
|
|
119
|
+
.gsub("VEI", "IEV")
|
|
120
|
+
.gsub("UIT", "ITU")
|
|
121
|
+
.gsub("IUT-R", "ITU-R")
|
|
122
|
+
.gsub("UTI-R", "ITU-R")
|
|
122
123
|
.gsub(/Recomm[ea]ndation ITU-T/, "ITU-T Recommendation")
|
|
123
124
|
.gsub(/ITU-T (\w.\d{3}):(\d{4})/, 'ITU-T Recommendation \1 (\2)')
|
|
124
125
|
.gsub(/ITU-R Rec. (\d+)/, 'ITU-R Recommendation \1')
|
|
@@ -290,17 +291,15 @@ module Iev
|
|
|
290
291
|
].map do |regex, _rule|
|
|
291
292
|
# TODO: Rubocop complains about unused rule -- need to make sure
|
|
292
293
|
# that no one forgot about something.
|
|
293
|
-
res = []
|
|
294
294
|
# puts "str is '#{str}'"
|
|
295
295
|
# puts "regex is '#{regex.to_s}'"
|
|
296
|
-
str.scan(regex).
|
|
296
|
+
str.scan(regex).map do |result|
|
|
297
297
|
# puts "result is #{result.first}"
|
|
298
|
-
|
|
298
|
+
{
|
|
299
299
|
index: $LAST_MATCH_INFO.offset(0)[0],
|
|
300
300
|
clause: result.first.strip,
|
|
301
301
|
}
|
|
302
302
|
end
|
|
303
|
-
res
|
|
304
303
|
# sort by index and also the length of match
|
|
305
304
|
end.flatten.sort_by { |hash| [hash[:index], -hash[:clause].length] }
|
|
306
305
|
|
|
@@ -15,13 +15,11 @@ module Iev
|
|
|
15
15
|
# (added by Exporter)
|
|
16
16
|
#
|
|
17
17
|
# Classification (separate from hierarchy):
|
|
18
|
-
# - Each concept's ManagedConceptData#domains includes
|
|
19
|
-
# section ConceptReferences
|
|
20
|
-
# - Each concept's ConceptData#domain references its section URI
|
|
18
|
+
# - Each concept's ManagedConceptData#domains includes domain and
|
|
19
|
+
# section ConceptReferences (per ConceptReferenceType)
|
|
21
20
|
# - Each section concept's ConceptData#domain references parent area
|
|
21
|
+
# title text (a LocalizedString, not a URI)
|
|
22
22
|
module SubjectAreaConcepts
|
|
23
|
-
IEV_SOURCE = "urn:iec:std:iec:60050"
|
|
24
|
-
|
|
25
23
|
class << self
|
|
26
24
|
# Build all area and section concepts and add them to the collection.
|
|
27
25
|
#
|
|
@@ -41,14 +39,6 @@ module Iev
|
|
|
41
39
|
|
|
42
40
|
private
|
|
43
41
|
|
|
44
|
-
def domain_ref(concept_id)
|
|
45
|
-
Glossarist::ConceptReference.new(
|
|
46
|
-
concept_id: concept_id,
|
|
47
|
-
source: IEV_SOURCE,
|
|
48
|
-
ref_type: "domain",
|
|
49
|
-
)
|
|
50
|
-
end
|
|
51
|
-
|
|
52
42
|
def build_area_concept(area)
|
|
53
43
|
id = area.uri
|
|
54
44
|
|
|
@@ -56,12 +46,16 @@ module Iev
|
|
|
56
46
|
data: Glossarist::ManagedConceptData.new(
|
|
57
47
|
id: id,
|
|
58
48
|
domains: [domain_ref(id)],
|
|
49
|
+
tags: [area.title],
|
|
59
50
|
),
|
|
60
51
|
)
|
|
61
52
|
mc.uuid = id
|
|
53
|
+
mc.schema_version = "3"
|
|
62
54
|
|
|
63
|
-
mc.add_localization(
|
|
64
|
-
|
|
55
|
+
mc.add_localization(
|
|
56
|
+
build_localization(id, build_concept_data(id, area.title, "eng")),
|
|
57
|
+
)
|
|
58
|
+
mc.related = area.sections.map { |s| narrower_relation(s.uri) }
|
|
65
59
|
mc.related = nil if mc.related.empty?
|
|
66
60
|
|
|
67
61
|
mc
|
|
@@ -75,18 +69,22 @@ module Iev
|
|
|
75
69
|
id: id,
|
|
76
70
|
domains: [
|
|
77
71
|
domain_ref(area.uri),
|
|
78
|
-
|
|
72
|
+
section_ref(id),
|
|
79
73
|
],
|
|
74
|
+
tags: [area.title, section.title],
|
|
80
75
|
),
|
|
81
76
|
)
|
|
82
77
|
mc.uuid = id
|
|
78
|
+
mc.schema_version = "3"
|
|
83
79
|
|
|
84
80
|
cd = build_concept_data(id, section.title, "eng")
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
81
|
+
# ConceptData#domain is a LocalizedString — use the area title text,
|
|
82
|
+
# not a URI. The structural relationship is expressed via domains[]
|
|
83
|
+
# and related[].
|
|
84
|
+
cd.domain = area.title
|
|
88
85
|
|
|
89
|
-
mc.
|
|
86
|
+
mc.add_localization(build_localization(id, cd))
|
|
87
|
+
mc.related = [broader_relation(area.uri)]
|
|
90
88
|
|
|
91
89
|
mc
|
|
92
90
|
end
|
|
@@ -105,27 +103,32 @@ module Iev
|
|
|
105
103
|
)
|
|
106
104
|
end
|
|
107
105
|
|
|
108
|
-
def build_localization(id,
|
|
109
|
-
cd = build_concept_data(id, title, lang_code)
|
|
110
|
-
|
|
106
|
+
def build_localization(id, concept_data)
|
|
111
107
|
l10n = Glossarist::LocalizedConcept.new
|
|
112
|
-
l10n.data =
|
|
108
|
+
l10n.data = concept_data
|
|
113
109
|
l10n.id = id
|
|
114
110
|
l10n.entry_status = "valid"
|
|
115
111
|
l10n.data.review_decision_event = "published"
|
|
116
112
|
l10n
|
|
117
113
|
end
|
|
118
114
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
l10n
|
|
115
|
+
# --- ConceptReference factory methods ---
|
|
116
|
+
|
|
117
|
+
def domain_ref(concept_id)
|
|
118
|
+
ref = Glossarist::ConceptReference.domain(concept_id)
|
|
119
|
+
ref.source = IEV_SOURCE
|
|
120
|
+
ref
|
|
126
121
|
end
|
|
127
122
|
|
|
128
|
-
def
|
|
123
|
+
def section_ref(concept_id)
|
|
124
|
+
ref = Glossarist::ConceptReference.section(concept_id)
|
|
125
|
+
ref.source = IEV_SOURCE
|
|
126
|
+
ref
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# --- RelatedConcept factory methods ---
|
|
130
|
+
|
|
131
|
+
def broader_relation(target_uri)
|
|
129
132
|
Glossarist::RelatedConcept.new(
|
|
130
133
|
type: "broader",
|
|
131
134
|
content: target_uri,
|
|
@@ -133,7 +136,7 @@ module Iev
|
|
|
133
136
|
)
|
|
134
137
|
end
|
|
135
138
|
|
|
136
|
-
def
|
|
139
|
+
def narrower_relation(target_uri)
|
|
137
140
|
Glossarist::RelatedConcept.new(
|
|
138
141
|
type: "narrower",
|
|
139
142
|
content: target_uri,
|
data/lib/iev/subject_areas.rb
CHANGED
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
require "yaml"
|
|
4
4
|
require "nokogiri"
|
|
5
5
|
require "fileutils"
|
|
6
|
-
require "iev/config"
|
|
7
6
|
|
|
8
7
|
module Iev
|
|
9
8
|
module SubjectAreas
|
|
@@ -44,7 +43,7 @@ module Iev
|
|
|
44
43
|
# Return all subject areas with their sections.
|
|
45
44
|
# @return [Array<SubjectArea>]
|
|
46
45
|
def all
|
|
47
|
-
@
|
|
46
|
+
@all ||= raw_data["areas"].map { |h| build_area(h) }
|
|
48
47
|
end
|
|
49
48
|
|
|
50
49
|
# Find a single subject area by its numeric code. O(1) indexed.
|
|
@@ -105,7 +104,7 @@ module Iev
|
|
|
105
104
|
puts "Found #{fresh_areas.length} areas (#{areas.length} cached)" if $stdout.tty?
|
|
106
105
|
|
|
107
106
|
# Merge: keep existing sections, add new areas
|
|
108
|
-
existing = areas.
|
|
107
|
+
existing = areas.to_h { |a| [a["code"], a] }
|
|
109
108
|
fresh_areas.each do |fa|
|
|
110
109
|
existing[fa["code"]] ||= fa
|
|
111
110
|
end
|
|
@@ -119,13 +118,13 @@ module Iev
|
|
|
119
118
|
area["fetched"] = true
|
|
120
119
|
rescue FetchError
|
|
121
120
|
area["sections"] ||= []
|
|
122
|
-
warn "IEV: Skipping area #{area[
|
|
121
|
+
warn "IEV: Skipping area #{area['code']} due to WAF"
|
|
123
122
|
end
|
|
124
123
|
|
|
125
|
-
puts "[#{i + 1}/#{areas.length}] #{area[
|
|
124
|
+
puts "[#{i + 1}/#{areas.length}] #{area['code']}: #{area['title']} — #{area['sections'].length} sections" if $stdout.tty?
|
|
126
125
|
|
|
127
126
|
# Save progress every 10 areas so partial results survive WAF failures
|
|
128
|
-
if (i + 1) % 10
|
|
127
|
+
if ((i + 1) % 10).zero?
|
|
129
128
|
write_cache("subject_areas.yaml", { "areas" => areas })
|
|
130
129
|
end
|
|
131
130
|
|
|
@@ -201,7 +200,8 @@ module Iev
|
|
|
201
200
|
@raw_data ||= begin
|
|
202
201
|
path = File.exist?(DATA_FILE) ? DATA_FILE : nil
|
|
203
202
|
if path
|
|
204
|
-
YAML.safe_load(File.read(path,
|
|
203
|
+
YAML.safe_load(File.read(path,
|
|
204
|
+
encoding: "utf-8")) || { "areas" => [] }
|
|
205
205
|
else
|
|
206
206
|
{ "areas" => [] }
|
|
207
207
|
end
|
|
@@ -209,7 +209,7 @@ module Iev
|
|
|
209
209
|
end
|
|
210
210
|
|
|
211
211
|
def area_index
|
|
212
|
-
@area_index ||= all.
|
|
212
|
+
@area_index ||= all.to_h { |a| [a.code, a] }
|
|
213
213
|
end
|
|
214
214
|
|
|
215
215
|
def section_index
|
|
@@ -233,10 +233,8 @@ module Iev
|
|
|
233
233
|
end
|
|
234
234
|
|
|
235
235
|
def fetch_page_with_retry(url, retries: MAX_RETRIES)
|
|
236
|
-
require "iev/scraper/browser"
|
|
237
|
-
|
|
238
236
|
retries.times do |attempt|
|
|
239
|
-
html =
|
|
237
|
+
html = Scraper::Browser.fetch(url)
|
|
240
238
|
raise FetchError, "Failed to fetch #{url}" unless html
|
|
241
239
|
|
|
242
240
|
unless captcha_page?(html)
|
data/lib/iev/term_builder.rb
CHANGED
|
@@ -7,6 +7,7 @@ module Iev
|
|
|
7
7
|
class TermBuilder
|
|
8
8
|
include Cli::Ui
|
|
9
9
|
include Utilities
|
|
10
|
+
|
|
10
11
|
using DataConversions
|
|
11
12
|
|
|
12
13
|
def initialize(data)
|
|
@@ -102,18 +103,22 @@ module Iev
|
|
|
102
103
|
end
|
|
103
104
|
|
|
104
105
|
# Derives the domain (subject area section) from the IEVREF identifier.
|
|
105
|
-
#
|
|
106
|
-
# Returns
|
|
106
|
+
#
|
|
107
|
+
# Returns the section or area title text as a localized string.
|
|
108
|
+
# Per the concept model, ConceptData#domain is a LocalizedString
|
|
109
|
+
# (the domain name), not a URI. Structural membership is expressed
|
|
110
|
+
# via ManagedConceptData#domains[] with ConceptReference objects.
|
|
107
111
|
def extract_domain
|
|
108
112
|
return nil unless term_id
|
|
109
113
|
|
|
110
|
-
|
|
111
|
-
section = Iev.find_section(section_code)
|
|
112
|
-
return
|
|
114
|
+
code = IevCode.new(term_id)
|
|
115
|
+
section = Iev.find_section(code.section_code) if code.section_code
|
|
116
|
+
return section.title if section
|
|
113
117
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
rescue StandardError
|
|
118
|
+
area = Iev.find_subject_area(code.area_code)
|
|
119
|
+
area&.title
|
|
120
|
+
rescue StandardError => e
|
|
121
|
+
warn "IEV: extract_domain failed for #{term_id}: #{e.message}"
|
|
117
122
|
nil
|
|
118
123
|
end
|
|
119
124
|
|
|
@@ -142,7 +147,7 @@ module Iev
|
|
|
142
147
|
Note \d+\sto\sentry: |
|
|
143
148
|
Note\s*\d+\sto\sthe\sentry: |
|
|
144
149
|
Note\sto\sentry\s*\d+: |
|
|
145
|
-
Note\s*\d+?\sà\sl
|
|
150
|
+
Note\s*\d+?\sà\sl'article: |
|
|
146
151
|
<NOTE/?>?\s*\d?\s+[–-]\s* |
|
|
147
152
|
NOTE(?:\s+-)?\s* |
|
|
148
153
|
Note\s+\d+\s[–-]\s* |
|
data/lib/iev/utilities.rb
CHANGED
|
@@ -5,6 +5,10 @@ module Iev
|
|
|
5
5
|
IMAGE_PATH_PREFIX = "image::/assets/images/parts"
|
|
6
6
|
IEV_CODE_RE = /\A(IEV)?\s*(\d{2,3}-\d{2,3}-\d{2,3})\z/
|
|
7
7
|
|
|
8
|
+
# Pattern matching an anchor's inner text that is just an IEV code
|
|
9
|
+
# (not a meaningful term designation).
|
|
10
|
+
IEV_CODE_TEXT_RE = /\A\s*IEV\s*\d{2,3}-\d{2,3}-\d{2,3}\s*\z/
|
|
11
|
+
|
|
8
12
|
# SIMG/Figure patterns — custom IEV XML, pre-processed before Nokogiri.
|
|
9
13
|
# Uses [^>] and [^<] instead of . to avoid polynomial backtracking.
|
|
10
14
|
SIMG_PATH_REGEX = /<simg [^>]*\/\$file\/([\d\-\w.]+)>/
|
|
@@ -134,12 +138,36 @@ module Iev
|
|
|
134
138
|
|
|
135
139
|
if href.match?(IEV_CODE_RE)
|
|
136
140
|
iev_code = href.sub(/\AIEV\s*/, "")
|
|
137
|
-
|
|
141
|
+
display = render_term_for(inner, iev_code)
|
|
142
|
+
"{{urn:iec:std:iec:60050-#{iev_code}, #{display}}}"
|
|
138
143
|
elsif !href.empty?
|
|
139
144
|
"#{href}[#{inner}]"
|
|
140
145
|
else
|
|
141
146
|
inner
|
|
142
147
|
end
|
|
143
148
|
end
|
|
149
|
+
|
|
150
|
+
# Resolve the display (render) text for an IEV cross-reference.
|
|
151
|
+
#
|
|
152
|
+
# When the anchor text is already a meaningful term (e.g. "adjective"),
|
|
153
|
+
# use it directly. When it's just a bare IEV code (e.g. "IEV 102-01-10"),
|
|
154
|
+
# try to look up the actual term designation via DataSource.
|
|
155
|
+
#
|
|
156
|
+
# @param inner_text [String] the anchor element's inner text
|
|
157
|
+
# @param iev_code [String] the extracted numeric IEV code (e.g. "102-01-10")
|
|
158
|
+
# @return [String] the term designation to use as render text
|
|
159
|
+
def render_term_for(inner_text, iev_code)
|
|
160
|
+
stripped = inner_text.strip
|
|
161
|
+
return stripped unless iev_code_only?(stripped)
|
|
162
|
+
|
|
163
|
+
Iev.get(iev_code, "en") || stripped
|
|
164
|
+
rescue StandardError
|
|
165
|
+
stripped
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# True when the anchor text is just a raw IEV code, not a term designation.
|
|
169
|
+
def iev_code_only?(text)
|
|
170
|
+
text.match?(IEV_CODE_TEXT_RE)
|
|
171
|
+
end
|
|
144
172
|
end
|
|
145
173
|
end
|
data/lib/iev/version.rb
CHANGED
data/lib/iev.rb
CHANGED
|
@@ -1,14 +1,10 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "iev/version"
|
|
4
|
-
require "iev/config"
|
|
5
|
-
require "iev/data_source"
|
|
6
|
-
|
|
7
|
-
require "yaml"
|
|
8
4
|
|
|
9
5
|
# plurimath and unitsml both depend on mml, which has a transitive
|
|
10
6
|
# dependency version mismatch with lutaml-model in some environments.
|
|
11
|
-
# Load them when available; the DataSource APIs work without them.
|
|
7
|
+
# Load them when available; the DataSource/Db APIs work without them.
|
|
12
8
|
begin
|
|
13
9
|
require "plurimath"
|
|
14
10
|
rescue LoadError
|
|
@@ -22,6 +18,9 @@ rescue LoadError
|
|
|
22
18
|
end
|
|
23
19
|
|
|
24
20
|
module Iev
|
|
21
|
+
# IEV dataset URN — single source of truth for all concept references.
|
|
22
|
+
IEV_SOURCE = "urn:iec:std:iec:60050"
|
|
23
|
+
|
|
25
24
|
autoload :Cli, "iev/cli"
|
|
26
25
|
autoload :Config, "iev/config"
|
|
27
26
|
autoload :Converter, "iev/converter"
|
|
@@ -44,6 +43,26 @@ module Iev
|
|
|
44
43
|
autoload :TermBuilder, "iev/term_builder"
|
|
45
44
|
autoload :Utilities, "iev/utilities"
|
|
46
45
|
|
|
46
|
+
# --- Configuration ---
|
|
47
|
+
|
|
48
|
+
# @return [Config]
|
|
49
|
+
def self.config
|
|
50
|
+
@config ||= Config.new
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Yield the config object for initialization.
|
|
54
|
+
# @yield [Config]
|
|
55
|
+
def self.configure
|
|
56
|
+
yield(config) if block_given?
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Reset config (useful in tests).
|
|
60
|
+
def self.reset_config!
|
|
61
|
+
@config = nil
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# --- Data fetching ---
|
|
65
|
+
|
|
47
66
|
# Fetch term designation from IEV data.
|
|
48
67
|
#
|
|
49
68
|
# @param [String] code for example "103-01-02"
|
|
@@ -51,7 +70,6 @@ module Iev
|
|
|
51
70
|
#
|
|
52
71
|
# @return [String, nil] if found then term,
|
|
53
72
|
# if code or language not found then nil.
|
|
54
|
-
#
|
|
55
73
|
def self.get(code, lang)
|
|
56
74
|
DataSource.fetch_term_designation(code, lang)
|
|
57
75
|
rescue DataSource::NotFoundError
|
|
@@ -77,6 +95,8 @@ module Iev
|
|
|
77
95
|
DataSource.fetch_term(code, lang)
|
|
78
96
|
end
|
|
79
97
|
|
|
98
|
+
# --- Scraping ---
|
|
99
|
+
|
|
80
100
|
# Scrape concept data from Electropedia for a given IEV code.
|
|
81
101
|
# Uses Ferrum (headless Chrome) to handle AWS WAF challenge.
|
|
82
102
|
#
|
|
@@ -86,6 +106,8 @@ module Iev
|
|
|
86
106
|
Scraper.new.fetch_concept(code)
|
|
87
107
|
end
|
|
88
108
|
|
|
109
|
+
# --- Subject area / section queries ---
|
|
110
|
+
|
|
89
111
|
# Return all IEV subject areas with their sections (from bundled data).
|
|
90
112
|
# @return [Array<SubjectArea>]
|
|
91
113
|
def self.subject_areas
|