iev 0.4.3 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +7 -4
- data/.github/workflows/release.yml +2 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +4 -2
- data/CLAUDE.md +3 -0
- data/Gemfile +0 -19
- data/README.adoc +412 -0
- data/data/subject_areas.yaml +1920 -0
- data/exe/iev +1 -1
- data/iev.gemspec +1 -1
- data/lib/iev/cli/command.rb +24 -0
- data/lib/iev/cli/command_helper.rb +1 -2
- data/lib/iev/exporter.rb +108 -2
- data/lib/iev/iev_code.rb +80 -0
- data/lib/iev/iso_639_code.rb +1 -1
- data/lib/iev/scraper/browser.rb +102 -0
- data/lib/iev/scraper.rb +5 -105
- data/lib/iev/section.rb +37 -0
- data/lib/iev/source_parser.rb +48 -1
- data/lib/iev/subject_area.rb +46 -0
- data/lib/iev/subject_area_concepts.rb +145 -0
- data/lib/iev/subject_areas.rb +273 -0
- data/lib/iev/supersession_parser.rb +1 -2
- data/lib/iev/term_builder.rb +19 -0
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +46 -0
- metadata +17 -4
data/exe/iev
CHANGED
data/iev.gemspec
CHANGED
|
@@ -22,7 +22,7 @@ Gem::Specification.new do |spec|
|
|
|
22
22
|
spec.required_ruby_version = Gem::Requirement.new(">= 3.2.0")
|
|
23
23
|
|
|
24
24
|
spec.add_dependency "creek", "~> 2.6"
|
|
25
|
-
spec.add_dependency "glossarist", ">= 2.
|
|
25
|
+
spec.add_dependency "glossarist", "~> 2.6", ">= 2.6.7"
|
|
26
26
|
spec.add_dependency "ferrum", "~> 0.15"
|
|
27
27
|
spec.add_dependency "nokogiri", "~> 1.19"
|
|
28
28
|
spec.add_dependency "plurimath"
|
data/lib/iev/cli/command.rb
CHANGED
|
@@ -142,6 +142,30 @@ module Iev
|
|
|
142
142
|
summary
|
|
143
143
|
end
|
|
144
144
|
|
|
145
|
+
# `subject_areas` command: dumps the result of Iev::SubjectAreas.fetch as YAML.
#
# Options:
#   --output / -o  path of the YAML file to write; when absent the YAML is
#                  printed to stdout.
#   --refresh      removes "subject_areas.yaml" from Iev.config.cache_dir
#                  before fetching.
#
# An Iev::SubjectAreas::FetchError is passed to `error` and the process
# exits with status 1.
desc "subject_areas", "Fetch IEV subject areas and sections from Electropedia."
option :output, desc: "Output YAML file (default: stdout)", aliases: :o
option :refresh, type: :boolean, default: false,
                 desc: "Force re-fetch even if cached"
def subject_areas
  if options[:refresh]
    # rm_f silently ignores a missing file, so no File.exist? pre-check is
    # needed; dropping it also removes a check-then-delete race.
    FileUtils.rm_f(File.join(Iev.config.cache_dir, "subject_areas.yaml"))
  end

  result = Iev::SubjectAreas.fetch

  yaml = YAML.dump(result)
  if options[:output]
    File.write(options[:output], yaml, encoding: "utf-8")
    puts "Written to #{options[:output]}"
  else
    puts yaml
  end
rescue Iev::SubjectAreas::FetchError => e
  error e.message
  exit 1
end
|
|
168
|
+
|
|
145
169
|
desc "fetch CODE", "Fetch an IEV concept and output YAML to stdout."
|
|
146
170
|
option :scrape, type: :boolean, default: false,
|
|
147
171
|
desc: "Scrape from Electropedia instead of using cached data"
|
|
@@ -111,8 +111,7 @@ module Iev
|
|
|
111
111
|
|
|
112
112
|
definition = entry["definition"]
|
|
113
113
|
if definition
|
|
114
|
-
|
|
115
|
-
cd.definition = [Glossarist::DetailedDefinition.new(content: content)]
|
|
114
|
+
cd.definition = [Glossarist::DetailedDefinition.new(content: definition)]
|
|
116
115
|
end
|
|
117
116
|
|
|
118
117
|
l10n = Glossarist::LocalizedConcept.new
|
data/lib/iev/exporter.rb
CHANGED
|
@@ -28,16 +28,19 @@ module Iev
|
|
|
28
28
|
# @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
|
|
29
29
|
# @param only_languages [String, nil] comma-separated language codes
|
|
30
30
|
# @param fetch_relaton_links [Boolean] fetch source URLs via Relaton
|
|
31
|
+
# @param include_areas [Boolean] create area/section hierarchy concepts
|
|
31
32
|
# @param on_progress [Proc, nil] callback (current, total) during build
|
|
32
33
|
def initialize(input_path, output_dir: Dir.pwd,
|
|
33
34
|
only_concepts: nil, only_languages: nil,
|
|
34
35
|
fetch_relaton_links: false,
|
|
36
|
+
include_areas: true,
|
|
35
37
|
on_progress: nil)
|
|
36
38
|
@input_path = Pathname.new(input_path)
|
|
37
39
|
validate_input!
|
|
38
40
|
|
|
39
41
|
@output_dir = Pathname.new(output_dir)
|
|
40
42
|
@fetch_relaton_links = fetch_relaton_links
|
|
43
|
+
@include_areas = include_areas
|
|
41
44
|
@on_progress = on_progress
|
|
42
45
|
@filters = {
|
|
43
46
|
only_concepts: only_concepts,
|
|
@@ -51,6 +54,8 @@ module Iev
|
|
|
51
54
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
52
55
|
dataset = load_dataset
|
|
53
56
|
collection = build_collection(dataset)
|
|
57
|
+
add_subject_area_concepts(collection) if @include_areas
|
|
58
|
+
build_section_narrower_relations(collection) if @include_areas
|
|
54
59
|
save_collection(collection)
|
|
55
60
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
56
61
|
|
|
@@ -81,7 +86,7 @@ module Iev
|
|
|
81
86
|
|
|
82
87
|
exts = (XLSX_EXTENSIONS + SQLITE_EXTENSIONS).join(", ")
|
|
83
88
|
raise ArgumentError,
|
|
84
|
-
|
|
89
|
+
"Unsupported format: #{input_path.extname}. Supported: #{exts}"
|
|
85
90
|
end
|
|
86
91
|
|
|
87
92
|
def input_format
|
|
@@ -137,10 +142,16 @@ module Iev
|
|
|
137
142
|
|
|
138
143
|
concept = concept_index[term.id] ||= begin
|
|
139
144
|
c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
|
|
145
|
+
c.uuid = term.id
|
|
146
|
+
c.data.domains = domain_references_for(term.id)
|
|
147
|
+
add_section_broader(c, term.id)
|
|
140
148
|
collection.store(c)
|
|
141
149
|
c
|
|
142
150
|
end
|
|
143
151
|
concept.add_l10n(term)
|
|
152
|
+
|
|
153
|
+
promote_supersession(concept, term)
|
|
154
|
+
set_managed_status(concept, term)
|
|
144
155
|
end
|
|
145
156
|
|
|
146
157
|
collection
|
|
@@ -148,14 +159,109 @@ module Iev
|
|
|
148
159
|
SourceParser.relaton_enabled = true
|
|
149
160
|
end
|
|
150
161
|
|
|
162
|
+
# Hands the collection to SubjectAreaConcepts.add_to and returns whatever
# that call returns.
#
# @param collection the concept collection being exported
def add_subject_area_concepts(collection)
  SubjectAreaConcepts.add_to(collection)
end
|
|
165
|
+
|
|
151
166
|
def save_collection(collection)
|
|
152
167
|
concepts_dir = output_dir.expand_path.join("concepts")
|
|
153
168
|
FileUtils.mkdir_p(concepts_dir)
|
|
154
|
-
collection.
|
|
169
|
+
collection.save_grouped_concepts_to_files(concepts_dir.to_s)
|
|
155
170
|
end
|
|
156
171
|
|
|
157
172
|
# Total number of localized concepts across every concept in the collection.
#
# @param collection [Enumerable] items responding to #localized_concepts
# @return [Integer] sum of each item's localized_concepts.count
def localized_count(collection)
  collection.inject(0) do |total, concept|
    total + concept.localized_concepts.count
  end
end
|
|
175
|
+
|
|
176
|
+
IEV_SOURCE = "urn:iec:std:iec:60050"
|
|
177
|
+
|
|
178
|
+
# Builds the "domain" concept references for an IEV reference.
#
# The reference is decomposed with IevCode; one ConceptReference is emitted
# for the area (when an area code is present) followed by one for the
# section (when a section code is present). Every reference uses IEV_SOURCE
# as its source.
#
# @param ievref [#to_s] IEV reference, e.g. "103-01-02"
# @return [Array<Glossarist::ConceptReference>] zero, one or two references
def domain_references_for(ievref)
  code = IevCode.new(ievref.to_s)

  domain_ids = []
  domain_ids << code.area_uri if code.area_code
  domain_ids << code.section_uri if code.section_code

  domain_ids.map do |domain_id|
    Glossarist::ConceptReference.new(
      concept_id: domain_id,
      source: IEV_SOURCE,
      ref_type: "domain",
    )
  end
end
|
|
197
|
+
|
|
198
|
+
# Links a concept to its section through a "broader" relation.
#
# Nothing happens when the IEV reference has no section part, or when the
# concept already carries a "broader" relation whose ref id is that
# section's URI. `concept.related` is initialised to an empty array when
# it is nil.
#
# @param concept managed concept exposing #related / #related=
# @param ievref [#to_s] IEV reference, e.g. "103-01-02"
def add_section_broader(concept, ievref)
  section_uri = IevCode.new(ievref.to_s).section_uri
  return unless section_uri

  concept.related ||= []
  already_linked = concept.related.any? do |relation|
    relation.type == "broader" && relation.ref&.id == section_uri
  end
  return if already_linked

  section_ref = Glossarist::ConceptRef.new(source: "IEV", id: section_uri)
  concept.related << Glossarist::RelatedConcept.new(
    type: "broader",
    content: section_uri,
    ref: section_ref,
  )
end
|
|
213
|
+
|
|
214
|
+
# Adds "narrower" relations from each section concept to the concepts whose
# IEV code falls inside that section.
#
# Concepts are indexed by `data.id`; every id is decomposed with IevCode and
# grouped under its section URI. For each section URI that is itself an id
# in the collection, one "narrower" RelatedConcept per child (children
# sorted by id) is appended to that concept's `related` list.
#
# @param collection [Enumerable] managed concepts exposing #data and #related
def build_section_narrower_relations(collection)
  concepts_by_id = {}
  collection.each do |managed|
    id = managed.data&.id
    concepts_by_id[id] = managed if id
  end

  children_by_section = concepts_by_id.keys
    .group_by { |concept_id| IevCode.new(concept_id).section_uri }
    .reject { |section_uri, _children| section_uri.nil? }

  children_by_section.each do |section_uri, child_ids|
    section_concept = concepts_by_id[section_uri]
    next unless section_concept

    narrower_links = child_ids.sort.map do |child_id|
      child_ref = Glossarist::ConceptRef.new(source: "IEV", id: child_id)
      Glossarist::RelatedConcept.new(
        type: "narrower",
        content: child_id,
        ref: child_ref,
      )
    end

    section_concept.related ||= []
    section_concept.related.concat(narrower_links)
  end
end
|
|
243
|
+
|
|
244
|
+
# Moves the relations found on a localization's data up to the managed
# concept. Supersession is language-independent (the REPLACES column is
# per-concept), so it is kept once at the managed level.
#
# A relation is skipped when the concept already has one with the same type
# and the same ref id. When the term carried at least one relation, its
# `data.related` is cleared afterwards.
#
# @param concept managed concept exposing #related / #related=
# @param term localized concept exposing #data (which may be nil)
def promote_supersession(concept, term)
  incoming = term.data&.related
  return unless incoming&.any?

  concept.related ||= []
  incoming.each do |relation|
    duplicate = concept.related.any? do |existing|
      existing.type == relation.type && existing.ref&.id == relation.ref&.id
    end
    concept.related << relation unless duplicate
  end
  term.data.related = nil
end
|
|
258
|
+
|
|
259
|
+
# Copies a localization's entry_status onto the managed concept, but only
# when the concept has no status yet and the entry status is present and
# non-empty.
#
# @param concept managed concept exposing #status / #status=
# @param term localized concept exposing #entry_status
def set_managed_status(concept, term)
  return if concept.status

  candidate = term.entry_status
  return if !candidate || candidate.empty?

  concept.status = candidate
end
|
|
160
266
|
end
|
|
161
267
|
end
|
data/lib/iev/iev_code.rb
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Immutable value object that decomposes an IEV concept code
  # into its structural parts: area code, section code, and number.
  #
  # The IEV code format is AAA-BB-CC where:
  #   AAA = area code (e.g. "103")
  #   BB  = section sub-code (e.g. "01")
  #   CC  = concept number (e.g. "02")
  #
  # The constructor never rejects its input: it splits on "-" and exposes
  # whatever parts are present. Use {IevCode.parse} when malformed input
  # should yield nil instead.
  #
  # @example Full concept code
  #   code = Iev::IevCode.new("103-01-02")
  #   code.area_code     #=> "103"
  #   code.section_code  #=> "103-01"
  #   code.number        #=> "02"
  #   code.area_uri      #=> "area-103"
  #   code.section_uri   #=> "section-103-01"
  #
  # @example Section code (no concept number)
  #   code = Iev::IevCode.new("103-01")
  #   code.area_code     #=> "103"
  #   code.section_code  #=> "103-01"
  #   code.number        #=> nil
  #   code.section_uri   #=> "section-103-01"
  #
  class IevCode
    include Comparable

    attr_reader :raw, :area_code, :section_code, :number

    # @param code [#to_s] IEV reference, e.g. "103-01-02"
    def initialize(code)
      @raw = code.to_s
      parts = @raw.split("-")
      @area_code = parts[0]
      @section_code = parts.length >= 2 ? "#{parts[0]}-#{parts[1]}" : nil
      @number = parts.length >= 3 ? parts[2] : nil
      freeze
    end

    # @return [String, nil] "area-<area_code>", or nil when the code has no
    #   area part (e.g. an empty string), mirroring {#section_uri}
    def area_uri
      "area-#{area_code}" if area_code
    end

    # @return [String, nil] "section-<section_code>", or nil when the code
    #   has no section part
    def section_uri
      "section-#{section_code}" if section_code
    end

    # @return [String] the code exactly as given to the constructor
    def to_s
      @raw
    end

    # Allows an IevCode to be used where a String is implicitly expected.
    def to_str
      @raw
    end

    # Two codes are equal when they are of the same class and share the
    # same raw string.
    def ==(other)
      other.is_a?(self.class) && raw == other.raw
    end
    alias_method :eql?, :==

    def hash
      raw.hash
    end

    # Orders codes by plain string comparison of their raw form.
    def <=>(other)
      to_s <=> other.to_s
    end

    # Safe constructor that returns nil for codes that don't parse.
    #
    # A code does not parse when it is blank or when any of its
    # dash-separated segments is blank (e.g. "", "103-", "103--02").
    #
    # @param code [#to_s]
    # @return [IevCode, nil]
    def self.parse(code)
      # Limit -1 keeps trailing empty segments so "103-" is seen as malformed.
      segments = code.to_s.split("-", -1)
      return nil if segments.empty? || segments.any? { |part| part.strip.empty? }

      new(code)
    rescue ArgumentError
      # Not raised by this class's constructor; kept so a stricter
      # constructor (e.g. in a subclass) still maps to nil here.
      nil
    end
  end
end
|
data/lib/iev/iso_639_code.rb
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
module Iev
|
|
7
7
|
# @todo This needs to be rewritten.
|
|
8
8
|
class Iso639Code
|
|
9
|
-
COUNTRY_CODES = YAML.
|
|
9
|
+
COUNTRY_CODES = YAML.safe_load(IO.read(File.join(__dir__, "iso_639_2.yaml")), permitted_classes: [Symbol]).freeze
|
|
10
10
|
# rubocop:disable Style/MutableConstant
|
|
11
11
|
THREE_CHAR_MEMO = {} # Memoization cache, must be mutable
|
|
12
12
|
# rubocop:enable Style/MutableConstant
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ferrum"
|
|
4
|
+
|
|
5
|
+
module Iev
  # Shared headless browser utilities for fetching pages behind AWS WAF.
  module ScraperBrowser
    # Builds a desktop Chrome User-Agent string for an OS token and a
    # Chrome major version.
    chrome_user_agent = lambda do |os_token, major|
      "Mozilla/5.0 (#{os_token}) " \
        "AppleWebKit/537.36 (KHTML, like Gecko) " \
        "Chrome/#{major}.0.0.0 Safari/537.36"
    end

    # Header profiles, one of which is picked at random per request. Each
    # entry pairs a User-Agent with the platform hint and Chrome major
    # version used for the matching Sec-Ch-Ua-* headers.
    USER_AGENT_PROFILES = [
      ["Macintosh; Intel Mac OS X 10_15_7", '"macOS"', "131"],
      ["Windows NT 10.0; Win64; x64", '"Windows"', "130"],
      ["X11; Linux x86_64", '"Linux"', "131"],
      ["Macintosh; Intel Mac OS X 10_15_7", '"macOS"', "129"],
      ["Windows NT 10.0; Win64; x64", '"Windows"', "131"],
    ].map do |os_token, platform, major|
      {
        user_agent: chrome_user_agent.call(os_token, major),
        platform: platform,
        chrome_version: major,
      }
    end.freeze

    class << self
      # Fetch a URL using headless Chrome, returning the page HTML.
      # Handles AWS WAF challenge pages by waiting for JS execution.
      #
      # @param url [String] page to load
      # @param browser_opts [Hash] extra Ferrum::Browser options; these
      #   override the defaults set here
      # @return [String, nil] the page body, or nil when the body contains
      #   "403 ERROR" / "Request blocked" or a Ferrum error was raised
      #   (a warning is printed in both cases)
      def fetch(url, browser_opts: {})
        defaults = {
          headless: "new",
          timeout: 30,
          window_size: [1366, 768],
          browser_options: {
            "disable-blink-features" => "AutomationControlled",
          },
        }
        browser = Ferrum::Browser.new(**defaults.merge(browser_opts))

        browser.headers.set(random_headers)
        browser.go_to(url)
        browser.network.wait_for_idle(timeout: 15)
        html = browser.body

        blocked = ["403 ERROR", "Request blocked"].any? do |marker|
          html.include?(marker)
        end
        return html unless blocked

        warn "IEV: AWS WAF blocked request for #{url}"
        nil
      rescue Ferrum::Error, Ferrum::BrowserError => e
        warn "IEV: Browser error fetching #{url}: #{e.message}"
        nil
      ensure
        # The browser is shut down on every path, including errors.
        browser&.quit
      end

      # Request headers for one randomly chosen entry of USER_AGENT_PROFILES.
      #
      # @return [Hash{String => String}]
      def random_headers
        profile = USER_AGENT_PROFILES.sample
        major = profile[:chrome_version]
        brands = [["Google Chrome", major], ["Chromium", major], ["Not_A Brand", "24"]]
        sec_ch_ua = brands.map { |brand, version| %("#{brand}";v="#{version}") }.join(", ")

        accept = "text/html,application/xhtml+xml,application/xml;q=0.9," \
                 "image/avif,image/webp,image/apng,*/*;q=0.8," \
                 "application/signed-exchange;v=b3;q=0.7"

        {
          "Accept" => accept,
          "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
          "Cache-Control" => "no-cache",
          "Pragma" => "no-cache",
          "Sec-Ch-Ua" => sec_ch_ua,
          "Sec-Ch-Ua-Mobile" => "?0",
          "Sec-Ch-Ua-Platform" => profile[:platform],
          "Sec-Fetch-Dest" => "document",
          "Sec-Fetch-Mode" => "navigate",
          "Sec-Fetch-Site" => "cross-site",
          "Sec-Fetch-User" => "?1",
          "Upgrade-Insecure-Requests" => "1",
          "User-Agent" => profile[:user_agent],
        }
      end
    end
  end
end
|
data/lib/iev/scraper.rb
CHANGED
|
@@ -1,59 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
3
5
|
module Iev
|
|
4
|
-
# Scrapes IEV term data from Electropedia (electropedia.org).
|
|
5
|
-
#
|
|
6
|
-
# Electropedia is behind AWS WAF which requires JavaScript execution,
|
|
7
|
-
# so a headless browser (via Ferrum/Chrome) is used to handle the challenge.
|
|
8
|
-
#
|
|
9
|
-
# @example
|
|
10
|
-
# scraper = Iev::Scraper.new
|
|
11
|
-
# concept = scraper.fetch_concept("103-01-02")
|
|
12
|
-
# doc = scraper.fetch_page("103-01-02")
|
|
13
6
|
class Scraper
|
|
14
7
|
BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
|
|
15
8
|
"display?openform&ievref="
|
|
16
9
|
|
|
17
|
-
# Pool of realistic Chrome User-Agent strings with matching platform hints.
|
|
18
|
-
# Rotated per request to reduce fingerprinting by AWS WAF.
|
|
19
|
-
USER_AGENT_PROFILES = [
|
|
20
|
-
{
|
|
21
|
-
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
|
|
22
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
23
|
-
"Chrome/131.0.0.0 Safari/537.36",
|
|
24
|
-
platform: '"macOS"',
|
|
25
|
-
chrome_version: "131",
|
|
26
|
-
},
|
|
27
|
-
{
|
|
28
|
-
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
|
|
29
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
30
|
-
"Chrome/130.0.0.0 Safari/537.36",
|
|
31
|
-
platform: '"Windows"',
|
|
32
|
-
chrome_version: "130",
|
|
33
|
-
},
|
|
34
|
-
{
|
|
35
|
-
user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
|
|
36
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
37
|
-
"Chrome/131.0.0.0 Safari/537.36",
|
|
38
|
-
platform: '"Linux"',
|
|
39
|
-
chrome_version: "131",
|
|
40
|
-
},
|
|
41
|
-
{
|
|
42
|
-
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
|
|
43
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
44
|
-
"Chrome/129.0.0.0 Safari/537.36",
|
|
45
|
-
platform: '"macOS"',
|
|
46
|
-
chrome_version: "129",
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
|
|
50
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
51
|
-
"Chrome/131.0.0.0 Safari/537.36",
|
|
52
|
-
platform: '"Windows"',
|
|
53
|
-
chrome_version: "131",
|
|
54
|
-
},
|
|
55
|
-
].freeze
|
|
56
|
-
|
|
57
10
|
def initialize(browser_opts: {})
|
|
58
11
|
@browser_opts = browser_opts
|
|
59
12
|
end
|
|
@@ -61,37 +14,10 @@ module Iev
|
|
|
61
14
|
# Fetch the Electropedia page HTML for a given IEV code.
|
|
62
15
|
# Returns a Nokogiri document.
|
|
63
16
|
def fetch_page(code)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
url = "#{BASE_URL}#{code}"
|
|
68
|
-
browser = Ferrum::Browser.new(
|
|
69
|
-
headless: "new",
|
|
70
|
-
timeout: 30,
|
|
71
|
-
window_size: [1366, 768],
|
|
72
|
-
browser_options: {
|
|
73
|
-
"disable-blink-features" => "AutomationControlled",
|
|
74
|
-
},
|
|
75
|
-
**@browser_opts,
|
|
76
|
-
)
|
|
77
|
-
|
|
78
|
-
browser.headers.set(random_headers)
|
|
79
|
-
browser.go_to(url)
|
|
80
|
-
browser.network.wait_for_idle(timeout: 15)
|
|
81
|
-
html = browser.body
|
|
82
|
-
|
|
83
|
-
# Check if we got a real page or a WAF block
|
|
84
|
-
if html.include?("403 ERROR") || html.include?("Request blocked")
|
|
85
|
-
warn "IEV Scraper: AWS WAF blocked request for #{code}"
|
|
86
|
-
return nil
|
|
87
|
-
end
|
|
17
|
+
html = ScraperBrowser.fetch("#{BASE_URL}#{code}", browser_opts: @browser_opts)
|
|
18
|
+
return nil unless html
|
|
88
19
|
|
|
89
20
|
Nokogiri::HTML(html)
|
|
90
|
-
rescue Ferrum::Error, Ferrum::BrowserError => e
|
|
91
|
-
warn "IEV Scraper error for #{code}: #{e.message}"
|
|
92
|
-
nil
|
|
93
|
-
ensure
|
|
94
|
-
browser&.quit
|
|
95
21
|
end
|
|
96
22
|
|
|
97
23
|
# Fetch and parse concept data for an IEV code.
|
|
@@ -102,34 +28,8 @@ module Iev
|
|
|
102
28
|
|
|
103
29
|
PageParser.new(doc, code).parse
|
|
104
30
|
end
|
|
105
|
-
|
|
106
|
-
private
|
|
107
|
-
|
|
108
|
-
def random_headers
|
|
109
|
-
profile = USER_AGENT_PROFILES.sample
|
|
110
|
-
sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
|
|
111
|
-
"\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
|
|
112
|
-
"\"Not_A Brand\";v=\"24\""
|
|
113
|
-
|
|
114
|
-
{
|
|
115
|
-
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
|
|
116
|
-
"image/avif,image/webp,image/apng,*/*;q=0.8," \
|
|
117
|
-
"application/signed-exchange;v=b3;q=0.7",
|
|
118
|
-
"Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
|
|
119
|
-
"Cache-Control" => "no-cache",
|
|
120
|
-
"Pragma" => "no-cache",
|
|
121
|
-
"Sec-Ch-Ua" => sec_ch_ua,
|
|
122
|
-
"Sec-Ch-Ua-Mobile" => "?0",
|
|
123
|
-
"Sec-Ch-Ua-Platform" => profile[:platform],
|
|
124
|
-
"Sec-Fetch-Dest" => "document",
|
|
125
|
-
"Sec-Fetch-Mode" => "navigate",
|
|
126
|
-
"Sec-Fetch-Site" => "cross-site",
|
|
127
|
-
"Sec-Fetch-User" => "?1",
|
|
128
|
-
"Upgrade-Insecure-Requests" => "1",
|
|
129
|
-
"User-Agent" => profile[:user_agent],
|
|
130
|
-
}
|
|
131
|
-
end
|
|
132
31
|
end
|
|
133
32
|
end
|
|
134
33
|
|
|
34
|
+
require_relative "scraper/browser"
|
|
135
35
|
require_relative "scraper/page_parser"
|
data/lib/iev/section.rb
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Immutable value object representing an IEV section (e.g. "103-01").
  #
  # A section belongs to exactly one area, identified by +area_code+.
  # Identity (==, eql?, hash) is based on the section code alone.
  class Section
    attr_reader :code, :title, :area_code

    # All three values are stored as strings and the instance is frozen.
    #
    # @param code [#to_s] section code, e.g. "103-01"
    # @param title [#to_s] section title, e.g. "General concepts on functions"
    # @param area_code [#to_s] parent area code, e.g. "103"
    def initialize(code:, title:, area_code:)
      @code, @title, @area_code = [code, title, area_code].map(&:to_s)
      freeze
    end

    # @return [String] the section code prefixed with "section-"
    def uri
      "section-" + code
    end

    # @return [Hash{String => String}] code and title only; the area code
    #   is not included
    def to_h
      { "code" => @code, "title" => @title }
    end

    def ==(other)
      return false unless other.is_a?(self.class)

      code == other.code
    end
    alias eql? ==

    def hash
      code.hash
    end
  end
end
|
data/lib/iev/source_parser.rb
CHANGED
|
@@ -79,10 +79,11 @@ module Iev
|
|
|
79
79
|
relationship = extract_source_relationship(raw_ref)
|
|
80
80
|
clean_ref = normalize_ref_string(raw_ref)
|
|
81
81
|
source_ref = extract_source_ref(clean_ref)
|
|
82
|
+
ref_source, ref_id = split_ref(source_ref)
|
|
82
83
|
clause = extract_source_clause(clean_ref)
|
|
83
84
|
|
|
84
85
|
origin = Glossarist::Citation.new(
|
|
85
|
-
ref:
|
|
86
|
+
ref: Glossarist::Citation::Ref.new(source: ref_source, id: ref_id),
|
|
86
87
|
locality: build_locality(clause),
|
|
87
88
|
link: obtain_source_link(source_ref),
|
|
88
89
|
original: Iev::Converter.mathml_to_asciimath(
|
|
@@ -351,6 +352,52 @@ module Iev
|
|
|
351
352
|
)
|
|
352
353
|
end
|
|
353
354
|
|
|
355
|
+
# Splits a normalized bibliographic reference into [source, id] for
# structured Citation::Ref construction. The full string is still
# passed to Relaton for link resolution — only the Citation::Ref
# model receives the split form.
#
#   "IEC 62302:2007"        → ["IEC", "62302:2007"]
#   "ISO/IEC 2382:2015"     → ["ISO/IEC", "2382:2015"]
#   "ISO/TS 14812:2022"     → ["ISO/TS", "14812:2022"]
#   "IEC CISPR 16-1:2003"   → ["IEC CISPR", "16-1:2003"]
#   "ITU-T Recommendation F.791 (11/2015)" → ["ITU-T Recommendation", "F.791 (11/2015)"]
#   "JCGM VIM"              → ["JCGM", "VIM"]
#   "IEV"                   → ["IEV", nil]
#   "BIPM SI Brochure"      → ["BIPM", "SI Brochure"]
#
# Any reference starting with "BIPM" maps to ["BIPM", "SI Brochure"].
# Anything unrecognised (including nil) is returned whole as
# [full_ref, nil].
#
# @param full_ref [String, nil] normalized reference string
# @return [Array(String, String), Array(String, nil), Array(nil, nil)]
def split_ref(full_ref)
  case full_ref
  # Prefix patterns are tried top to bottom, so the more specific
  # publisher prefixes must stay ahead of the generic ISO/IEC ones.
  when %r{\A(ISO/IEC/IEEE)\s+(.+)},
       %r{\A(ISO/IEC\s+Guide)\s+(.+)},
       %r{\A(ISO/IEC)\s+(.+)},
       %r{\A(IEC/IEEE)\s+(.+)},
       %r{\A((?:ISO|IEC)/(?:PAS|TR|TS))\s+(.+)},
       /\A(IEC\s+CISPR)\s+(.+)/,
       /\A(ITU-T\s+Recommendation)\s+(.+)/,
       /\A(ITU-R\s+Recommendation)\s+(.+)/,
       /\A(ITU-R)\s+(.+)/,
       /\A((?:ISO|IEC)\s+Guide)\s+(.+)/,
       /\A(ISO|IEC|IAEA)\s+(.+)/,
       /\A(JCGM)\s+(VIM)\z/
    # The match data belongs to whichever pattern matched first.
    Regexp.last_match.captures
  when /\AIEV\z/
    ["IEV", nil]
  when /\ABIPM/
    # Was /\ABBIPM/, which only matched a literal "BBIPM" prefix and so
    # never caught BIPM references.
    ["BIPM", "SI Brochure"]
  else
    [full_ref, nil]
  end
end
|
|
400
|
+
|
|
354
401
|
# Uses Relaton to obtain link for given source ref.
|
|
355
402
|
def obtain_source_link(ref)
|
|
356
403
|
return nil unless self.class.relaton_enabled
|