iev 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +3 -0
- data/Gemfile +3 -18
- data/README.adoc +34 -0
- data/data/subject_areas.yaml +1920 -0
- data/lib/iev/cli/command.rb +24 -0
- data/lib/iev/exporter.rb +19 -0
- data/lib/iev/scraper/browser.rb +102 -0
- data/lib/iev/scraper.rb +5 -105
- data/lib/iev/subject_area_concepts.rb +123 -0
- data/lib/iev/subject_areas.rb +232 -0
- data/lib/iev/term_builder.rb +19 -0
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +36 -0
- metadata +6 -2
data/lib/iev/cli/command.rb
CHANGED
|
@@ -142,6 +142,30 @@ module Iev
|
|
|
142
142
|
summary
|
|
143
143
|
end
|
|
144
144
|
|
|
145
|
+
# CLI command: obtain the IEV subject areas/sections via Iev::SubjectAreas.fetch
# and emit them as YAML, either to the file given with --output or to stdout.
# With --refresh the cached "subject_areas.yaml" in the configured cache
# directory is removed first. A FetchError is reported through +error+ and the
# process exits with status 1.
desc "subject_areas", "Fetch IEV subject areas and sections from Electropedia."
option :output, desc: "Output YAML file (default: stdout)", aliases: :o
option :refresh, type: :boolean, default: false,
                 desc: "Force re-fetch even if cached"
def subject_areas
  if options[:refresh]
    # Drop the cached result so the fetch below cannot short-circuit on it.
    stale_cache = File.join(Iev.config.cache_dir, "subject_areas.yaml")
    FileUtils.rm_f(stale_cache) if File.exist?(stale_cache)
  end

  serialized = YAML.dump(Iev::SubjectAreas.fetch)
  destination = options[:output]

  unless destination
    puts serialized
    return
  end

  File.write(destination, serialized, encoding: "utf-8")
  puts "Written to #{destination}"
rescue Iev::SubjectAreas::FetchError => e
  error e.message
  exit 1
end
|
|
168
|
+
|
|
145
169
|
desc "fetch CODE", "Fetch an IEV concept and output YAML to stdout."
|
|
146
170
|
option :scrape, type: :boolean, default: false,
|
|
147
171
|
desc: "Scrape from Electropedia instead of using cached data"
|
data/lib/iev/exporter.rb
CHANGED
|
@@ -28,16 +28,19 @@ module Iev
|
|
|
28
28
|
# @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
|
|
29
29
|
# @param only_languages [String, nil] comma-separated language codes
|
|
30
30
|
# @param fetch_relaton_links [Boolean] fetch source URLs via Relaton
|
|
31
|
+
# @param include_areas [Boolean] create area/section hierarchy concepts
|
|
31
32
|
# @param on_progress [Proc, nil] callback (current, total) during build
|
|
32
33
|
def initialize(input_path, output_dir: Dir.pwd,
|
|
33
34
|
only_concepts: nil, only_languages: nil,
|
|
34
35
|
fetch_relaton_links: false,
|
|
36
|
+
include_areas: true,
|
|
35
37
|
on_progress: nil)
|
|
36
38
|
@input_path = Pathname.new(input_path)
|
|
37
39
|
validate_input!
|
|
38
40
|
|
|
39
41
|
@output_dir = Pathname.new(output_dir)
|
|
40
42
|
@fetch_relaton_links = fetch_relaton_links
|
|
43
|
+
@include_areas = include_areas
|
|
41
44
|
@on_progress = on_progress
|
|
42
45
|
@filters = {
|
|
43
46
|
only_concepts: only_concepts,
|
|
@@ -51,6 +54,7 @@ module Iev
|
|
|
51
54
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
52
55
|
dataset = load_dataset
|
|
53
56
|
collection = build_collection(dataset)
|
|
57
|
+
add_subject_area_concepts(collection) if @include_areas
|
|
54
58
|
save_collection(collection)
|
|
55
59
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
56
60
|
|
|
@@ -137,6 +141,7 @@ module Iev
|
|
|
137
141
|
|
|
138
142
|
concept = concept_index[term.id] ||= begin
|
|
139
143
|
c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
|
|
144
|
+
c.data.domains = domain_references_for(term.id)
|
|
140
145
|
collection.store(c)
|
|
141
146
|
c
|
|
142
147
|
end
|
|
@@ -148,6 +153,10 @@ module Iev
|
|
|
148
153
|
SourceParser.relaton_enabled = true
|
|
149
154
|
end
|
|
150
155
|
|
|
156
|
+
# Hand the collection to SubjectAreaConcepts so it can add the subject
# area/section hierarchy concepts to it.
#
# @param collection [Object] the concept collection being exported
# @return [Object] whatever SubjectAreaConcepts.add_to returns
def add_subject_area_concepts(collection)
  SubjectAreaConcepts.add_to(collection)
end
|
|
159
|
+
|
|
151
160
|
def save_collection(collection)
|
|
152
161
|
concepts_dir = output_dir.expand_path.join("concepts")
|
|
153
162
|
FileUtils.mkdir_p(concepts_dir)
|
|
@@ -157,5 +166,15 @@ module Iev
|
|
|
157
166
|
# Total number of localized concepts across every concept in the collection.
#
# @param collection [Enumerable] items responding to +localized_concepts+
# @return [Integer] sum of each item's +localized_concepts.count+
def localized_count(collection)
  per_concept = collection.map { |concept| concept.localized_concepts.count }
  per_concept.sum
end
|
|
169
|
+
|
|
170
|
+
# Build the domain references for an IEV reference: first the area
# (first "-"-separated part), then the section (first two parts).
#
# @param ievref [#to_s] IEV reference such as "102-01-03"
# @return [Array] results of Glossarist::ConceptReference.domain for the
#   area and section ids; empty when the reference has fewer than two parts
def domain_references_for(ievref)
  area_code, section_number = ievref.to_s.split("-")
  return [] unless section_number

  domain_ids = [
    SubjectAreas.area_uri(area_code),
    SubjectAreas.section_uri("#{area_code}-#{section_number}"),
  ]
  domain_ids.map { |domain_id| Glossarist::ConceptReference.domain(domain_id) }
end
|
|
160
179
|
end
|
|
161
180
|
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "ferrum"
|
|
4
|
+
|
|
5
|
+
module Iev
  # Shared headless browser utilities for fetching pages behind AWS WAF.
  #
  # Each call to +fetch+ starts its own Ferrum browser, sends one randomly
  # chosen set of Chrome-like request headers, and quits the browser again.
  module ScraperBrowser
    # Chrome User-Agent strings paired with the platform and major version
    # used for the matching Sec-Ch-Ua* headers. Each profile hash is frozen
    # individually so a caller holding a sampled profile cannot alter the pool.
    USER_AGENT_PROFILES = [
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/130.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "130",
      },
      {
        user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Linux"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/129.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "129",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "131",
      },
    ].each(&:freeze).freeze

    # Fetch a URL using headless Chrome, returning the page HTML.
    # Handles AWS WAF challenge pages by waiting for JS execution
    # (the call waits for network idle before reading the body).
    #
    # @param url [String] page to load
    # @param browser_opts [Hash] extra Ferrum::Browser options, merged after
    #   the defaults below so they take precedence on key clashes
    # @return [String, nil] page HTML; nil when the body contains a WAF block
    #   marker ("403 ERROR" / "Request blocked") or a Ferrum error was raised
    def self.fetch(url, browser_opts: {})
      browser = Ferrum::Browser.new(
        headless: "new",
        timeout: 30,
        window_size: [1366, 768],
        browser_options: {
          "disable-blink-features" => "AutomationControlled",
        },
        **browser_opts,
      )

      browser.headers.set(random_headers)
      browser.go_to(url)
      browser.network.wait_for_idle(timeout: 15)
      html = browser.body

      if html.include?("403 ERROR") || html.include?("Request blocked")
        warn "IEV: AWS WAF blocked request for #{url}"
        return nil
      end

      html
    rescue Ferrum::Error, Ferrum::BrowserError => e
      warn "IEV: Browser error fetching #{url}: #{e.message}"
      nil
    ensure
      # browser is nil when Ferrum::Browser.new itself raised.
      browser&.quit
    end

    # Build request headers from one randomly sampled profile so that the
    # User-Agent, Sec-Ch-Ua and Sec-Ch-Ua-Platform values agree with each other.
    #
    # @return [Hash{String => String}] header name => value
    def self.random_headers
      profile = USER_AGENT_PROFILES.sample
      version = profile[:chrome_version]
      sec_ch_ua = "\"Google Chrome\";v=\"#{version}\", " \
                  "\"Chromium\";v=\"#{version}\", " \
                  "\"Not_A Brand\";v=\"24\""

      {
        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                    "image/avif,image/webp,image/apng,*/*;q=0.8," \
                    "application/signed-exchange;v=b3;q=0.7",
        "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
        "Cache-Control" => "no-cache",
        "Pragma" => "no-cache",
        "Sec-Ch-Ua" => sec_ch_ua,
        "Sec-Ch-Ua-Mobile" => "?0",
        "Sec-Ch-Ua-Platform" => profile[:platform],
        "Sec-Fetch-Dest" => "document",
        "Sec-Fetch-Mode" => "navigate",
        "Sec-Fetch-Site" => "cross-site",
        "Sec-Fetch-User" => "?1",
        "Upgrade-Insecure-Requests" => "1",
        "User-Agent" => profile[:user_agent],
      }
    end
  end
end
|
data/lib/iev/scraper.rb
CHANGED
|
@@ -1,59 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
|
|
3
5
|
module Iev
|
|
4
|
-
# Scrapes IEV term data from Electropedia (electropedia.org).
|
|
5
|
-
#
|
|
6
|
-
# Electropedia is behind AWS WAF which requires JavaScript execution,
|
|
7
|
-
# so a headless browser (via Ferrum/Chrome) is used to handle the challenge.
|
|
8
|
-
#
|
|
9
|
-
# @example
|
|
10
|
-
# scraper = Iev::Scraper.new
|
|
11
|
-
# concept = scraper.fetch_concept("103-01-02")
|
|
12
|
-
# doc = scraper.fetch_page("103-01-02")
|
|
13
6
|
class Scraper
|
|
14
7
|
BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
|
|
15
8
|
"display?openform&ievref="
|
|
16
9
|
|
|
17
|
-
# Pool of realistic Chrome User-Agent strings with matching platform hints.
|
|
18
|
-
# Rotated per request to reduce fingerprinting by AWS WAF.
|
|
19
|
-
USER_AGENT_PROFILES = [
|
|
20
|
-
{
|
|
21
|
-
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
|
|
22
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
23
|
-
"Chrome/131.0.0.0 Safari/537.36",
|
|
24
|
-
platform: '"macOS"',
|
|
25
|
-
chrome_version: "131",
|
|
26
|
-
},
|
|
27
|
-
{
|
|
28
|
-
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
|
|
29
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
30
|
-
"Chrome/130.0.0.0 Safari/537.36",
|
|
31
|
-
platform: '"Windows"',
|
|
32
|
-
chrome_version: "130",
|
|
33
|
-
},
|
|
34
|
-
{
|
|
35
|
-
user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
|
|
36
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
37
|
-
"Chrome/131.0.0.0 Safari/537.36",
|
|
38
|
-
platform: '"Linux"',
|
|
39
|
-
chrome_version: "131",
|
|
40
|
-
},
|
|
41
|
-
{
|
|
42
|
-
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
|
|
43
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
44
|
-
"Chrome/129.0.0.0 Safari/537.36",
|
|
45
|
-
platform: '"macOS"',
|
|
46
|
-
chrome_version: "129",
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
|
|
50
|
-
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
51
|
-
"Chrome/131.0.0.0 Safari/537.36",
|
|
52
|
-
platform: '"Windows"',
|
|
53
|
-
chrome_version: "131",
|
|
54
|
-
},
|
|
55
|
-
].freeze
|
|
56
|
-
|
|
57
10
|
def initialize(browser_opts: {})
|
|
58
11
|
@browser_opts = browser_opts
|
|
59
12
|
end
|
|
@@ -61,37 +14,10 @@ module Iev
|
|
|
61
14
|
# Fetch the Electropedia page for an IEV code through ScraperBrowser and
# parse it with Nokogiri.
#
# @param code [String] IEV reference appended to BASE_URL
# @return [Object, nil] the Nokogiri document for the page, or nil when
#   ScraperBrowser.fetch returned no HTML
def fetch_page(code)
  page_url = "#{BASE_URL}#{code}"
  html = ScraperBrowser.fetch(page_url, browser_opts: @browser_opts)

  html ? Nokogiri::HTML(html) : nil
end
|
|
96
22
|
|
|
97
23
|
# Fetch and parse concept data for an IEV code.
|
|
@@ -102,34 +28,8 @@ module Iev
|
|
|
102
28
|
|
|
103
29
|
PageParser.new(doc, code).parse
|
|
104
30
|
end
|
|
105
|
-
|
|
106
|
-
private
|
|
107
|
-
|
|
108
|
-
def random_headers
|
|
109
|
-
profile = USER_AGENT_PROFILES.sample
|
|
110
|
-
sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
|
|
111
|
-
"\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
|
|
112
|
-
"\"Not_A Brand\";v=\"24\""
|
|
113
|
-
|
|
114
|
-
{
|
|
115
|
-
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
|
|
116
|
-
"image/avif,image/webp,image/apng,*/*;q=0.8," \
|
|
117
|
-
"application/signed-exchange;v=b3;q=0.7",
|
|
118
|
-
"Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
|
|
119
|
-
"Cache-Control" => "no-cache",
|
|
120
|
-
"Pragma" => "no-cache",
|
|
121
|
-
"Sec-Ch-Ua" => sec_ch_ua,
|
|
122
|
-
"Sec-Ch-Ua-Mobile" => "?0",
|
|
123
|
-
"Sec-Ch-Ua-Platform" => profile[:platform],
|
|
124
|
-
"Sec-Fetch-Dest" => "document",
|
|
125
|
-
"Sec-Fetch-Mode" => "navigate",
|
|
126
|
-
"Sec-Fetch-Site" => "cross-site",
|
|
127
|
-
"Sec-Fetch-User" => "?1",
|
|
128
|
-
"Upgrade-Insecure-Requests" => "1",
|
|
129
|
-
"User-Agent" => profile[:user_agent],
|
|
130
|
-
}
|
|
131
|
-
end
|
|
132
31
|
end
|
|
133
32
|
end
|
|
134
33
|
|
|
34
|
+
require_relative "scraper/browser"
|
|
135
35
|
require_relative "scraper/page_parser"
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Creates ManagedConcept entries for the IEV subject area hierarchy.
  #
  # Two levels are produced from Iev.subject_areas:
  # - Area    (e.g., "102" = "Mathematics - General concepts and linear algebra")
  # - Section (e.g., "102-01" = "Sets and operations")
  #
  # Links created here:
  # - an area concept lists itself as its only domain and, when it has
  #   sections, carries one "narrower" relation per section
  # - a section concept lists its area and itself as domains; its English
  #   concept data has the area id as +domain+ and a "broader" relation to it
  module SubjectAreaConcepts
    class << self
      # Build all area and section concepts and store them in the collection,
      # each area immediately followed by its sections.
      #
      # @param collection [Glossarist::ManagedConceptCollection]
      # @return [void]
      def add_to(collection)
        Iev.subject_areas.each do |area|
          collection.store(build_area_concept(area))

          (area["sections"] || []).each do |section|
            collection.store(build_section_concept(section, area))
          end
        end
      end

      private

      # Managed concept for one area hash ("code", "title", optional "sections").
      def build_area_concept(area)
        area_id = SubjectAreas.area_uri(area["code"])
        managed_data = Glossarist::ManagedConceptData.new(
          id: area_id,
          domains: [Glossarist::ConceptReference.domain(area_id)],
        )

        Glossarist::ManagedConcept.new(data: managed_data).tap do |concept|
          concept.add_localization(build_localization(area_id, area["title"], "eng"))

          section_refs = (area["sections"] || []).map do |section|
            build_narrower_ref(section["code"])
          end
          # Leave +related+ untouched for areas without sections.
          concept.related = section_refs unless section_refs.empty?
        end
      end

      # Managed concept for one section hash, linked to its parent area hash.
      def build_section_concept(section, area)
        section_id = SubjectAreas.section_uri(section["code"])
        managed_data = Glossarist::ManagedConceptData.new(
          id: section_id,
          domains: [
            Glossarist::ConceptReference.domain(SubjectAreas.area_uri(area["code"])),
            Glossarist::ConceptReference.domain(section_id),
          ],
        )
        concept = Glossarist::ManagedConcept.new(data: managed_data)

        section_data = build_concept_data(section_id, section["title"], "eng")
        section_data.domain = SubjectAreas.area_uri(area["code"])
        section_data.related = [build_broader_ref(area["code"])]

        concept.add_localization(build_localization_from_data(section_id, section_data))
        concept
      end

      # Concept data holding +title+ as the single preferred expression.
      def build_concept_data(id, title, lang_code)
        preferred_term = Glossarist::Designation::Expression.new(
          type: "expression",
          designation: title,
          normative_status: "preferred",
        )

        Glossarist::ConceptData.new(
          id: id,
          language_code: lang_code,
          terms: [preferred_term],
        )
      end

      # Localized concept built from a title: creates the concept data, then
      # wraps it exactly like build_localization_from_data does.
      def build_localization(id, title, lang_code)
        build_localization_from_data(id, build_concept_data(id, title, lang_code))
      end

      # Wrap existing concept data in a localized concept marked as
      # entry status "valid" with review decision event "published".
      def build_localization_from_data(id, concept_data)
        Glossarist::LocalizedConcept.new.tap do |localized|
          localized.data = concept_data
          localized.id = id
          localized.entry_status = "valid"
          localized.data.review_decision_event = "published"
        end
      end

      # "broader" relation pointing at the area with the given code.
      def build_broader_ref(area_code)
        Glossarist::RelatedConcept.new(
          type: "broader",
          content: SubjectAreas.area_uri(area_code),
        )
      end

      # "narrower" relation pointing at the section with the given code.
      def build_narrower_ref(section_code)
        Glossarist::RelatedConcept.new(
          type: "narrower",
          content: SubjectAreas.section_uri(section_code),
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "yaml"
|
|
4
|
+
require "nokogiri"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require "iev/config"
|
|
7
|
+
|
|
8
|
+
module Iev
  # IEV subject areas and their sections.
  #
  # Query methods read the YAML file at DATA_FILE. +fetch+ scrapes the lists
  # through ScraperBrowser and keeps its progress in a YAML file under
  # Iev.config.cache_dir.
  module SubjectAreas
    DATA_FILE = File.expand_path("../../data/subject_areas.yaml", __dir__)

    AREAS_URL = "https://electropedia.org/iev/iev.nsf/" \
                "6d6bdd8667c378f7c12581fa003d80e7?OpenForm"
    SECTIONS_URL_TEMPLATE = "https://electropedia.org/iev/iev.nsf/" \
                            "index?openform&part=%<part>s"

    # Name of the cache file inside Iev.config.cache_dir.
    CACHE_FILENAME = "subject_areas.yaml"

    # Pages with fewer characters than this are treated as challenge pages.
    MIN_PAGE_SIZE = 15_000

    # A cached result with fewer areas than this is never considered complete.
    # NOTE(review): presumably the number of areas listed when this was
    # written; confirm before relying on it.
    MIN_AREA_COUNT = 99

    FETCH_DELAY = 5  # seconds slept between section page fetches
    RETRY_DELAY = 30 # seconds; multiplied by the attempt number before a retry
    MAX_RETRIES = 2  # default number of attempts per page

    class FetchError < StandardError; end

    class << self
      # --- URI scheme ---

      # URI for a subject area concept.
      # @param code [String, Integer] e.g. "102"
      # @return [String] e.g. "area-102"
      def area_uri(code)
        "area-#{code}"
      end

      # URI for a section concept.
      # @param code [String] e.g. "103-01"
      # @return [String] e.g. "section-103-01"
      def section_uri(code)
        "section-#{code}"
      end

      # --- Query API (reads from bundled data) ---

      # Return all subject areas with their sections.
      # @return [Array<Hash>] each hash has "code", "title", "sections";
      #   empty when the data holds no "areas" list
      def all
        areas = data["areas"]
        areas.is_a?(Array) ? areas : []
      end

      # Find a single subject area by its numeric code.
      # @param code [String, Integer] e.g. "102" or 102
      # @return [Hash, nil]
      def find_area(code)
        wanted = code.to_s
        all.find { |area| area["code"].to_s == wanted }
      end

      # Return all sections for a given area code.
      # @param code [String, Integer] area code, e.g. "102"
      # @return [Array<Hash>] each hash has "code", "title"; empty when the
      #   area is unknown or has no "sections" entry
      def sections_for(code)
        area = find_area(code)
        (area && area["sections"]) || []
      end

      # Find a single section by its section code.
      # @param section_code [String] e.g. "102-01"
      # @return [Hash, nil]
      def find_section(section_code)
        wanted = section_code.to_s
        all.each do |area|
          found = (area["sections"] || []).find { |section| section["code"] == wanted }
          return found if found
        end
        nil
      end

      # Return the parent area for a given section code.
      # @param section_code [String] e.g. "102-01"
      # @return [Hash, nil]
      def area_for_section(section_code)
        wanted = section_code.to_s
        all.find do |area|
          (area["sections"] || []).any? { |section| section["code"] == wanted }
        end
      end

      # --- Fetching (network, writes to the cache directory) ---

      # Fetch all areas and their sections, resuming from the cache.
      #
      # A complete cached result is returned as is. Otherwise the area list is
      # fetched again, areas already present in the cache keep their cached
      # entry, and sections are fetched for every area not yet marked
      # "fetched". Progress is written to the cache every 10 areas and at the
      # end; progress lines are printed only when stdout is a TTY.
      #
      # @return [Hash] { "areas" => [...] }
      # @raise [FetchError] when the area list itself cannot be fetched
      def fetch
        cached = read_cache(CACHE_FILENAME)
        return cached if cached && complete?(cached)

        cached_areas = cached ? cached["areas"] : []
        fresh_areas = fetch_areas
        puts "Found #{fresh_areas.length} areas (#{cached_areas.length} cached)" if $stdout.tty?

        areas = merge_cached_areas(cached_areas, fresh_areas)

        areas.each_with_index do |area, i|
          next if area["fetched"]

          begin
            area["sections"] = fetch_sections(area["code"])
            area["fetched"] = true
          rescue FetchError => e
            # Leave the area unmarked so a later run retries it.
            area["sections"] ||= []
            warn "IEV: Skipping area #{area["code"]}: #{e.message}"
          end

          puts "[#{i + 1}/#{areas.length}] #{area["code"]}: #{area["title"]} — #{area["sections"].length} sections" if $stdout.tty?

          # Save progress every 10 areas so partial results survive WAF failures
          write_cache(CACHE_FILENAME, { "areas" => areas }) if ((i + 1) % 10).zero?

          sleep FETCH_DELAY unless i == areas.length - 1
        end

        result = { "areas" => areas }
        write_cache(CACHE_FILENAME, result)
        result
      end

      # Fetch the list of areas from AREAS_URL: every link whose href
      # contains "part=<digits>" and whose text is not blank.
      #
      # @return [Array<Hash>] hashes with "code", "title" and an empty
      #   "sections" list, unique by code
      # @raise [FetchError] when the page cannot be fetched
      def fetch_areas
        doc = Nokogiri::HTML(fetch_page_with_retry(AREAS_URL))

        areas = []
        doc.css("a").each do |link|
          code = link["href"].to_s[/part=(\d+)/, 1]
          next unless code

          title = link.text.strip
          next if title.empty?

          areas << { "code" => code, "title" => title, "sections" => [] }
        end

        areas.uniq { |area| area["code"] }
      end

      # Fetch the sections of one area: every table cell whose stripped text
      # has the form "Section <code>: <title>".
      #
      # @param part [String, Integer] area code
      # @return [Array<Hash>] hashes with "code" and "title", unique by code
      # @raise [FetchError] when the page cannot be fetched
      def fetch_sections(part)
        url = format(SECTIONS_URL_TEMPLATE, part: part)
        doc = Nokogiri::HTML(fetch_page_with_retry(url))

        sections = []
        doc.css("td").each do |cell|
          match = cell.text.strip.match(/\ASection\s+([\d-]+):\s*(.+)\z/)
          sections << { "code" => match[1], "title" => match[2].strip } if match
        end

        sections.uniq { |section| section["code"] }
      end

      private

      # Bundled data, loaded once. Falls back to an empty area list when the
      # file is missing or its YAML root is not a Hash.
      def data
        @data ||= begin
          loaded = YAML.safe_load(File.read(DATA_FILE, encoding: "utf-8")) if File.exist?(DATA_FILE)
          loaded.is_a?(Hash) ? loaded : { "areas" => [] }
        end
      end

      # Fresh areas in fresh order; an area already present in the cache is
      # replaced by its cached entry (keeping its sections and "fetched" flag).
      def merge_cached_areas(cached_areas, fresh_areas)
        cached_by_code = cached_areas.each_with_object({}) do |area, index|
          index[area["code"]] = area
        end
        fresh_areas.map { |fresh| cached_by_code[fresh["code"]] || fresh }
      end

      # True when the result holds at least MIN_AREA_COUNT areas and every
      # one of them is marked "fetched".
      def complete?(result)
        areas = result["areas"]
        return false unless areas.is_a?(Array) && areas.length >= MIN_AREA_COUNT

        areas.all? { |area| area["fetched"] == true }
      end

      # Heuristic for a challenge page: too short, or containing one of the
      # known challenge phrases.
      def captcha_page?(html)
        html.length < MIN_PAGE_SIZE ||
          html.include?("Confirm you are human") ||
          html.include?("solve a puzzle") ||
          html.include?("security check before continuing")
      end

      # Fetch a page, retrying with a growing delay while a challenge page is
      # returned. At least one attempt is always made.
      #
      # @param url [String]
      # @param retries [Integer] total number of attempts
      # @return [String] page HTML
      # @raise [FetchError] when the browser returns nothing or the last
      #   attempt still yields a challenge page
      def fetch_page_with_retry(url, retries: MAX_RETRIES)
        # Required here rather than at file level, so the browser code is
        # only loaded once a fetch is actually attempted.
        require "iev/scraper/browser"

        attempts = [retries.to_i, 1].max
        1.upto(attempts) do |attempt|
          html = ScraperBrowser.fetch(url)
          raise FetchError, "Failed to fetch #{url}" unless html
          return html unless captcha_page?(html)
          raise FetchError, "WAF challenge for #{url}" if attempt == attempts

          wait = RETRY_DELAY * attempt
          warn "IEV: WAF challenge for #{url}, retrying in #{wait}s (attempt #{attempt}/#{attempts})"
          sleep wait
        end
      end

      # Read a cached result.
      # @return [Hash, nil] nil when the file is missing, its YAML root is not
      #   a Hash, or it holds no non-empty "areas" list
      def read_cache(filename)
        cache_path = File.join(Iev.config.cache_dir, filename)
        return nil unless File.exist?(cache_path)

        cached = YAML.safe_load(File.read(cache_path, encoding: "utf-8"))
        return nil unless cached.is_a?(Hash)

        areas = cached["areas"]
        return nil unless areas.is_a?(Array) && areas.any?

        cached
      end

      # Write a result as YAML into the cache directory, creating it if needed.
      def write_cache(filename, result)
        cache_path = File.join(Iev.config.cache_dir, filename)
        FileUtils.mkdir_p(File.dirname(cache_path))
        File.write(cache_path, YAML.dump(result), encoding: "utf-8")
      end
    end
  end
end
|