iev 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/iev.gemspec CHANGED
@@ -27,7 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.add_dependency "nokogiri", "~> 1.19"
28
28
  spec.add_dependency "plurimath"
29
29
  spec.add_dependency "lutaml-model", "~> 0.8.0"
30
- spec.add_dependency "relaton", "~> 1.18"
30
+ spec.add_dependency "relaton", ">= 2.0.0", "< 3"
31
31
  spec.add_dependency "sequel", "~> 5.40"
32
32
  spec.add_dependency "sqlite3", "~> 1.7"
33
33
  spec.add_dependency "thor", "~> 1.0"
@@ -142,6 +142,30 @@ module Iev
142
142
  summary
143
143
  end
144
144
 
145
# CLI command: fetches the IEV subject areas via Iev::SubjectAreas.fetch and
# emits them as YAML, either to the file given by --output or to stdout.
#
# With --refresh, the "subject_areas.yaml" file under Iev.config.cache_dir is
# removed before fetching.
#
# On Iev::SubjectAreas::FetchError the message is reported through `error`
# and the process exits with status 1.
desc "subject_areas", "Fetch IEV subject areas and sections from Electropedia."
option :output, desc: "Output YAML file (default: stdout)", aliases: :o
option :refresh, type: :boolean, default: false,
                 desc: "Force re-fetch even if cached"
def subject_areas
  if options[:refresh]
    # rm_f silently ignores a missing file, so no File.exist? guard is needed
    # (and dropping it avoids a check-then-act window).
    FileUtils.rm_f(File.join(Iev.config.cache_dir, "subject_areas.yaml"))
  end

  yaml = YAML.dump(Iev::SubjectAreas.fetch)
  output_path = options[:output]

  if output_path
    File.write(output_path, yaml, encoding: "utf-8")
    puts "Written to #{output_path}"
  else
    puts yaml
  end
rescue Iev::SubjectAreas::FetchError => e
  error e.message
  exit 1
end
168
+
145
169
  desc "fetch CODE", "Fetch an IEV concept and output YAML to stdout."
146
170
  option :scrape, type: :boolean, default: false,
147
171
  desc: "Scrape from Electropedia instead of using cached data"
data/lib/iev/exporter.rb CHANGED
@@ -28,16 +28,19 @@ module Iev
28
28
  # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
29
29
  # @param only_languages [String, nil] comma-separated language codes
30
30
  # @param fetch_relaton_links [Boolean] fetch source URLs via Relaton
31
+ # @param include_areas [Boolean] create area/section hierarchy concepts
31
32
  # @param on_progress [Proc, nil] callback (current, total) during build
32
33
  def initialize(input_path, output_dir: Dir.pwd,
33
34
  only_concepts: nil, only_languages: nil,
34
35
  fetch_relaton_links: false,
36
+ include_areas: true,
35
37
  on_progress: nil)
36
38
  @input_path = Pathname.new(input_path)
37
39
  validate_input!
38
40
 
39
41
  @output_dir = Pathname.new(output_dir)
40
42
  @fetch_relaton_links = fetch_relaton_links
43
+ @include_areas = include_areas
41
44
  @on_progress = on_progress
42
45
  @filters = {
43
46
  only_concepts: only_concepts,
@@ -51,6 +54,7 @@ module Iev
51
54
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
52
55
  dataset = load_dataset
53
56
  collection = build_collection(dataset)
57
+ add_subject_area_concepts(collection) if @include_areas
54
58
  save_collection(collection)
55
59
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
56
60
 
@@ -137,6 +141,7 @@ module Iev
137
141
 
138
142
  concept = concept_index[term.id] ||= begin
139
143
  c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
144
+ c.data.domains = domain_references_for(term.id)
140
145
  collection.store(c)
141
146
  c
142
147
  end
@@ -148,6 +153,10 @@ module Iev
148
153
  SourceParser.relaton_enabled = true
149
154
  end
150
155
 
156
# Hands the collection to SubjectAreaConcepts.add_to so the subject-area
# concepts are added to it. Called from the export run only when
# @include_areas is truthy.
#
# @param collection [Object] the concept collection being exported
def add_subject_area_concepts(collection)
  SubjectAreaConcepts.add_to(collection)
end
159
+
151
160
  def save_collection(collection)
152
161
  concepts_dir = output_dir.expand_path.join("concepts")
153
162
  FileUtils.mkdir_p(concepts_dir)
@@ -157,5 +166,15 @@ module Iev
157
166
  def localized_count(collection)
158
167
  collection.sum { |c| c.localized_concepts.count }
159
168
  end
169
+
170
# Derives the domain references for a concept from its IEV reference.
#
# The reference is split on "-": the first segment is passed to
# SubjectAreas.area_uri and the first two segments (re-joined with "-") to
# SubjectAreas.section_uri. Each resulting id is wrapped with
# Glossarist::ConceptReference.domain.
#
# @param ievref [#to_s] IEV reference, e.g. "102-01-03"
# @return [Array] area reference followed by section reference, or an empty
#   array when the reference has fewer than two dash-separated segments
def domain_references_for(ievref)
  area_code, section_part = ievref.to_s.split("-")
  return [] if section_part.nil?

  domain_ids = [
    SubjectAreas.area_uri(area_code),
    SubjectAreas.section_uri("#{area_code}-#{section_part}"),
  ]
  domain_ids.map { |domain_id| Glossarist::ConceptReference.domain(domain_id) }
end
160
179
  end
161
180
  end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ferrum"
4
+
5
module Iev
  # Shared headless browser utilities for fetching pages behind AWS WAF.
  module ScraperBrowser
    # Chrome User-Agent strings paired with the platform and major version
    # used to build the matching Sec-Ch-Ua client-hint headers. One profile is
    # picked at random per request by .random_headers.
    USER_AGENT_PROFILES = [
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/130.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "130",
      },
      {
        user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Linux"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/129.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "129",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "131",
      },
    ].freeze

    # Fetch a URL using headless Chrome, returning the page HTML.
    # Handles AWS WAF challenge pages by waiting for network idle after
    # navigation so the challenge JavaScript can run.
    #
    # @param url [String] page to load
    # @param browser_opts [Hash] extra Ferrum::Browser options; they override
    #   the defaults, except :browser_options which is merged with the default
    #   Chrome switches
    # @return [String, nil] the page HTML, or nil when the body looks like a
    #   WAF block page or a Ferrum error occurred (a warning is printed in
    #   both cases)
    def self.fetch(url, browser_opts: {})
      browser = Ferrum::Browser.new(**browser_settings(browser_opts))

      browser.headers.set(random_headers)
      browser.go_to(url)
      browser.network.wait_for_idle(timeout: 15)
      html = browser.body

      if html.include?("403 ERROR") || html.include?("Request blocked")
        warn "IEV: AWS WAF blocked request for #{url}"
        return nil
      end

      html
    rescue Ferrum::Error => e
      # Ferrum::BrowserError descends from Ferrum::Error, so rescuing the base
      # class covers it as well.
      warn "IEV: Browser error fetching #{url}: #{e.message}"
      nil
    ensure
      # browser is nil when Ferrum::Browser.new itself raised.
      browser&.quit
    end

    # Build a request-header set from one randomly chosen entry of
    # USER_AGENT_PROFILES, keeping User-Agent and the Sec-Ch-Ua hints
    # consistent with each other.
    #
    # @return [Hash{String => String}] header name => value
    def self.random_headers
      profile = USER_AGENT_PROFILES.sample
      version = profile[:chrome_version]
      sec_ch_ua = "\"Google Chrome\";v=\"#{version}\", " \
                  "\"Chromium\";v=\"#{version}\", " \
                  "\"Not_A Brand\";v=\"24\""

      {
        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                    "image/avif,image/webp,image/apng,*/*;q=0.8," \
                    "application/signed-exchange;v=b3;q=0.7",
        "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
        "Cache-Control" => "no-cache",
        "Pragma" => "no-cache",
        "Sec-Ch-Ua" => sec_ch_ua,
        "Sec-Ch-Ua-Mobile" => "?0",
        "Sec-Ch-Ua-Platform" => profile[:platform],
        "Sec-Fetch-Dest" => "document",
        "Sec-Fetch-Mode" => "navigate",
        "Sec-Fetch-Site" => "cross-site",
        "Sec-Fetch-User" => "?1",
        "Upgrade-Insecure-Requests" => "1",
        "User-Agent" => profile[:user_agent],
      }
    end

    # Combine the default Ferrum::Browser settings with caller overrides.
    # Caller values win for every key. For :browser_options the two hashes are
    # merged instead of replaced, so passing an extra Chrome switch does not
    # drop the default "disable-blink-features" switch.
    #
    # @param browser_opts [Hash] caller-supplied Ferrum::Browser options
    # @return [Hash] options to splat into Ferrum::Browser.new
    def self.browser_settings(browser_opts)
      defaults = {
        headless: "new",
        timeout: 30,
        window_size: [1366, 768],
        browser_options: {
          "disable-blink-features" => "AutomationControlled",
        },
      }

      defaults.merge(browser_opts) do |key, default, override|
        if key == :browser_options && override.is_a?(Hash)
          default.merge(override)
        else
          override
        end
      end
    end
    private_class_method :browser_settings
  end
end
data/lib/iev/scraper.rb CHANGED
@@ -1,59 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "nokogiri"
4
+
3
5
  module Iev
4
- # Scrapes IEV term data from Electropedia (electropedia.org).
5
- #
6
- # Electropedia is behind AWS WAF which requires JavaScript execution,
7
- # so a headless browser (via Ferrum/Chrome) is used to handle the challenge.
8
- #
9
- # @example
10
- # scraper = Iev::Scraper.new
11
- # concept = scraper.fetch_concept("103-01-02")
12
- # doc = scraper.fetch_page("103-01-02")
13
6
  class Scraper
14
7
  BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
15
8
  "display?openform&ievref="
16
9
 
17
- # Pool of realistic Chrome User-Agent strings with matching platform hints.
18
- # Rotated per request to reduce fingerprinting by AWS WAF.
19
- USER_AGENT_PROFILES = [
20
- {
21
- user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
22
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
23
- "Chrome/131.0.0.0 Safari/537.36",
24
- platform: '"macOS"',
25
- chrome_version: "131",
26
- },
27
- {
28
- user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
29
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
30
- "Chrome/130.0.0.0 Safari/537.36",
31
- platform: '"Windows"',
32
- chrome_version: "130",
33
- },
34
- {
35
- user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
36
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
37
- "Chrome/131.0.0.0 Safari/537.36",
38
- platform: '"Linux"',
39
- chrome_version: "131",
40
- },
41
- {
42
- user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
43
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
44
- "Chrome/129.0.0.0 Safari/537.36",
45
- platform: '"macOS"',
46
- chrome_version: "129",
47
- },
48
- {
49
- user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
50
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
51
- "Chrome/131.0.0.0 Safari/537.36",
52
- platform: '"Windows"',
53
- chrome_version: "131",
54
- },
55
- ].freeze
56
-
57
10
  def initialize(browser_opts: {})
58
11
  @browser_opts = browser_opts
59
12
  end
@@ -61,37 +14,10 @@ module Iev
61
14
  # Fetch the Electropedia page HTML for a given IEV code.
62
15
  # Returns a Nokogiri document.
63
16
  def fetch_page(code)
64
- require "ferrum"
65
- require "nokogiri"
66
-
67
- url = "#{BASE_URL}#{code}"
68
- browser = Ferrum::Browser.new(
69
- headless: "new",
70
- timeout: 30,
71
- window_size: [1366, 768],
72
- browser_options: {
73
- "disable-blink-features" => "AutomationControlled",
74
- },
75
- **@browser_opts,
76
- )
77
-
78
- browser.headers.set(random_headers)
79
- browser.go_to(url)
80
- browser.network.wait_for_idle(timeout: 15)
81
- html = browser.body
82
-
83
- # Check if we got a real page or a WAF block
84
- if html.include?("403 ERROR") || html.include?("Request blocked")
85
- warn "IEV Scraper: AWS WAF blocked request for #{code}"
86
- return nil
87
- end
17
+ html = ScraperBrowser.fetch("#{BASE_URL}#{code}", browser_opts: @browser_opts)
18
+ return nil unless html
88
19
 
89
20
  Nokogiri::HTML(html)
90
- rescue Ferrum::Error, Ferrum::BrowserError => e
91
- warn "IEV Scraper error for #{code}: #{e.message}"
92
- nil
93
- ensure
94
- browser&.quit
95
21
  end
96
22
 
97
23
  # Fetch and parse concept data for an IEV code.
@@ -102,34 +28,8 @@ module Iev
102
28
 
103
29
  PageParser.new(doc, code).parse
104
30
  end
105
-
106
- private
107
-
108
- def random_headers
109
- profile = USER_AGENT_PROFILES.sample
110
- sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
111
- "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
112
- "\"Not_A Brand\";v=\"24\""
113
-
114
- {
115
- "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
116
- "image/avif,image/webp,image/apng,*/*;q=0.8," \
117
- "application/signed-exchange;v=b3;q=0.7",
118
- "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
119
- "Cache-Control" => "no-cache",
120
- "Pragma" => "no-cache",
121
- "Sec-Ch-Ua" => sec_ch_ua,
122
- "Sec-Ch-Ua-Mobile" => "?0",
123
- "Sec-Ch-Ua-Platform" => profile[:platform],
124
- "Sec-Fetch-Dest" => "document",
125
- "Sec-Fetch-Mode" => "navigate",
126
- "Sec-Fetch-Site" => "cross-site",
127
- "Sec-Fetch-User" => "?1",
128
- "Upgrade-Insecure-Requests" => "1",
129
- "User-Agent" => profile[:user_agent],
130
- }
131
- end
132
31
  end
133
32
  end
134
33
 
34
+ require_relative "scraper/browser"
135
35
  require_relative "scraper/page_parser"
@@ -95,7 +95,7 @@ module Iev
95
95
  origin: origin,
96
96
  modification: relationship[:modification],
97
97
  )
98
- rescue ::RelatonBib::RequestError, Socket::ResolutionError, SocketError => e
98
+ rescue Relaton::RequestError, Socket::ResolutionError, SocketError => e
99
99
  warn e.message
100
100
  end
101
101
 
@@ -356,8 +356,11 @@ module Iev
356
356
  return nil unless self.class.relaton_enabled
357
357
  return nil unless defined?(RelatonDb)
358
358
 
359
- RelatonDb.instance.fetch(ref)&.url
360
- rescue ::RelatonBib::RequestError, Socket::ResolutionError, SocketError => e
359
+ item = RelatonDb.instance.fetch(ref)
360
+ return nil unless item
361
+
362
+ item.source("src")
363
+ rescue Relaton::RequestError, Socket::ResolutionError, SocketError => e
361
364
  warn e.message
362
365
  nil
363
366
  end
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
module Iev
  # Builds ManagedConcept entries for the two-level IEV subject area
  # hierarchy and stores them in a concept collection:
  #
  # - Area (e.g., "102" = "Mathematics - General concepts and linear algebra")
  # - Section (e.g., "102-01" = "Sets and operations")
  #
  # As built here:
  # - an area concept lists itself as its only domain and, when it has
  #   sections, carries one "narrower" relation per section;
  # - a section concept lists its area and itself as domains, and its
  #   concept data carries the area URI as domain plus a "broader" relation
  #   to that area.
  module SubjectAreaConcepts
    class << self
      # Build every area concept and its section concepts from
      # Iev.subject_areas and store them in the collection, each area first
      # and then its sections.
      #
      # @param collection [Glossarist::ManagedConceptCollection]
      # @return [void]
      def add_to(collection)
        Iev.subject_areas.each do |area|
          collection.store(build_area_concept(area))

          sections_of(area).each do |section|
            collection.store(build_section_concept(section, area))
          end
        end
      end

      private

      # Sections listed under an area entry; empty when the key is absent.
      def sections_of(area)
        area["sections"] || []
      end

      # Managed concept for an area, localized in English, with "narrower"
      # relations to its sections when there are any.
      def build_area_concept(area)
        uri = SubjectAreas.area_uri(area["code"])

        managed = Glossarist::ManagedConcept.new(
          data: Glossarist::ManagedConceptData.new(
            id: uri,
            domains: [Glossarist::ConceptReference.domain(uri)],
          ),
        )

        managed.add_localization(build_localization(uri, area["title"], "eng"))

        narrower_refs = sections_of(area).map do |section|
          build_narrower_ref(section["code"])
        end
        managed.related = narrower_refs unless narrower_refs.empty?

        managed
      end

      # Managed concept for a section, localized in English; its concept data
      # points at the parent area as domain and as "broader" relation.
      def build_section_concept(section, area)
        uri = SubjectAreas.section_uri(section["code"])

        managed = Glossarist::ManagedConcept.new(
          data: Glossarist::ManagedConceptData.new(
            id: uri,
            domains: [
              Glossarist::ConceptReference.domain(SubjectAreas.area_uri(area["code"])),
              Glossarist::ConceptReference.domain(uri),
            ],
          ),
        )

        concept_data = build_concept_data(uri, section["title"], "eng")
        concept_data.domain = SubjectAreas.area_uri(area["code"])
        concept_data.related = [build_broader_ref(area["code"])]

        managed.add_localization(build_localization_from_data(uri, concept_data))
        managed
      end

      # Concept data holding a single preferred expression designation.
      def build_concept_data(id, title, lang_code)
        preferred_term = Glossarist::Designation::Expression.new(
          type: "expression",
          designation: title,
          normative_status: "preferred",
        )

        Glossarist::ConceptData.new(
          id: id,
          language_code: lang_code,
          terms: [preferred_term],
        )
      end

      # Localized concept built from a title: creates the concept data and
      # wraps it.
      def build_localization(id, title, lang_code)
        build_localization_from_data(id, build_concept_data(id, title, lang_code))
      end

      # Localized concept wrapping existing concept data, marked as a valid
      # entry with a "published" review decision event.
      def build_localization_from_data(id, concept_data)
        Glossarist::LocalizedConcept.new.tap do |localized|
          localized.data = concept_data
          localized.id = id
          localized.entry_status = "valid"
          localized.data.review_decision_event = "published"
        end
      end

      # "broader" relation pointing at the URI of the given area code.
      def build_broader_ref(area_code)
        Glossarist::RelatedConcept.new(
          type: "broader",
          content: SubjectAreas.area_uri(area_code),
        )
      end

      # "narrower" relation pointing at the URI of the given section code.
      def build_narrower_ref(section_code)
        Glossarist::RelatedConcept.new(
          type: "narrower",
          content: SubjectAreas.section_uri(section_code),
        )
      end
    end
  end
end