iev 0.4.2 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +3 -0
- data/Gemfile +4 -2
- data/README.adoc +34 -0
- data/data/subject_areas.yaml +1920 -0
- data/iev.gemspec +1 -1
- data/lib/iev/cli/command.rb +24 -0
- data/lib/iev/exporter.rb +19 -0
- data/lib/iev/scraper/browser.rb +102 -0
- data/lib/iev/scraper.rb +5 -105
- data/lib/iev/source_parser.rb +6 -3
- data/lib/iev/subject_area_concepts.rb +123 -0
- data/lib/iev/subject_areas.rb +232 -0
- data/lib/iev/term_builder.rb +19 -0
- data/lib/iev/utilities.rb +1 -1
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +36 -0
- metadata +16 -6
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "yaml"
|
|
4
|
+
require "nokogiri"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require "iev/config"
|
|
7
|
+
|
|
8
|
+
module Iev
  # Catalogue of IEV subject areas and their sections.
  #
  # The module has two independent halves:
  #
  # * the query API (+all+, +find_area+, +sections_for+, +find_section+,
  #   +area_for_section+) reads the YAML file at DATA_FILE and memoizes it;
  # * the fetch API (+fetch+, +fetch_areas+, +fetch_sections+) scrapes
  #   Electropedia and stores its progress in CACHE_FILE under
  #   +Iev.config.cache_dir+. It never writes to DATA_FILE.
  module SubjectAreas
    # YAML file the query API reads from.
    DATA_FILE = File.expand_path("../../data/subject_areas.yaml", __dir__)

    # Index page listing every subject area.
    AREAS_URL = "https://electropedia.org/iev/iev.nsf/" \
                "6d6bdd8667c378f7c12581fa003d80e7?OpenForm"
    # Per-area page listing its sections; +part+ is the area code.
    SECTIONS_URL_TEMPLATE = "https://electropedia.org/iev/iev.nsf/" \
                            "index?openform&part=%<part>s"

    # Pages shorter than this many characters are treated as a challenge page.
    MIN_PAGE_SIZE = 15_000
    # Phrases whose presence marks a page as a challenge page.
    CAPTCHA_MARKERS = [
      "Confirm you are human",
      "solve a puzzle",
      "security check before continuing",
    ].freeze

    # Seconds to sleep between two section-page fetches.
    FETCH_DELAY = 5
    # Base back-off in seconds; multiplied by the 1-based attempt number.
    RETRY_DELAY = 30
    # Default number of attempts (total tries, not additional retries).
    MAX_RETRIES = 2

    # File name of the fetch cache inside +Iev.config.cache_dir+.
    CACHE_FILE = "subject_areas.yaml"
    # A cached result with fewer areas than this is never considered complete.
    MIN_AREA_COUNT = 99

    # Raised when a page cannot be fetched or only a challenge page is served.
    class FetchError < StandardError; end

    class << self
      # --- URI scheme ---

      # URI for a subject area concept.
      # @param code [String, Integer] e.g. "102"
      # @return [String] e.g. "area-102"
      def area_uri(code)
        "area-#{code}"
      end

      # URI for a section concept.
      # @param code [String] e.g. "103-01"
      # @return [String] e.g. "section-103-01"
      def section_uri(code)
        "section-#{code}"
      end

      # --- Query API (reads from bundled data) ---

      # Return all subject areas with their sections.
      # @return [Array<Hash>] each hash has "code", "title", "sections";
      #   empty when the data file is missing or has no "areas" entry
      def all
        data["areas"] || []
      end

      # Find a single subject area by its numeric code.
      # @param code [String, Integer] e.g. "102" or 102
      # @return [Hash, nil]
      def find_area(code)
        wanted = code.to_s
        all.find { |area| area["code"] == wanted }
      end

      # Return all sections for a given area code.
      # @param code [String, Integer] area code, e.g. "102"
      # @return [Array<Hash>] each hash has "code", "title"; empty when the
      #   area is unknown or has no sections recorded
      def sections_for(code)
        area = find_area(code)
        (area && area["sections"]) || []
      end

      # Find a single section by its section code.
      # @param section_code [String] e.g. "102-01"
      # @return [Hash, nil]
      def find_section(section_code)
        wanted = section_code.to_s
        all.each do |area|
          match = area["sections"]&.find { |section| section["code"] == wanted }
          return match if match
        end
        nil
      end

      # Return the parent area for a given section code.
      # @param section_code [String] e.g. "102-01"
      # @return [Hash, nil]
      def area_for_section(section_code)
        wanted = section_code.to_s
        all.find do |area|
          area["sections"]&.any? { |section| section["code"] == wanted }
        end
      end

      # --- Fetching (network, writes to the cache directory) ---

      # Scrape all areas and their sections, resuming from the cache.
      #
      # A complete cache is returned as is. Otherwise the area list is
      # re-fetched and merged with the cached entries, and sections are
      # fetched for every area not yet marked "fetched". Areas whose section
      # page cannot be fetched are kept unmarked so a later run retries them.
      #
      # @return [Hash] <tt>{ "areas" => Array<Hash> }</tt>
      # @raise [FetchError] when the area index page itself cannot be fetched
      def fetch
        cached = read_cache(CACHE_FILE)
        return cached if cached && complete?(cached)

        cached_areas = cached ? cached["areas"] : []
        fresh_areas = fetch_areas
        puts "Found #{fresh_areas.length} areas (#{cached_areas.length} cached)" if $stdout.tty?

        areas = merge_areas(cached_areas, fresh_areas)
        last_index = areas.length - 1

        areas.each_with_index do |area, i|
          next if area["fetched"]

          begin
            area["sections"] = fetch_sections(area["code"])
            area["fetched"] = true
          rescue FetchError
            area["sections"] ||= []
            warn "IEV: Skipping area #{area["code"]} due to WAF"
          end

          if $stdout.tty?
            puts "[#{i + 1}/#{areas.length}] #{area["code"]}: #{area["title"]} — " \
                 "#{area["sections"].length} sections"
          end

          # Save progress every 10 areas so partial results survive WAF failures
          write_cache(CACHE_FILE, { "areas" => areas }) if ((i + 1) % 10).zero?

          sleep FETCH_DELAY unless i == last_index
        end

        result = { "areas" => areas }
        write_cache(CACHE_FILE, result)
        result
      end

      # Fetch the area index page and extract one entry per area link.
      # @return [Array<Hash>] hashes with "code", "title" and an empty
      #   "sections" array, de-duplicated by code (first occurrence wins)
      # @raise [FetchError] when the page cannot be fetched
      def fetch_areas
        doc = Nokogiri::HTML(fetch_page_with_retry(AREAS_URL))

        areas = doc.css("a").each_with_object([]) do |link, found|
          # Only links carrying a numeric "part=" parameter denote an area.
          code = link["href"].to_s[/part=(\d+)/, 1]
          next unless code

          title = link.text.strip
          next if title.empty?

          found << { "code" => code, "title" => title, "sections" => [] }
        end

        areas.uniq { |area| area["code"] }
      end

      # Fetch the page of one area and extract its sections.
      # @param part [String, Integer] area code substituted into the URL
      # @return [Array<Hash>] hashes with "code" and "title", de-duplicated
      #   by code (first occurrence wins)
      # @raise [FetchError] when the page cannot be fetched
      def fetch_sections(part)
        url = format(SECTIONS_URL_TEMPLATE, part: part)
        doc = Nokogiri::HTML(fetch_page_with_retry(url))

        sections = doc.css("td").each_with_object([]) do |td, found|
          # A section cell reads "Section <code>: <title>" and nothing else.
          m = td.text.strip.match(/\ASection\s+([\d-]+):\s*(.+)\z/)
          found << { "code" => m[1], "title" => m[2].strip } if m
        end

        sections.uniq { |section| section["code"] }
      end

      private

      # Memoized contents of DATA_FILE.
      # @return [Hash]
      def data
        @data ||= load_bundled_data
      end

      # Read DATA_FILE, falling back to an empty catalogue when the file is
      # missing, empty, or does not contain a mapping.
      # @return [Hash]
      def load_bundled_data
        return { "areas" => [] } unless File.exist?(DATA_FILE)

        loaded = YAML.safe_load(File.read(DATA_FILE, encoding: "utf-8"))
        loaded.is_a?(Hash) ? loaded : { "areas" => [] }
      end

      # Whether a cached result can be returned without touching the network:
      # it must list at least MIN_AREA_COUNT areas, all marked "fetched".
      # @param data [Hash]
      # @return [Boolean]
      def complete?(data)
        areas = data["areas"]
        return false unless areas.is_a?(Array) && areas.length >= MIN_AREA_COUNT

        areas.all? { |area| area["fetched"] == true }
      end

      # Combine the freshly scraped area list with cached entries.
      #
      # The result follows the order of +fresh_areas+; an area already present
      # in the cache (matched by "code") keeps its cached hash, including its
      # sections and "fetched" flag. When the index page yielded no areas at
      # all, the cached list is kept so earlier progress is not overwritten
      # with an empty result.
      #
      # @param cached_areas [Array<Hash>]
      # @param fresh_areas [Array<Hash>]
      # @return [Array<Hash>]
      def merge_areas(cached_areas, fresh_areas)
        return cached_areas if fresh_areas.empty?

        by_code = cached_areas.each_with_object({}) { |area, index| index[area["code"]] = area }
        fresh_areas.map { |fresh| by_code.fetch(fresh["code"], fresh) }
      end

      # Whether +html+ looks like a challenge page rather than real content:
      # either suspiciously short or containing one of CAPTCHA_MARKERS.
      # @param html [String]
      # @return [Boolean]
      def captcha_page?(html)
        html.length < MIN_PAGE_SIZE ||
          CAPTCHA_MARKERS.any? { |marker| html.include?(marker) }
      end

      # Fetch +url+, retrying with a linearly growing delay while a challenge
      # page is served.
      #
      # @param url [String]
      # @param retries [Integer] total number of attempts
      # @return [String] page HTML
      # @raise [FetchError] when the browser returns nothing, when every
      #   attempt hits a challenge page, or when +retries+ allows no attempt
      def fetch_page_with_retry(url, retries: MAX_RETRIES)
        # Required here rather than at file level, presumably so the browser
        # code is only loaded when something is actually fetched.
        require "iev/scraper/browser"

        retries.times do |attempt|
          html = ScraperBrowser.fetch(url)
          raise FetchError, "Failed to fetch #{url}" unless html
          return html unless captcha_page?(html)
          raise FetchError, "WAF challenge for #{url}" if attempt >= retries - 1

          wait = RETRY_DELAY * (attempt + 1)
          warn "IEV: WAF challenge for #{url}, retrying in #{wait}s (attempt #{attempt + 1}/#{retries})"
          sleep wait
        end

        # Only reached when +retries+ is zero or negative: Integer#times then
        # yields nothing and would otherwise hand the Integer back as "HTML".
        raise FetchError, "No fetch attempts made for #{url} (retries: #{retries})"
      end

      # Load a cache file from +Iev.config.cache_dir+.
      #
      # @param filename [String]
      # @return [Hash, nil] the cached hash, or nil when the file is missing,
      #   unparsable, not a mapping, or has no areas
      def read_cache(filename)
        cache_path = File.join(Iev.config.cache_dir, filename)
        return nil unless File.exist?(cache_path)

        cached = YAML.safe_load(File.read(cache_path, encoding: "utf-8"))
        return nil unless cached.is_a?(Hash)

        areas = cached["areas"]
        return nil unless areas.is_a?(Array) && areas.any?

        cached
      rescue Psych::Exception => e
        # An unreadable cache (e.g. a truncated write) is a cache miss,
        # not a reason to abort the fetch.
        warn "IEV: Ignoring unreadable cache #{cache_path}: #{e.message}"
        nil
      end

      # Serialize +d+ as YAML into +Iev.config.cache_dir+, creating the
      # directory when needed.
      # @param filename [String]
      # @param d [Hash]
      def write_cache(filename, d)
        cache_path = File.join(Iev.config.cache_dir, filename)
        FileUtils.mkdir_p(File.dirname(cache_path))
        File.write(cache_path, YAML.dump(d), encoding: "utf-8")
      end
    end
  end
end
|
data/lib/iev/term_builder.rb
CHANGED
|
@@ -77,6 +77,9 @@ module Iev
|
|
|
77
77
|
cd.notes = extract_notes
|
|
78
78
|
cd.terms = extract_terms
|
|
79
79
|
|
|
80
|
+
domain = extract_domain
|
|
81
|
+
cd.domain = domain if domain
|
|
82
|
+
|
|
80
83
|
sources = extract_authoritative_source
|
|
81
84
|
cd.sources = sources if sources&.any?
|
|
82
85
|
|
|
@@ -98,6 +101,22 @@ module Iev
|
|
|
98
101
|
@term_language ||= find_value_for("LANGUAGE").to_three_char_code
|
|
99
102
|
end
|
|
100
103
|
|
|
104
|
+
# Derives the domain (subject area section) from the IEVREF identifier.
# IEVREF format: "AAA-BB-CC" where AAA = area, AAA-BB = section.
#
# Returns a URI reference to the section concept (e.g. "section-103-01")
# when the section is known to Iev.find_section, otherwise a URI reference
# to the area concept (e.g. "area-103"). Returns nil when there is no
# term_id, when it has no area part (blank, or starting with "-"), or when
# the lookup raises.
def extract_domain
  return nil unless term_id

  parts = term_id.split("-")
  area_code = parts.first
  # Without an area part the result would be the meaningless URI "area-".
  return nil if area_code.nil? || area_code.empty?

  section_code = parts.first(2).join("-")
  return SubjectAreas.section_uri(section_code) if Iev.find_section(section_code)

  SubjectAreas.area_uri(area_code)
rescue StandardError
  # Best effort: a failed lookup leaves the concept without a domain
  # rather than aborting the build.
  nil
end
|
|
119
|
+
|
|
101
120
|
# Splits unified definition (from the spreadsheet) into separate
|
|
102
121
|
# definition, examples, and notes strings (for YAMLs).
|
|
103
122
|
#
|
data/lib/iev/utilities.rb
CHANGED
data/lib/iev/version.rb
CHANGED
data/lib/iev.rb
CHANGED
|
@@ -34,6 +34,8 @@ module Iev
|
|
|
34
34
|
autoload :RelatonDb, "iev/relaton_db"
|
|
35
35
|
autoload :Scraper, "iev/scraper"
|
|
36
36
|
autoload :SourceParser, "iev/source_parser"
|
|
37
|
+
autoload :SubjectAreas, "iev/subject_areas"
|
|
38
|
+
autoload :SubjectAreaConcepts, "iev/subject_area_concepts"
|
|
37
39
|
autoload :SupersessionParser, "iev/supersession_parser"
|
|
38
40
|
autoload :TermAttrsParser, "iev/term_attrs_parser"
|
|
39
41
|
autoload :TermBuilder, "iev/term_builder"
|
|
@@ -80,4 +82,38 @@ module Iev
|
|
|
80
82
|
def self.scrape_concept(code)
|
|
81
83
|
Scraper.new.fetch_concept(code)
|
|
82
84
|
end
|
|
85
|
+
|
|
86
|
+
# List every IEV subject area together with its sections.
# Delegates to SubjectAreas.all, which reads the bundled data.
# @return [Array<Hash>]
def self.subject_areas
  SubjectAreas.all
end
|
|
91
|
+
|
|
92
|
+
# Look up one subject area by its code.
# Delegates to SubjectAreas.find_area.
# @param code [String, Integer] e.g. "102"
# @return [Hash, nil] nil when no area has that code
def self.find_subject_area(code)
  SubjectAreas.find_area(code)
end
|
|
98
|
+
|
|
99
|
+
# Look up one section by its section code.
# Delegates to SubjectAreas.find_section.
# @param section_code [String] e.g. "102-01"
# @return [Hash, nil] nil when no section has that code
def self.find_section(section_code)
  SubjectAreas.find_section(section_code)
end
|
|
105
|
+
|
|
106
|
+
# List the sections belonging to one subject area.
# Delegates to SubjectAreas.sections_for.
# @param code [String, Integer] area code, e.g. "102"
# @return [Array<Hash>]
def self.sections_for(code)
  SubjectAreas.sections_for(code)
end
|
|
112
|
+
|
|
113
|
+
# Find the subject area that contains the given section.
# Delegates to SubjectAreas.area_for_section.
# @param section_code [String] e.g. "102-01"
# @return [Hash, nil] nil when no area lists that section
def self.area_for_section(section_code)
  SubjectAreas.area_for_section(section_code)
end
|
|
83
119
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: iev
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.
|
|
4
|
+
version: 0.4.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: creek
|
|
@@ -98,16 +98,22 @@ dependencies:
|
|
|
98
98
|
name: relaton
|
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
|
100
100
|
requirements:
|
|
101
|
-
- - "
|
|
101
|
+
- - ">="
|
|
102
|
+
- !ruby/object:Gem::Version
|
|
103
|
+
version: 2.0.0
|
|
104
|
+
- - "<"
|
|
102
105
|
- !ruby/object:Gem::Version
|
|
103
|
-
version: '
|
|
106
|
+
version: '3'
|
|
104
107
|
type: :runtime
|
|
105
108
|
prerelease: false
|
|
106
109
|
version_requirements: !ruby/object:Gem::Requirement
|
|
107
110
|
requirements:
|
|
108
|
-
- - "
|
|
111
|
+
- - ">="
|
|
112
|
+
- !ruby/object:Gem::Version
|
|
113
|
+
version: 2.0.0
|
|
114
|
+
- - "<"
|
|
109
115
|
- !ruby/object:Gem::Version
|
|
110
|
-
version: '
|
|
116
|
+
version: '3'
|
|
111
117
|
- !ruby/object:Gem::Dependency
|
|
112
118
|
name: sequel
|
|
113
119
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -185,6 +191,7 @@ files:
|
|
|
185
191
|
- Rakefile
|
|
186
192
|
- bin/console
|
|
187
193
|
- bin/setup
|
|
194
|
+
- data/subject_areas.yaml
|
|
188
195
|
- exe/iev
|
|
189
196
|
- iev.gemspec
|
|
190
197
|
- lib/iev.rb
|
|
@@ -204,8 +211,11 @@ files:
|
|
|
204
211
|
- lib/iev/profiler.rb
|
|
205
212
|
- lib/iev/relaton_db.rb
|
|
206
213
|
- lib/iev/scraper.rb
|
|
214
|
+
- lib/iev/scraper/browser.rb
|
|
207
215
|
- lib/iev/scraper/page_parser.rb
|
|
208
216
|
- lib/iev/source_parser.rb
|
|
217
|
+
- lib/iev/subject_area_concepts.rb
|
|
218
|
+
- lib/iev/subject_areas.rb
|
|
209
219
|
- lib/iev/supersession_parser.rb
|
|
210
220
|
- lib/iev/term_attrs_parser.rb
|
|
211
221
|
- lib/iev/term_builder.rb
|