iev 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require "nokogiri"
5
+ require "fileutils"
6
+ require "iev/config"
7
+
8
module Iev
  # Lookup and scraping of IEV subject areas and their sections.
  #
  # The query methods read the YAML file at DATA_FILE. The fetch methods
  # scrape electropedia.org and persist their progress to CACHE_FILE inside
  # +Iev.config.cache_dir+ (not to DATA_FILE).
  module SubjectAreas
    # Bundled YAML read by the query API, resolved relative to this file.
    DATA_FILE = File.expand_path("../../data/subject_areas.yaml", __dir__)

    # File name (inside +Iev.config.cache_dir+) used to persist fetch progress.
    CACHE_FILE = "subject_areas.yaml"

    AREAS_URL = "https://electropedia.org/iev/iev.nsf/" \
                "6d6bdd8667c378f7c12581fa003d80e7?OpenForm"
    SECTIONS_URL_TEMPLATE = "https://electropedia.org/iev/iev.nsf/" \
                            "index?openform&part=%<part>s"

    # Pages shorter than this many characters are treated as a challenge page.
    MIN_PAGE_SIZE = 15_000

    # Phrases whose presence marks a page as a challenge page.
    CAPTCHA_MARKERS = [
      "Confirm you are human",
      "solve a puzzle",
      "security check before continuing",
    ].freeze

    # A cache holding fewer areas than this is never considered complete.
    # NOTE(review): presumably the number of areas published when this was
    # written — confirm before changing.
    MIN_AREA_COUNT = 99

    FETCH_DELAY = 5   # seconds slept between two section-page fetches
    RETRY_DELAY = 30  # base seconds slept before a retry (scaled by attempt)
    MAX_RETRIES = 2   # default number of attempts per page (not extra retries)

    # Raised when a page cannot be fetched or keeps returning a challenge page.
    class FetchError < StandardError; end

    class << self
      # --- URI scheme ---

      # URI for a subject area concept.
      # @param code [String, Integer] e.g. "102"
      # @return [String] e.g. "area-102"
      def area_uri(code)
        "area-#{code}"
      end

      # URI for a section concept.
      # @param code [String] e.g. "103-01"
      # @return [String] e.g. "section-103-01"
      def section_uri(code)
        "section-#{code}"
      end

      # --- Query API (reads from bundled data) ---

      # Return all subject areas with their sections.
      # @return [Array<Hash>] each hash has "code", "title", "sections";
      #   empty when the bundled file is missing or has no usable "areas" list
      def all
        data["areas"]
      end

      # Find a single subject area by its numeric code.
      # Codes are compared as strings, so an area whose code was parsed from
      # YAML as an Integer is still found.
      # @param code [String, Integer] e.g. "102" or 102
      # @return [Hash, nil]
      def find_area(code)
        wanted = code.to_s
        all.find { |area| area["code"].to_s == wanted }
      end

      # Return all sections for a given area code.
      # @param code [String, Integer] area code, e.g. "102"
      # @return [Array<Hash>] each hash has "code", "title"; empty when the
      #   area is unknown or carries no sections
      def sections_for(code)
        area = find_area(code)
        area ? sections_of(area) : []
      end

      # Find a single section by its section code.
      # @param section_code [String] e.g. "102-01"
      # @return [Hash, nil] the first matching section across all areas
      def find_section(section_code)
        wanted = section_code.to_s
        all.each do |area|
          match = sections_of(area).find { |s| s["code"] == wanted }
          return match if match
        end
        nil
      end

      # Return the parent area for a given section code.
      # @param section_code [String] e.g. "102-01"
      # @return [Hash, nil]
      def area_for_section(section_code)
        wanted = section_code.to_s
        all.find { |area| sections_of(area).any? { |s| s["code"] == wanted } }
      end

      # --- Fetching (network; writes progress to the cache directory) ---

      # Scrape all areas and their sections, resuming from the cache.
      #
      # A complete cache is returned as is. Otherwise the area list is
      # re-fetched, cached entries are reused for areas that are still listed,
      # and sections are fetched for every area not yet marked "fetched".
      # Progress output goes to stdout only when it is a TTY.
      #
      # @return [Hash] { "areas" => Array<Hash> }
      # @raise [FetchError] when the area index page itself cannot be fetched
      def fetch
        cached = read_cache(CACHE_FILE)
        return cached if cached && complete?(cached)

        known = cached ? cached["areas"] : []
        fresh_areas = fetch_areas
        puts "Found #{fresh_areas.length} areas (#{known.length} cached)" if $stdout.tty?

        # Merge: a cached entry (with its sections) wins over the fresh one;
        # areas that are no longer listed upstream are dropped.
        by_code = known.each_with_object({}) { |a, h| h[a["code"]] = a }
        areas = fresh_areas.map { |fa| by_code.fetch(fa["code"], fa) }

        last_index = areas.length - 1
        areas.each_with_index do |area, i|
          next if area["fetched"]

          begin
            area["sections"] = fetch_sections(area["code"])
            area["fetched"] = true
          rescue FetchError
            # Leave the area unmarked so a later run retries it.
            area["sections"] ||= []
            warn "IEV: Skipping area #{area["code"]} due to WAF"
          end

          if $stdout.tty?
            puts "[#{i + 1}/#{areas.length}] #{area["code"]}: #{area["title"]} — #{area["sections"].length} sections"
          end

          # Save progress every 10 areas so partial results survive WAF failures
          write_cache(CACHE_FILE, { "areas" => areas }) if ((i + 1) % 10).zero?

          sleep FETCH_DELAY unless i == last_index
        end

        result = { "areas" => areas }
        write_cache(CACHE_FILE, result)
        result
      end

      # Scrape the list of subject areas from AREAS_URL.
      # Every link whose href carries a numeric "part=" parameter and whose
      # text is non-empty becomes one area; the first link per code wins.
      # @return [Array<Hash>] hashes with "code", "title" and empty "sections"
      # @raise [FetchError]
      def fetch_areas
        doc = Nokogiri::HTML(fetch_page_with_retry(AREAS_URL))

        areas = []
        doc.css("a").each do |link|
          code = link["href"].to_s[/part=(\d+)/, 1]
          next unless code

          title = link.text.strip
          next if title.empty?

          areas << { "code" => code, "title" => title, "sections" => [] }
        end

        areas.uniq { |a| a["code"] }
      end

      # Scrape the sections of one subject area.
      # A table cell counts as a section when its whole stripped text reads
      # "Section <code>: <title>"; the first cell per code wins.
      # @param part [String, Integer] area code substituted into the URL
      # @return [Array<Hash>] hashes with "code" and "title"
      # @raise [FetchError]
      def fetch_sections(part)
        url = format(SECTIONS_URL_TEMPLATE, part: part)
        doc = Nokogiri::HTML(fetch_page_with_retry(url))

        sections = []
        doc.css("td").each do |td|
          m = td.text.strip.match(/\ASection\s+([\d-]+):\s*(.+)\z/)
          sections << { "code" => m[1], "title" => m[2].strip } if m
        end

        sections.uniq { |s| s["code"] }
      end

      private

      # Memoized bundled data, always shaped as { "areas" => Array }.
      def data
        @data ||= load_bundled_data
      end

      # Read DATA_FILE and normalize it so callers never see a nil area list,
      # even when the file is missing, empty, or not a mapping.
      def load_bundled_data
        return { "areas" => [] } unless File.exist?(DATA_FILE)

        loaded = YAML.safe_load(File.read(DATA_FILE, encoding: "utf-8"))
        areas = loaded["areas"] if loaded.is_a?(Hash)
        { "areas" => areas.is_a?(Array) ? areas : [] }
      end

      # Sections of an area hash, or an empty list when it has none.
      def sections_of(area)
        area["sections"] || []
      end

      # A cache is complete when it lists at least MIN_AREA_COUNT areas and
      # every one of them has "fetched" set to exactly true.
      def complete?(data)
        areas = data["areas"]
        return false unless areas && areas.length >= MIN_AREA_COUNT

        areas.all? { |a| a["fetched"] == true }
      end

      # True when the page is too short or contains a known challenge phrase.
      def captcha_page?(html)
        html.length < MIN_PAGE_SIZE ||
          CAPTCHA_MARKERS.any? { |marker| html.include?(marker) }
      end

      # Fetch a page, retrying with a growing delay while a challenge page is
      # returned.
      # @param url [String]
      # @param retries [Integer] total number of attempts
      # @return [String] the page HTML
      # @raise [FetchError] when the browser returns nothing, when every
      #   attempt hits a challenge page, or when no attempt is allowed
      def fetch_page_with_retry(url, retries: MAX_RETRIES)
        # Required here rather than at file load — presumably so the query
        # API does not need the browser dependency; confirm.
        require "iev/scraper/browser"

        retries.times do |attempt|
          html = ScraperBrowser.fetch(url)
          raise FetchError, "Failed to fetch #{url}" unless html
          return html unless captcha_page?(html)
          break if attempt >= retries - 1 # last attempt: give up without sleeping

          wait = RETRY_DELAY * (attempt + 1)
          warn "IEV: WAF challenge for #{url}, retrying in #{wait}s (attempt #{attempt + 1}/#{retries})"
          sleep wait
        end

        # Reached after the final challenge page, and also when retries < 1
        # (the loop result must never leak out as the "page").
        raise FetchError, "WAF challenge for #{url}"
      end

      # Load a cache file from the configured cache directory.
      # @param filename [String]
      # @return [Hash, nil] the parsed mapping, or nil when the file is
      #   missing, unparsable, not a mapping, or has no areas
      def read_cache(filename)
        cache_path = File.join(Iev.config.cache_dir, filename)
        return nil unless File.exist?(cache_path)

        d = YAML.safe_load(File.read(cache_path, encoding: "utf-8"))
        areas = d["areas"] if d.is_a?(Hash)
        return nil unless areas.is_a?(Array) && areas.any?

        d
      rescue Psych::Exception => e
        # A damaged cache must not block fetching; treat it as a cache miss.
        warn "IEV: Ignoring unreadable cache #{cache_path}: #{e.message}"
        nil
      end

      # Serialize +d+ as YAML into the configured cache directory, creating
      # the directory when needed.
      def write_cache(filename, d)
        cache_path = File.join(Iev.config.cache_dir, filename)
        FileUtils.mkdir_p(File.dirname(cache_path))
        File.write(cache_path, YAML.dump(d), encoding: "utf-8")
      end
    end
  end
end
@@ -77,6 +77,9 @@ module Iev
77
77
  cd.notes = extract_notes
78
78
  cd.terms = extract_terms
79
79
 
80
+ domain = extract_domain
81
+ cd.domain = domain if domain
82
+
80
83
  sources = extract_authoritative_source
81
84
  cd.sources = sources if sources&.any?
82
85
 
@@ -98,6 +101,22 @@ module Iev
98
101
  @term_language ||= find_value_for("LANGUAGE").to_three_char_code
99
102
  end
100
103
 
104
# Derives the domain (subject area section) from the IEVREF identifier.
# IEVREF format: "AAA-BB-CC" where AAA = area, AAA-BB = section.
#
# @return [String, nil] a URI reference to the section concept
#   (e.g. "section-103-01") when that section is known, otherwise a URI
#   reference to the area concept (e.g. "area-103"); nil when there is no
#   identifier, when it has no area part, or when the lookup fails
def extract_domain
  return nil unless term_id

  parts = term_id.split("-")
  area_code = parts.first
  # An empty or leading-hyphen identifier has no area part; returning nil
  # avoids emitting the meaningless reference "area-".
  return nil if area_code.nil? || area_code.empty?

  section_code = parts.first(2).join("-")
  return SubjectAreas.section_uri(section_code) if Iev.find_section(section_code)

  SubjectAreas.area_uri(area_code)
rescue StandardError
  # Best effort: any failure while deriving the domain yields no domain.
  nil
end
119
+
101
120
  # Splits unified definition (from the spreadsheet) into separate
102
121
  # definition, examples, and notes strings (for YAMLs).
103
122
  #
data/lib/iev/utilities.rb CHANGED
@@ -134,7 +134,7 @@ module Iev
134
134
 
135
135
  if href.match?(IEV_CODE_RE)
136
136
  iev_code = href.sub(/\AIEV\s*/, "")
137
- "{{#{inner}, IEV:#{iev_code}}}"
137
+ "{{#{inner}, urn:iec:std:iec:60050-#{iev_code}}}"
138
138
  elsif !href.empty?
139
139
  "#{href}[#{inner}]"
140
140
  else
data/lib/iev/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iev
4
- VERSION = "0.4.2"
4
+ VERSION = "0.4.4"
5
5
  end
data/lib/iev.rb CHANGED
@@ -34,6 +34,8 @@ module Iev
34
34
  autoload :RelatonDb, "iev/relaton_db"
35
35
  autoload :Scraper, "iev/scraper"
36
36
  autoload :SourceParser, "iev/source_parser"
37
+ autoload :SubjectAreas, "iev/subject_areas"
38
+ autoload :SubjectAreaConcepts, "iev/subject_area_concepts"
37
39
  autoload :SupersessionParser, "iev/supersession_parser"
38
40
  autoload :TermAttrsParser, "iev/term_attrs_parser"
39
41
  autoload :TermBuilder, "iev/term_builder"
@@ -80,4 +82,38 @@ module Iev
80
82
  def self.scrape_concept(code)
81
83
  Scraper.new.fetch_concept(code)
82
84
  end
85
+
86
+ # Return all IEV subject areas with their sections (from bundled data); delegates to SubjectAreas.all.
87
+ # @return [Array<Hash>] each hash has "code", "title", "sections"
88
+ def self.subject_areas
89
+ SubjectAreas.all
90
+ end
91
+
92
+ # Find a subject area by code; delegates to SubjectAreas.find_area.
93
+ # @param code [String, Integer] e.g. "102" or 102 (converted with to_s before matching)
94
+ # @return [Hash, nil] the matching area hash, or nil when no area has that code
95
+ def self.find_subject_area(code)
96
+ SubjectAreas.find_area(code)
97
+ end
98
+
99
+ # Find a section by its section code, searching every area; delegates to SubjectAreas.find_section.
100
+ # @param section_code [String] e.g. "102-01"
101
+ # @return [Hash, nil] the first matching section hash, or nil when none matches
102
+ def self.find_section(section_code)
103
+ SubjectAreas.find_section(section_code)
104
+ end
105
+
106
+ # Return sections for a given area code; delegates to SubjectAreas.sections_for.
107
+ # @param code [String, Integer] e.g. "102"
108
+ # @return [Array<Hash>] each hash has "code", "title"; an empty array when no area has that code
109
+ def self.sections_for(code)
110
+ SubjectAreas.sections_for(code)
111
+ end
112
+
113
+ # Return the parent subject area for a given section code; delegates to SubjectAreas.area_for_section.
114
+ # @param section_code [String] e.g. "102-01"
115
+ # @return [Hash, nil] the first area listing that section, or nil when none does
116
+ def self.area_for_section(section_code)
117
+ SubjectAreas.area_for_section(section_code)
118
+ end
83
119
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iev
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-28 00:00:00.000000000 Z
11
+ date: 2026-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: creek
@@ -98,16 +98,22 @@ dependencies:
98
98
  name: relaton
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - "~>"
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: 2.0.0
104
+ - - "<"
102
105
  - !ruby/object:Gem::Version
103
- version: '1.18'
106
+ version: '3'
104
107
  type: :runtime
105
108
  prerelease: false
106
109
  version_requirements: !ruby/object:Gem::Requirement
107
110
  requirements:
108
- - - "~>"
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ version: 2.0.0
114
+ - - "<"
109
115
  - !ruby/object:Gem::Version
110
- version: '1.18'
116
+ version: '3'
111
117
  - !ruby/object:Gem::Dependency
112
118
  name: sequel
113
119
  requirement: !ruby/object:Gem::Requirement
@@ -185,6 +191,7 @@ files:
185
191
  - Rakefile
186
192
  - bin/console
187
193
  - bin/setup
194
+ - data/subject_areas.yaml
188
195
  - exe/iev
189
196
  - iev.gemspec
190
197
  - lib/iev.rb
@@ -204,8 +211,11 @@ files:
204
211
  - lib/iev/profiler.rb
205
212
  - lib/iev/relaton_db.rb
206
213
  - lib/iev/scraper.rb
214
+ - lib/iev/scraper/browser.rb
207
215
  - lib/iev/scraper/page_parser.rb
208
216
  - lib/iev/source_parser.rb
217
+ - lib/iev/subject_area_concepts.rb
218
+ - lib/iev/subject_areas.rb
209
219
  - lib/iev/supersession_parser.rb
210
220
  - lib/iev/term_attrs_parser.rb
211
221
  - lib/iev/term_builder.rb