iev 0.4.3 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Iev
4
+ # Immutable value object representing an IEV subject area (e.g. "102").
5
+ #
6
+ # A subject area is the aggregate root for its sections.
7
+ # Navigation: area → sections (direct), section → area (via registry).
8
# Value object for one IEV subject area (e.g. "102") and its sections.
#
# The instance is frozen on construction and keeps its own frozen copies of
# the code, the title and the section list, so mutating the arguments after
# construction cannot change the area. The section objects themselves are
# not frozen here. Equality and hashing are based on the code only.
class SubjectArea
  attr_reader :code, :title, :sections

  # @param code [#to_s] area code, e.g. "103"
  # @param title [#to_s] area title, e.g. "Mathematics - Functions"
  # @param sections [Array<Iev::Section>, nil] child sections; nil is
  #   treated as "no sections"
  def initialize(code:, title:, sections: [])
    # String#to_s returns the receiver itself, so copy before freezing to
    # avoid freezing (or aliasing) the caller's own objects.
    @code = code.to_s.dup.freeze
    @title = title.to_s.dup.freeze
    @sections = Array(sections).dup.freeze
    freeze
  end

  # @return [String] identifier of the form "area-<code>"
  def uri
    "area-#{code}"
  end

  # Look up one of this area's sections by its code.
  #
  # @param section_code [#to_s] e.g. "102-01"
  # @return [Object, nil] the first section whose +code+ matches, or nil
  def section(section_code)
    wanted = section_code.to_s
    sections.find { |candidate| candidate.code == wanted }
  end

  # @return [Hash] string-keyed hash with "code", "title" and the +to_h+ of
  #   every section under "sections"
  def to_h
    {
      "code" => code,
      "title" => title,
      "sections" => sections.map(&:to_h),
    }
  end

  # Two areas are equal when they are of the same class (or a subclass)
  # and share the same code; title and sections are ignored.
  def ==(other)
    other.is_a?(self.class) && code == other.code
  end
  alias_method :eql?, :==

  # Consistent with #==, so areas can be used as Hash keys.
  def hash
    code.hash
  end
end
46
+ end
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Iev
4
+ # Creates ManagedConcept entries for the IEV subject area hierarchy.
5
+ #
6
+ # The hierarchy has two levels:
7
+ # - Area (e.g., "102" = "Mathematics - General concepts")
8
+ # - Section (e.g., "102-01" = "Sets and operations")
9
+ #
10
+ # Linking (all at ManagedConcept#related level):
11
+ # - Each area has "narrower" relations to its sections
12
+ # - Each section has "broader" relation to parent area
13
+ # - Each section gets "narrower" to child concepts (added by Exporter)
14
+ # - Each regular IEV concept gets "broader" to its section
15
+ # (added by Exporter)
16
+ #
17
+ # Classification (separate from hierarchy):
18
+ # - Each concept's ManagedConceptData#domains includes area and
19
+ # section ConceptReferences
20
+ # - Each concept's ConceptData#domain references its section URI
21
+ # - Each section concept's ConceptData#domain references parent area
22
+ module SubjectAreaConcepts
23
+ IEV_SOURCE = "urn:iec:std:iec:60050"
24
+
25
+ class << self
26
# Store one concept per subject area in the collection, each immediately
# followed by the concepts for that area's sections.
#
# @param collection [Glossarist::ManagedConceptCollection] receives every
#   concept through #store
# @return [void]
def add_to(collection)
  Iev.subject_areas.each do |area|
    collection.store(build_area_concept(area))

    area.sections.each do |section|
      collection.store(build_section_concept(section, area))
    end
  end
end
41
+
42
+ private
43
+
44
# Build a ConceptReference of ref_type "domain" pointing at +concept_id+,
# with IEV_SOURCE as its source.
def domain_ref(concept_id)
  attributes = { concept_id: concept_id, source: IEV_SOURCE, ref_type: "domain" }
  Glossarist::ConceptReference.new(**attributes)
end
51
+
52
# Build the ManagedConcept for a subject area.
#
# The area URI is used as concept id, uuid and sole domain reference. The
# area title becomes the "eng" localization, and every section contributes
# one "narrower" relation. +related+ is set to nil when the area has no
# sections.
def build_area_concept(area)
  area_id = area.uri
  concept_data = Glossarist::ManagedConceptData.new(
    id: area_id,
    domains: [domain_ref(area_id)],
  )

  concept = Glossarist::ManagedConcept.new(data: concept_data)
  concept.uuid = area_id
  concept.add_localization(build_localization(area_id, area.title, "eng"))

  concept.related = area.sections.map do |section|
    build_narrower_relation(section.uri)
  end
  concept.related = nil if concept.related.empty?

  concept
end
69
+
70
# Build the ManagedConcept for a section of +area+.
#
# The section URI is the concept id and uuid. The domains list holds the
# parent area first, then the section itself. The "eng" localization carries
# the section title with its domain set to the area URI, and the concept
# gets a single "broader" relation to the area.
def build_section_concept(section, area)
  section_id = section.uri
  domains = [area.uri, section_id].map { |concept_id| domain_ref(concept_id) }

  concept = Glossarist::ManagedConcept.new(
    data: Glossarist::ManagedConceptData.new(id: section_id, domains: domains),
  )
  concept.uuid = section_id

  localized_data = build_concept_data(section_id, section.title, "eng")
  localized_data.domain = area.uri
  concept.add_localization(
    build_localization_from_data(section_id, localized_data),
  )

  concept.related = [build_broader_relation(area.uri)]
  concept
end
93
+
94
# Build ConceptData for +id+ in language +lang_code+ whose only term is
# +title+, as a preferred expression designation.
def build_concept_data(id, title, lang_code)
  preferred_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: title,
    normative_status: "preferred",
  )

  Glossarist::ConceptData.new(
    id: id,
    language_code: lang_code,
    terms: [preferred_term],
  )
end
107
+
108
# Build a valid, published LocalizedConcept for +id+ whose single preferred
# term is +title+ in language +lang_code+.
#
# Delegates the wrapping to build_localization_from_data rather than
# repeating the same assignments here.
def build_localization(id, title, lang_code)
  build_localization_from_data(id, build_concept_data(id, title, lang_code))
end
118
+
119
# Wrap existing concept data in a LocalizedConcept with the given id,
# entry status "valid" and review decision event "published" (the latter
# is written onto the data object).
def build_localization_from_data(id, concept_data)
  Glossarist::LocalizedConcept.new.tap do |localized|
    localized.data = concept_data
    localized.id = id
    localized.entry_status = "valid"
    localized.data.review_decision_event = "published"
  end
end
127
+
128
# Build a "broader" RelatedConcept whose content is +target_uri+ and whose
# ref is a ConceptRef with source "IEV" and that same id.
def build_broader_relation(target_uri)
  target = Glossarist::ConceptRef.new(source: "IEV", id: target_uri)
  Glossarist::RelatedConcept.new(type: "broader", content: target_uri, ref: target)
end
135
+
136
# Build a "narrower" RelatedConcept whose content is +target_uri+ and whose
# ref is a ConceptRef with source "IEV" and that same id.
def build_narrower_relation(target_uri)
  target = Glossarist::ConceptRef.new(source: "IEV", id: target_uri)
  Glossarist::RelatedConcept.new(type: "narrower", content: target_uri, ref: target)
end
143
+ end
144
+ end
145
+ end
@@ -0,0 +1,273 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require "nokogiri"
5
+ require "fileutils"
6
+ require "iev/config"
7
+
8
+ module Iev
9
+ module SubjectAreas
10
+ DATA_FILE = File.expand_path("../../data/subject_areas.yaml", __dir__)
11
+
12
+ AREAS_URL = "https://electropedia.org/iev/iev.nsf/" \
13
+ "6d6bdd8667c378f7c12581fa003d80e7?OpenForm"
14
+ SECTIONS_URL_TEMPLATE = "https://electropedia.org/iev/iev.nsf/" \
15
+ "index?openform&part=%<part>s"
16
+
17
+ MIN_PAGE_SIZE = 15_000
18
+
19
+ FETCH_DELAY = 5
20
+ RETRY_DELAY = 30
21
+ MAX_RETRIES = 2
22
+
23
+ class FetchError < StandardError; end
24
+
25
+ class << self
26
+ # --- URI scheme ---
27
+
28
# Identifier used for a subject area concept.
# @param code [String, Integer] area code, e.g. "102"
# @return [String] the code prefixed with "area-", e.g. "area-102"
def area_uri(code)
  format("area-%s", code)
end
34
+
35
# Identifier used for a section concept.
# @param code [String] section code, e.g. "103-01"
# @return [String] the code prefixed with "section-", e.g. "section-103-01"
def section_uri(code)
  format("section-%s", code)
end
41
+
42
+ # --- Query API (returns typed objects) ---
43
+
44
# Every subject area from the raw data, converted to typed objects.
# The list is built on first use and memoized until reload! clears it.
# @return [Array<SubjectArea>]
def all
  return @typed_areas if @typed_areas

  @typed_areas = raw_data["areas"].map { |entry| build_area(entry) }
end
49
+
50
# Look up a subject area by code through the memoized code index.
# @param code [String, Integer] e.g. "102" or 102 (compared as a String)
# @return [SubjectArea, nil]
def find_area(code)
  lookup_key = code.to_s
  area_index[lookup_key]
end
56
+
57
# Sections belonging to the area with the given code.
# @param code [String, Integer] area code, e.g. "102"
# @return [Array<Section>] empty when the area is unknown or has no
#   section list
def sections_for(code)
  area = find_area(code)
  return [] unless area

  area.sections || []
end
63
+
64
# Look up a section by its code through the memoized section index.
# @param section_code [String] e.g. "102-01" (compared as a String)
# @return [Section, nil]
def find_section(section_code)
  lookup_key = section_code.to_s
  section_index[lookup_key]
end
70
+
71
# Parent area of the section with the given code.
# @param section_code [String] e.g. "102-01"
# @return [SubjectArea, nil] nil when the section is unknown, or when its
#   area code is not in the area index
def area_for_section(section_code)
  section = find_section(section_code)
  return nil unless section

  find_area(section.area_code)
end
78
+
79
+ # --- Navigation from IEV reference ---
80
+
81
# Subject area that an IEV reference belongs to, using the area code that
# IevCode extracts from the reference.
# @param ievref [String] e.g. "103-01-02"
# @return [SubjectArea, nil]
def area_for(ievref)
  find_area(IevCode.new(ievref).area_code)
end
88
+
89
# Section that an IEV reference belongs to, using the section code that
# IevCode extracts from the reference.
# @param ievref [String] e.g. "103-01-02"
# @return [Section, nil] nil when the reference yields no section code or
#   the section is unknown
def section_for(ievref)
  section_code = IevCode.new(ievref).section_code
  return nil unless section_code

  find_section(section_code)
end
96
+
97
# --- Fetching (network; results are written to the cache directory) ---

# Fetch all subject areas and their sections, resuming from cached progress.
#
# Returns the cached data untouched when it is already complete. Otherwise
# the area list is fetched again, and areas not yet marked "fetched" have
# their sections fetched one by one, with FETCH_DELAY seconds between
# requests. Progress is written to the cache at every 10th area position,
# and once more at the end. An area whose sections cannot be fetched is
# skipped with a warning and left without the "fetched" flag.
#
# @return [Hash] { "areas" => [...] } with string-keyed area hashes
# @raise [FetchError] if the area list itself cannot be fetched
def fetch
  cached = read_cache("subject_areas.yaml")
  return cached if cached && complete?(cached)

  cached_areas = cached ? cached["areas"] : []
  fresh_areas = fetch_areas
  puts "Found #{fresh_areas.length} areas (#{cached_areas.length} cached)" if $stdout.tty?

  # Follow the order of the fresh list, but prefer the cached record for a
  # code so sections fetched earlier are kept. Cached areas that no longer
  # appear in the fresh list are dropped.
  cached_by_code = cached_areas.each_with_object({}) do |area, index|
    index[area["code"]] = area
  end
  areas = fresh_areas.map { |fresh| cached_by_code[fresh["code"]] || fresh }

  areas.each_with_index do |area, i|
    next if area["fetched"]

    begin
      area["sections"] = fetch_sections(area["code"])
      area["fetched"] = true
    rescue FetchError => e
      area["sections"] ||= []
      # Report the actual cause: not every FetchError is a WAF challenge.
      warn "IEV: Skipping area #{area["code"]}: #{e.message}"
    end

    puts "[#{i + 1}/#{areas.length}] #{area["code"]}: #{area["title"]} — #{area["sections"].length} sections" if $stdout.tty?

    # Save progress every 10 areas so partial results survive failures
    write_cache("subject_areas.yaml", { "areas" => areas }) if ((i + 1) % 10).zero?

    sleep FETCH_DELAY unless i == areas.length - 1
  end

  result = { "areas" => areas }
  write_cache("subject_areas.yaml", result)
  result
end
139
+
140
# Fetch the areas page and extract one entry per subject area.
#
# An anchor counts as an area link when its href contains "part=<digits>"
# and its stripped text is not empty. The digits become the code and the
# text the title. Only the first entry per code is kept.
#
# @return [Array<Hash>] hashes with "code", "title" and an empty "sections"
def fetch_areas
  document = Nokogiri::HTML(fetch_page_with_retry(AREAS_URL))

  found = document.css("a").each_with_object([]) do |anchor, collected|
    part = anchor["href"].to_s[/part=(\d+)/, 1]
    next unless part

    label = anchor.text.strip
    next if label.empty?

    collected << { "code" => part, "title" => label, "sections" => [] }
  end

  found.uniq { |entry| entry["code"] }
end
160
+
161
# Fetch the index page for one area and extract its sections.
#
# A table cell counts as a section when its stripped text has the form
# "Section <code>: <title>". Only the first entry per code is kept.
#
# @param part [String, Integer] area code substituted into the URL template
# @return [Array<Hash>] hashes with "code" and "title"
def fetch_sections(part)
  page = fetch_page_with_retry(format(SECTIONS_URL_TEMPLATE, part: part))

  rows = Nokogiri::HTML(page).css("td").each_with_object([]) do |cell, collected|
    heading = /\ASection\s+([\d-]+):\s*(.+)\z/.match(cell.text.strip)
    collected << { "code" => heading[1], "title" => heading[2].strip } if heading
  end

  rows.uniq { |row| row["code"] }
end
176
+
177
# Drop every memoized value (typed areas, both indexes and the raw data)
# so the next query rebuilds them.
def reload!
  @typed_areas = @area_index = @section_index = @raw_data = nil
end
184
+
185
+ private
186
+
187
# Convert one raw area hash into a SubjectArea with typed Section children.
# A missing "sections" entry is treated as an empty list, and every section
# is given the area's code as its area_code.
def build_area(hash)
  code = hash["code"]
  raw_sections = hash["sections"] || []

  children = raw_sections.map do |entry|
    Section.new(code: entry["code"], title: entry["title"], area_code: code)
  end

  SubjectArea.new(code: code, title: hash["title"], sections: children)
end
199
+
200
# Parsed contents of DATA_FILE, memoized.
# Falls back to { "areas" => [] } when the file does not exist or parses
# to a falsy value.
def raw_data
  return @raw_data if @raw_data

  fallback = { "areas" => [] }
  @raw_data =
    if File.exist?(DATA_FILE)
      YAML.safe_load(File.read(DATA_FILE, encoding: "utf-8")) || fallback
    else
      fallback
    end
end
210
+
211
# Memoized Hash from area code to SubjectArea, built from #all.
def area_index
  @area_index ||= all.map { |area| [area.code, area] }.to_h
end
214
+
215
# Memoized Hash from section code to Section, covering the sections of
# every area returned by #all.
def section_index
  @section_index ||= all.flat_map(&:sections).map do |section|
    [section.code, section]
  end.to_h
end
220
+
221
# Whether cached data can be used without fetching again: it must list at
# least 99 areas and every one of them must have "fetched" set to true.
def complete?(data)
  entries = data["areas"]
  return false if entries.nil? || entries.length < 99

  entries.all? { |entry| entry["fetched"] == true }
end
227
+
228
# Whether a fetched page looks like a challenge page rather than content:
# it is either shorter than MIN_PAGE_SIZE characters or contains one of the
# known challenge phrases.
def captcha_page?(html)
  return true if html.length < MIN_PAGE_SIZE

  [
    "Confirm you are human",
    "solve a puzzle",
    "security check before continuing",
  ].any? { |phrase| html.include?(phrase) }
end
234
+
235
# Fetch a page through ScraperBrowser, retrying when a challenge page is
# returned.
#
# Up to +retries+ attempts are made in total. After a challenge page the
# method waits RETRY_DELAY * attempt-number seconds before trying again.
# A nil response fails immediately without a retry.
#
# @param url [String]
# @param retries [Integer] total number of attempts; must be at least 1
# @return [String] the page HTML
# @raise [ArgumentError] if +retries+ is less than 1 (the loop would never
#   run and the method would return an Integer instead of HTML)
# @raise [FetchError] if the browser returns nothing, or every attempt hits
#   a challenge page
def fetch_page_with_retry(url, retries: MAX_RETRIES)
  raise ArgumentError, "retries must be at least 1, got #{retries.inspect}" if retries < 1

  # Loaded lazily so the browser dependency is only required when fetching.
  require "iev/scraper/browser"

  retries.times do |attempt|
    html = ScraperBrowser.fetch(url)
    raise FetchError, "Failed to fetch #{url}" unless html
    return html unless captcha_page?(html)

    raise FetchError, "WAF challenge for #{url}" if attempt == retries - 1

    wait = RETRY_DELAY * (attempt + 1)
    warn "IEV: WAF challenge for #{url}, retrying in #{wait}s (attempt #{attempt + 1}/#{retries})"
    sleep wait
  end
end
255
+
256
# Load a YAML file from the configured cache directory.
# @param filename [String] file name inside Iev.config.cache_dir
# @return [Hash, nil] the parsed data, or nil when the file is missing or
#   holds no entries under "areas"
def read_cache(filename)
  path = File.join(Iev.config.cache_dir, filename)
  return nil unless File.exist?(path)

  parsed = YAML.safe_load(File.read(path, encoding: "utf-8"))
  listed = parsed&.dig("areas")
  parsed if listed&.any?
end
265
+
266
# Serialize +d+ as YAML into the configured cache directory, creating the
# target directory first if needed.
# @param filename [String] file name inside Iev.config.cache_dir
# @param d [Object] data to dump
def write_cache(filename, d)
  target = File.join(Iev.config.cache_dir, filename)
  FileUtils.mkdir_p(File.dirname(target))
  File.open(target, "w", encoding: "utf-8") { |io| io.write(YAML.dump(d)) }
end
271
+ end
272
+ end
273
+ end
@@ -54,10 +54,9 @@ module Iev
54
54
  def relation_from_match(match_data)
55
55
  Glossarist::RelatedConcept.new(
56
56
  type: "supersedes",
57
- ref: Glossarist::Citation.new(
57
+ ref: Glossarist::ConceptRef.new(
58
58
  source: "IEV",
59
59
  id: match_data[:ref],
60
- version: match_data[:version],
61
60
  ),
62
61
  )
63
62
  end
@@ -77,6 +77,9 @@ module Iev
77
77
  cd.notes = extract_notes
78
78
  cd.terms = extract_terms
79
79
 
80
+ domain = extract_domain
81
+ cd.domain = domain if domain
82
+
80
83
  sources = extract_authoritative_source
81
84
  cd.sources = sources if sources&.any?
82
85
 
@@ -98,6 +101,22 @@ module Iev
98
101
  @term_language ||= find_value_for("LANGUAGE").to_three_char_code
99
102
  end
100
103
 
104
# Derives the domain (subject area or section) from the IEVREF identifier.
# IEVREF format: "AAA-BB-CC" where AAA = area, AAA-BB = section.
#
# Returns the section URI (e.g. "section-103-01") when the section is
# known, otherwise falls back to the area URI (e.g. "area-103"). Returns
# nil when there is no term id, when it has no area part, or when anything
# raises while deriving the domain (best effort).
def extract_domain
  identifier = term_id
  return nil unless identifier

  area_code, subsection = identifier.split("-")
  # A blank or leading-dash identifier has no area; do not emit "area-".
  return nil if area_code.nil? || area_code.empty?

  section_code = [area_code, subsection].compact.join("-")
  return SubjectAreas.section_uri(section_code) if Iev.find_section(section_code)

  SubjectAreas.area_uri(area_code)
rescue StandardError
  nil
end
119
+
101
120
  # Splits unified definition (from the spreadsheet) into separate
102
121
  # definition, examples, and notes strings (for YAMLs).
103
122
  #
data/lib/iev/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iev
4
- VERSION = "0.4.3"
4
+ VERSION = "0.4.5"
5
5
  end
data/lib/iev.rb CHANGED
@@ -29,11 +29,16 @@ module Iev
29
29
  autoload :DataSource, "iev/data_source"
30
30
  autoload :DbWriter, "iev/db_writer"
31
31
  autoload :Exporter, "iev/exporter"
32
+ autoload :IevCode, "iev/iev_code"
32
33
  autoload :Iso639Code, "iev/iso_639_code"
33
34
  autoload :Profiler, "iev/profiler"
34
35
  autoload :RelatonDb, "iev/relaton_db"
35
36
  autoload :Scraper, "iev/scraper"
37
+ autoload :Section, "iev/section"
36
38
  autoload :SourceParser, "iev/source_parser"
39
+ autoload :SubjectArea, "iev/subject_area"
40
+ autoload :SubjectAreas, "iev/subject_areas"
41
+ autoload :SubjectAreaConcepts, "iev/subject_area_concepts"
37
42
  autoload :SupersessionParser, "iev/supersession_parser"
38
43
  autoload :TermAttrsParser, "iev/term_attrs_parser"
39
44
  autoload :TermBuilder, "iev/term_builder"
@@ -80,4 +85,45 @@ module Iev
80
85
  def self.scrape_concept(code)
81
86
  Scraper.new.fetch_concept(code)
82
87
  end
88
+
89
# All IEV subject areas together with their sections.
# Thin delegator to SubjectAreas.all.
# @return [Array<SubjectArea>]
def self.subject_areas
  SubjectAreas.all
end
94
+
95
# Look up a single subject area by its code.
# Thin delegator to SubjectAreas.find_area.
# @param code [String, Integer] e.g. "102"
# @return [SubjectArea, nil]
def self.find_subject_area(code)
  SubjectAreas.find_area(code)
end
101
+
102
# Look up a single section by its section code.
# Thin delegator to SubjectAreas.find_section.
# @param section_code [String] e.g. "102-01"
# @return [Section, nil]
def self.find_section(section_code)
  SubjectAreas.find_section(section_code)
end
108
+
109
# Sections that belong to the area with the given code.
# Thin delegator to SubjectAreas.sections_for.
# @param code [String, Integer] e.g. "102"
# @return [Array<Section>]
def self.sections_for(code)
  SubjectAreas.sections_for(code)
end
115
+
116
# Parent subject area of the section with the given code.
# Thin delegator to SubjectAreas.area_for_section.
# @param section_code [String] e.g. "102-01"
# @return [SubjectArea, nil]
def self.area_for_section(section_code)
  SubjectAreas.area_for_section(section_code)
end
122
+
123
# Split an IEV code into its structural components.
# Thin delegator to IevCode.parse; the result is whatever that returns.
# @param code [String] e.g. "103-01-02"
# @return [IevCode, nil] nil for blank input, per the upstream note
#   (IevCode.parse is not shown here — confirm against iev/iev_code)
def self.parse_code(code)
  IevCode.parse(code)
end
83
129
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iev
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-03 00:00:00.000000000 Z
11
+ date: 2026-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: creek
@@ -28,16 +28,22 @@ dependencies:
28
28
  name: glossarist
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.6'
31
34
  - - ">="
32
35
  - !ruby/object:Gem::Version
33
- version: 2.3.0
36
+ version: 2.6.7
34
37
  type: :runtime
35
38
  prerelease: false
36
39
  version_requirements: !ruby/object:Gem::Requirement
37
40
  requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '2.6'
38
44
  - - ">="
39
45
  - !ruby/object:Gem::Version
40
- version: 2.3.0
46
+ version: 2.6.7
41
47
  - !ruby/object:Gem::Dependency
42
48
  name: ferrum
43
49
  requirement: !ruby/object:Gem::Requirement
@@ -191,6 +197,7 @@ files:
191
197
  - Rakefile
192
198
  - bin/console
193
199
  - bin/setup
200
+ - data/subject_areas.yaml
194
201
  - exe/iev
195
202
  - iev.gemspec
196
203
  - lib/iev.rb
@@ -205,13 +212,19 @@ files:
205
212
  - lib/iev/data_source.rb
206
213
  - lib/iev/db_writer.rb
207
214
  - lib/iev/exporter.rb
215
+ - lib/iev/iev_code.rb
208
216
  - lib/iev/iso_639_2.yaml
209
217
  - lib/iev/iso_639_code.rb
210
218
  - lib/iev/profiler.rb
211
219
  - lib/iev/relaton_db.rb
212
220
  - lib/iev/scraper.rb
221
+ - lib/iev/scraper/browser.rb
213
222
  - lib/iev/scraper/page_parser.rb
223
+ - lib/iev/section.rb
214
224
  - lib/iev/source_parser.rb
225
+ - lib/iev/subject_area.rb
226
+ - lib/iev/subject_area_concepts.rb
227
+ - lib/iev/subject_areas.rb
215
228
  - lib/iev/supersession_parser.rb
216
229
  - lib/iev/term_attrs_parser.rb
217
230
  - lib/iev/term_builder.rb