iev 0.4.4 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +7 -4
- data/.github/workflows/release.yml +2 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +4 -1
- data/.rubocop_todo.yml +98 -21
- data/CLAUDE.md +17 -5
- data/Gemfile +8 -4
- data/README.adoc +395 -10
- data/exe/iev +1 -1
- data/iev.gemspec +3 -2
- data/lib/iev/cli/command.rb +3 -2
- data/lib/iev/cli/command_helper.rb +1 -2
- data/lib/iev/cli/ui.rb +5 -5
- data/lib/iev/config.rb +1 -15
- data/lib/iev/data_source.rb +4 -2
- data/lib/iev/db_writer.rb +1 -0
- data/lib/iev/exporter.rb +182 -10
- data/lib/iev/iev_code.rb +80 -0
- data/lib/iev/iso_639_code.rb +2 -1
- data/lib/iev/relaton_db.rb +1 -1
- data/lib/iev/scraper/browser.rb +90 -88
- data/lib/iev/scraper.rb +5 -4
- data/lib/iev/section.rb +37 -0
- data/lib/iev/source_parser.rb +57 -11
- data/lib/iev/subject_area.rb +46 -0
- data/lib/iev/subject_area_concepts.rb +60 -35
- data/lib/iev/subject_areas.rb +72 -33
- data/lib/iev/supersession_parser.rb +1 -2
- data/lib/iev/term_attrs_parser.rb +1 -1
- data/lib/iev/term_builder.rb +14 -9
- data/lib/iev/utilities.rb +29 -1
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +43 -11
- metadata +26 -22
data/lib/iev/source_parser.rb
CHANGED
|
@@ -12,6 +12,7 @@ module Iev
|
|
|
12
12
|
# SourceParser.new(cell_data_string).parsed_sources
|
|
13
13
|
class SourceParser
|
|
14
14
|
include Utilities
|
|
15
|
+
|
|
15
16
|
using DataConversions
|
|
16
17
|
|
|
17
18
|
# When false, obtain_source_link skips Relaton network calls.
|
|
@@ -79,10 +80,11 @@ module Iev
|
|
|
79
80
|
relationship = extract_source_relationship(raw_ref)
|
|
80
81
|
clean_ref = normalize_ref_string(raw_ref)
|
|
81
82
|
source_ref = extract_source_ref(clean_ref)
|
|
83
|
+
ref_source, ref_id = split_ref(source_ref)
|
|
82
84
|
clause = extract_source_clause(clean_ref)
|
|
83
85
|
|
|
84
86
|
origin = Glossarist::Citation.new(
|
|
85
|
-
ref:
|
|
87
|
+
ref: Glossarist::Citation::Ref.new(source: ref_source, id: ref_id),
|
|
86
88
|
locality: build_locality(clause),
|
|
87
89
|
link: obtain_source_link(source_ref),
|
|
88
90
|
original: Iev::Converter.mathml_to_asciimath(
|
|
@@ -111,13 +113,13 @@ module Iev
|
|
|
111
113
|
# IEC 62313:2009, 3.6, modifié
|
|
112
114
|
|
|
113
115
|
str
|
|
114
|
-
.gsub(
|
|
115
|
-
.gsub(
|
|
116
|
+
.gsub("CEI", "IEC")
|
|
117
|
+
.gsub("Guide IEC", "IEC Guide")
|
|
116
118
|
.gsub(%r{Guide ISO/IEC}, "ISO/IEC Guide")
|
|
117
|
-
.gsub(
|
|
118
|
-
.gsub(
|
|
119
|
-
.gsub(
|
|
120
|
-
.gsub(
|
|
119
|
+
.gsub("VEI", "IEV")
|
|
120
|
+
.gsub("UIT", "ITU")
|
|
121
|
+
.gsub("IUT-R", "ITU-R")
|
|
122
|
+
.gsub("UTI-R", "ITU-R")
|
|
121
123
|
.gsub(/Recomm[ea]ndation ITU-T/, "ITU-T Recommendation")
|
|
122
124
|
.gsub(/ITU-T (\w.\d{3}):(\d{4})/, 'ITU-T Recommendation \1 (\2)')
|
|
123
125
|
.gsub(/ITU-R Rec. (\d+)/, 'ITU-R Recommendation \1')
|
|
@@ -289,17 +291,15 @@ module Iev
|
|
|
289
291
|
].map do |regex, _rule|
|
|
290
292
|
# TODO: Rubocop complains about unused rule -- need to make sure
|
|
291
293
|
# that no one forgot about something.
|
|
292
|
-
res = []
|
|
293
294
|
# puts "str is '#{str}'"
|
|
294
295
|
# puts "regex is '#{regex.to_s}'"
|
|
295
|
-
str.scan(regex).
|
|
296
|
+
str.scan(regex).map do |result|
|
|
296
297
|
# puts "result is #{result.first}"
|
|
297
|
-
|
|
298
|
+
{
|
|
298
299
|
index: $LAST_MATCH_INFO.offset(0)[0],
|
|
299
300
|
clause: result.first.strip,
|
|
300
301
|
}
|
|
301
302
|
end
|
|
302
|
-
res
|
|
303
303
|
# sort by index and also the length of match
|
|
304
304
|
end.flatten.sort_by { |hash| [hash[:index], -hash[:clause].length] }
|
|
305
305
|
|
|
@@ -351,6 +351,52 @@ module Iev
|
|
|
351
351
|
)
|
|
352
352
|
end
|
|
353
353
|
|
|
354
|
+
# Splits a normalized bibliographic reference into [source, id] for
|
|
355
|
+
# structured Citation::Ref construction. The full string is still
|
|
356
|
+
# passed to Relaton for link resolution — only the Citation::Ref
|
|
357
|
+
# model receives the split form.
|
|
358
|
+
#
|
|
359
|
+
# "IEC 62302:2007" → ["IEC", "62302:2007"]
|
|
360
|
+
# "ISO/IEC 2382:2015" → ["ISO/IEC", "2382:2015"]
|
|
361
|
+
# "ISO/TS 14812:2022" → ["ISO/TS", "14812:2022"]
|
|
362
|
+
# "IEC CISPR 16-1:2003" → ["IEC CISPR", "16-1:2003"]
|
|
363
|
+
# "ITU-T Recommendation F.791 (11/2015)" → ["ITU-T Recommendation", "F.791 (11/2015)"]
|
|
364
|
+
# "IEV" → ["IEV", nil]
|
|
365
|
+
# Splits a normalized bibliographic reference into a [source, id] pair.
#
# Patterns are tried from most specific to least specific so that compound
# publishers (e.g. "ISO/IEC/IEEE", "ISO/IEC Guide") are not swallowed by the
# shorter "ISO" / "IEC" prefixes further down.
#
#   "IEC 62302:2007"      → ["IEC", "62302:2007"]
#   "ISO/IEC 2382:2015"   → ["ISO/IEC", "2382:2015"]
#   "IEC CISPR 16-1:2003" → ["IEC CISPR", "16-1:2003"]
#   "IEV"                 → ["IEV", nil]
#   "BIPM SI Brochure"    → ["BIPM", "SI Brochure"]
#
# @param full_ref [String, nil] the reference string to split
# @return [Array(String, String), Array(String, nil)] source and id; when no
#   pattern matches, the whole input is returned as the source with a nil id
def split_ref(full_ref)
  case full_ref
  when /\A(ISO\/IEC\/IEEE)\s+(.+)/
    [$1, $2]
  when /\A(ISO\/IEC\s+Guide)\s+(.+)/
    [$1, $2]
  when /\A(ISO\/IEC)\s+(.+)/
    [$1, $2]
  when /\A(IEC\/IEEE)\s+(.+)/
    [$1, $2]
  when %r{\A((?:ISO|IEC)/(?:PAS|TR|TS))\s+(.+)}
    [$1, $2]
  when /\A(IEC\s+CISPR)\s+(.+)/
    [$1, $2]
  when /\A(ITU-T\s+Recommendation)\s+(.+)/
    [$1, $2]
  when /\A(ITU-R\s+Recommendation)\s+(.+)/
    [$1, $2]
  when /\A(ITU-R)\s+(.+)/
    [$1, $2]
  when /\A((?:ISO|IEC)\s+Guide)\s+(.+)/
    [$1, $2]
  when /\A(ISO|IEC|IAEA)\s+(.+)/
    [$1, $2]
  when /\AIEV\z/
    ["IEV", nil]
  when /\A(JCGM)\s+(VIM)\z/
    [$1, $2]
  when /\AB?BIPM/
    # The previous pattern only matched the doubled-B spelling "BBIPM", so a
    # correctly spelled "BIPM ..." reference fell through to the else branch.
    # Both spellings are accepted here so existing inputs keep working.
    ["BIPM", "SI Brochure"]
  else
    [full_ref, nil]
  end
end
|
|
399
|
+
|
|
354
400
|
# Uses Relaton to obtain link for given source ref.
|
|
355
401
|
def obtain_source_link(ref)
|
|
356
402
|
return nil unless self.class.relaton_enabled
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Immutable value object representing an IEV subject area (e.g. "102").
  #
  # The instance is frozen at construction and holds a frozen copy of the
  # section list, so later changes to the array passed by the caller do not
  # leak into the object. Equality and hashing are based on the code alone.
  class SubjectArea
    attr_reader :code, :title, :sections

    # @param code [#to_s] area code, e.g. "103"
    # @param title [#to_s] area title, e.g. "Mathematics - Functions"
    # @param sections [Array] child sections; each element must respond to
    #   +code+ (used by #section) and +to_h+ (used by #to_h)
    def initialize(code:, title:, sections: [])
      @code = code.to_s
      @title = title.to_s
      # Copy before freezing: freezing the instance alone is shallow, and
      # freezing the caller's own array in place would be a surprising
      # side effect.
      @sections = sections.dup.freeze
      freeze
    end

    # @return [String] identifier of the form "area-<code>"
    def uri
      "area-#{code}"
    end

    # Looks up a child section by its code.
    #
    # @param section_code [#to_s] compared against each section's +code+
    # @return [Object, nil] the first matching section, or nil when none match
    def section(section_code)
      wanted = section_code.to_s
      sections.find { |s| s.code == wanted }
    end

    # @return [Hash] string-keyed hash with "code", "title" and the
    #   +to_h+ form of every section under "sections"
    def to_h
      {
        "code" => code,
        "title" => title,
        "sections" => sections.map(&:to_h),
      }
    end

    # Two areas are equal when they are the same class and share a code;
    # title and sections are not compared.
    def ==(other)
      other.is_a?(self.class) && code == other.code
    end
    alias_method :eql?, :==

    # Consistent with #==, so instances can be used as hash keys.
    def hash
      code.hash
    end
  end
end
|
|
@@ -4,14 +4,21 @@ module Iev
|
|
|
4
4
|
# Creates ManagedConcept entries for the IEV subject area hierarchy.
|
|
5
5
|
#
|
|
6
6
|
# The hierarchy has two levels:
|
|
7
|
-
# - Area (e.g., "102" = "Mathematics - General concepts
|
|
7
|
+
# - Area (e.g., "102" = "Mathematics - General concepts")
|
|
8
8
|
# - Section (e.g., "102-01" = "Sets and operations")
|
|
9
9
|
#
|
|
10
|
-
# Linking:
|
|
11
|
-
# - Each
|
|
12
|
-
# - Each
|
|
13
|
-
# - Each section
|
|
14
|
-
# - Each
|
|
10
|
+
# Linking (all at ManagedConcept#related level):
|
|
11
|
+
# - Each area has "narrower" relations to its sections
|
|
12
|
+
# - Each section has "broader" relation to parent area
|
|
13
|
+
# - Each section gets "narrower" to child concepts (added by Exporter)
|
|
14
|
+
# - Each regular IEV concept gets "broader" to its section
|
|
15
|
+
# (added by Exporter)
|
|
16
|
+
#
|
|
17
|
+
# Classification (separate from hierarchy):
|
|
18
|
+
# - Each concept's ManagedConceptData#domains includes domain and
|
|
19
|
+
# section ConceptReferences (per ConceptReferenceType)
|
|
20
|
+
# - Each section concept's ConceptData#domain references parent area
|
|
21
|
+
# title text (a LocalizedString, not a URI)
|
|
15
22
|
module SubjectAreaConcepts
|
|
16
23
|
class << self
|
|
17
24
|
# Build all area and section concepts and add them to the collection.
|
|
@@ -23,7 +30,7 @@ module Iev
|
|
|
23
30
|
area_mc = build_area_concept(area)
|
|
24
31
|
collection.store(area_mc)
|
|
25
32
|
|
|
26
|
-
|
|
33
|
+
area.sections.each do |section|
|
|
27
34
|
section_mc = build_section_concept(section, area)
|
|
28
35
|
collection.store(section_mc)
|
|
29
36
|
end
|
|
@@ -33,41 +40,52 @@ module Iev
|
|
|
33
40
|
private
|
|
34
41
|
|
|
35
42
|
def build_area_concept(area)
|
|
36
|
-
id =
|
|
43
|
+
id = area.uri
|
|
37
44
|
|
|
38
45
|
mc = Glossarist::ManagedConcept.new(
|
|
39
46
|
data: Glossarist::ManagedConceptData.new(
|
|
40
47
|
id: id,
|
|
41
|
-
domains: [
|
|
48
|
+
domains: [domain_ref(id)],
|
|
49
|
+
tags: [area.title],
|
|
42
50
|
),
|
|
43
51
|
)
|
|
52
|
+
mc.uuid = id
|
|
53
|
+
mc.schema_version = "3"
|
|
44
54
|
|
|
45
|
-
mc.add_localization(
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
mc.related =
|
|
55
|
+
mc.add_localization(
|
|
56
|
+
build_localization(id, build_concept_data(id, area.title, "eng")),
|
|
57
|
+
)
|
|
58
|
+
mc.related = area.sections.map { |s| narrower_relation(s.uri) }
|
|
59
|
+
mc.related = nil if mc.related.empty?
|
|
49
60
|
|
|
50
61
|
mc
|
|
51
62
|
end
|
|
52
63
|
|
|
53
64
|
def build_section_concept(section, area)
|
|
54
|
-
id =
|
|
65
|
+
id = section.uri
|
|
55
66
|
|
|
56
67
|
mc = Glossarist::ManagedConcept.new(
|
|
57
68
|
data: Glossarist::ManagedConceptData.new(
|
|
58
69
|
id: id,
|
|
59
70
|
domains: [
|
|
60
|
-
|
|
61
|
-
|
|
71
|
+
domain_ref(area.uri),
|
|
72
|
+
section_ref(id),
|
|
62
73
|
],
|
|
74
|
+
tags: [area.title, section.title],
|
|
63
75
|
),
|
|
64
76
|
)
|
|
77
|
+
mc.uuid = id
|
|
78
|
+
mc.schema_version = "3"
|
|
79
|
+
|
|
80
|
+
cd = build_concept_data(id, section.title, "eng")
|
|
81
|
+
# ConceptData#domain is a LocalizedString — use the area title text,
|
|
82
|
+
# not a URI. The structural relationship is expressed via domains[]
|
|
83
|
+
# and related[].
|
|
84
|
+
cd.domain = area.title
|
|
65
85
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
cd.related = [build_broader_ref(area["code"])]
|
|
86
|
+
mc.add_localization(build_localization(id, cd))
|
|
87
|
+
mc.related = [broader_relation(area.uri)]
|
|
69
88
|
|
|
70
|
-
mc.add_localization(build_localization_from_data(id, cd))
|
|
71
89
|
mc
|
|
72
90
|
end
|
|
73
91
|
|
|
@@ -85,37 +103,44 @@ module Iev
|
|
|
85
103
|
)
|
|
86
104
|
end
|
|
87
105
|
|
|
88
|
-
def build_localization(id,
|
|
89
|
-
cd = build_concept_data(id, title, lang_code)
|
|
90
|
-
|
|
106
|
+
def build_localization(id, concept_data)
|
|
91
107
|
l10n = Glossarist::LocalizedConcept.new
|
|
92
|
-
l10n.data =
|
|
108
|
+
l10n.data = concept_data
|
|
93
109
|
l10n.id = id
|
|
94
110
|
l10n.entry_status = "valid"
|
|
95
111
|
l10n.data.review_decision_event = "published"
|
|
96
112
|
l10n
|
|
97
113
|
end
|
|
98
114
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
115
|
+
# --- ConceptReference factory methods ---
|
|
116
|
+
|
|
117
|
+
def domain_ref(concept_id)
|
|
118
|
+
ref = Glossarist::ConceptReference.domain(concept_id)
|
|
119
|
+
ref.source = IEV_SOURCE
|
|
120
|
+
ref
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
def section_ref(concept_id)
|
|
124
|
+
ref = Glossarist::ConceptReference.section(concept_id)
|
|
125
|
+
ref.source = IEV_SOURCE
|
|
126
|
+
ref
|
|
106
127
|
end
|
|
107
128
|
|
|
108
|
-
|
|
129
|
+
# --- RelatedConcept factory methods ---
|
|
130
|
+
|
|
131
|
+
def broader_relation(target_uri)
|
|
109
132
|
Glossarist::RelatedConcept.new(
|
|
110
133
|
type: "broader",
|
|
111
|
-
content:
|
|
134
|
+
content: target_uri,
|
|
135
|
+
ref: Glossarist::ConceptRef.new(source: "IEV", id: target_uri),
|
|
112
136
|
)
|
|
113
137
|
end
|
|
114
138
|
|
|
115
|
-
def
|
|
139
|
+
def narrower_relation(target_uri)
|
|
116
140
|
Glossarist::RelatedConcept.new(
|
|
117
141
|
type: "narrower",
|
|
118
|
-
content:
|
|
142
|
+
content: target_uri,
|
|
143
|
+
ref: Glossarist::ConceptRef.new(source: "IEV", id: target_uri),
|
|
119
144
|
)
|
|
120
145
|
end
|
|
121
146
|
end
|
data/lib/iev/subject_areas.rb
CHANGED
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
require "yaml"
|
|
4
4
|
require "nokogiri"
|
|
5
5
|
require "fileutils"
|
|
6
|
-
require "iev/config"
|
|
7
6
|
|
|
8
7
|
module Iev
|
|
9
8
|
module SubjectAreas
|
|
@@ -39,49 +38,59 @@ module Iev
|
|
|
39
38
|
"section-#{code}"
|
|
40
39
|
end
|
|
41
40
|
|
|
42
|
-
# --- Query API (
|
|
41
|
+
# --- Query API (returns typed objects) ---
|
|
43
42
|
|
|
44
43
|
# Return all subject areas with their sections.
|
|
45
|
-
# @return [Array<
|
|
44
|
+
# @return [Array<SubjectArea>]
|
|
46
45
|
def all
|
|
47
|
-
|
|
46
|
+
@all ||= raw_data["areas"].map { |h| build_area(h) }
|
|
48
47
|
end
|
|
49
48
|
|
|
50
|
-
# Find a single subject area by its numeric code.
|
|
49
|
+
# Find a single subject area by its numeric code. O(1) indexed.
|
|
51
50
|
# @param code [String, Integer] e.g. "102" or 102
|
|
52
|
-
# @return [
|
|
51
|
+
# @return [SubjectArea, nil]
|
|
53
52
|
def find_area(code)
|
|
54
|
-
|
|
53
|
+
area_index[code.to_s]
|
|
55
54
|
end
|
|
56
55
|
|
|
57
56
|
# Return all sections for a given area code.
|
|
58
57
|
# @param code [String, Integer] area code, e.g. "102"
|
|
59
|
-
# @return [Array<
|
|
58
|
+
# @return [Array<Section>]
|
|
60
59
|
def sections_for(code)
|
|
61
|
-
|
|
62
|
-
area ? area["sections"] : []
|
|
60
|
+
find_area(code)&.sections || []
|
|
63
61
|
end
|
|
64
62
|
|
|
65
|
-
# Find a single section by its section code.
|
|
63
|
+
# Find a single section by its section code. O(1) indexed.
|
|
66
64
|
# @param section_code [String] e.g. "102-01"
|
|
67
|
-
# @return [
|
|
65
|
+
# @return [Section, nil]
|
|
68
66
|
def find_section(section_code)
|
|
69
|
-
|
|
70
|
-
all.each do |area|
|
|
71
|
-
found = area["sections"]&.find { |s| s["code"] == sc }
|
|
72
|
-
return found if found
|
|
73
|
-
end
|
|
74
|
-
nil
|
|
67
|
+
section_index[section_code.to_s]
|
|
75
68
|
end
|
|
76
69
|
|
|
77
70
|
# Return the parent area for a given section code.
|
|
78
71
|
# @param section_code [String] e.g. "102-01"
|
|
79
|
-
# @return [
|
|
72
|
+
# @return [SubjectArea, nil]
|
|
80
73
|
def area_for_section(section_code)
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
74
|
+
sec = find_section(section_code)
|
|
75
|
+
sec ? find_area(sec.area_code) : nil
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
# --- Navigation from IEV reference ---
|
|
79
|
+
|
|
80
|
+
# Find the subject area for any IEV reference.
|
|
81
|
+
# @param ievref [String] e.g. "103-01-02"
|
|
82
|
+
# @return [SubjectArea, nil]
|
|
83
|
+
def area_for(ievref)
|
|
84
|
+
code = IevCode.new(ievref)
|
|
85
|
+
find_area(code.area_code)
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Find the section for any IEV reference.
|
|
89
|
+
# @param ievref [String] e.g. "103-01-02"
|
|
90
|
+
# @return [Section, nil]
|
|
91
|
+
def section_for(ievref)
|
|
92
|
+
code = IevCode.new(ievref)
|
|
93
|
+
code.section_code ? find_section(code.section_code) : nil
|
|
85
94
|
end
|
|
86
95
|
|
|
87
96
|
# --- Fetching (network, writes to bundled data file) ---
|
|
@@ -95,7 +104,7 @@ module Iev
|
|
|
95
104
|
puts "Found #{fresh_areas.length} areas (#{areas.length} cached)" if $stdout.tty?
|
|
96
105
|
|
|
97
106
|
# Merge: keep existing sections, add new areas
|
|
98
|
-
existing = areas.
|
|
107
|
+
existing = areas.to_h { |a| [a["code"], a] }
|
|
99
108
|
fresh_areas.each do |fa|
|
|
100
109
|
existing[fa["code"]] ||= fa
|
|
101
110
|
end
|
|
@@ -109,13 +118,13 @@ module Iev
|
|
|
109
118
|
area["fetched"] = true
|
|
110
119
|
rescue FetchError
|
|
111
120
|
area["sections"] ||= []
|
|
112
|
-
warn "IEV: Skipping area #{area[
|
|
121
|
+
warn "IEV: Skipping area #{area['code']} due to WAF"
|
|
113
122
|
end
|
|
114
123
|
|
|
115
|
-
puts "[#{i + 1}/#{areas.length}] #{area[
|
|
124
|
+
puts "[#{i + 1}/#{areas.length}] #{area['code']}: #{area['title']} — #{area['sections'].length} sections" if $stdout.tty?
|
|
116
125
|
|
|
117
126
|
# Save progress every 10 areas so partial results survive WAF failures
|
|
118
|
-
if (i + 1) % 10
|
|
127
|
+
if ((i + 1) % 10).zero?
|
|
119
128
|
write_cache("subject_areas.yaml", { "areas" => areas })
|
|
120
129
|
end
|
|
121
130
|
|
|
@@ -164,19 +173,51 @@ module Iev
|
|
|
164
173
|
sections.uniq { |s| s["code"] }
|
|
165
174
|
end
|
|
166
175
|
|
|
176
|
+
# Clear cached typed objects (useful after fetch updates raw data).
|
|
177
|
+
# Drops every memoized value so the next query rebuilds it from the raw
# data (useful after a fetch has updated that data).
#
# @return [nil]
def reload!
  # `all` memoizes its result in @all; leaving it set would keep returning
  # the stale areas and rebuild both indexes from them.
  @all = nil
  # NOTE(review): no reader of @typed_areas is visible here; it is still
  # reset in case code elsewhere relies on it.
  @typed_areas = nil
  @area_index = nil
  @section_index = nil
  @raw_data = nil
end
|
|
183
|
+
|
|
167
184
|
private
|
|
168
185
|
|
|
169
|
-
def
|
|
170
|
-
|
|
186
|
+
def build_area(hash)
|
|
187
|
+
area_code = hash["code"]
|
|
188
|
+
sections = (hash["sections"] || []).map do |s|
|
|
189
|
+
Section.new(code: s["code"], title: s["title"], area_code: area_code)
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
SubjectArea.new(
|
|
193
|
+
code: area_code,
|
|
194
|
+
title: hash["title"],
|
|
195
|
+
sections: sections,
|
|
196
|
+
)
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def raw_data
|
|
200
|
+
@raw_data ||= begin
|
|
171
201
|
path = File.exist?(DATA_FILE) ? DATA_FILE : nil
|
|
172
202
|
if path
|
|
173
|
-
YAML.safe_load(File.read(path,
|
|
203
|
+
YAML.safe_load(File.read(path,
|
|
204
|
+
encoding: "utf-8")) || { "areas" => [] }
|
|
174
205
|
else
|
|
175
206
|
{ "areas" => [] }
|
|
176
207
|
end
|
|
177
208
|
end
|
|
178
209
|
end
|
|
179
210
|
|
|
211
|
+
def area_index
|
|
212
|
+
@area_index ||= all.to_h { |a| [a.code, a] }
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
def section_index
|
|
216
|
+
@section_index ||= all.each_with_object({}) do |area, h|
|
|
217
|
+
area.sections.each { |s| h[s.code] = s }
|
|
218
|
+
end
|
|
219
|
+
end
|
|
220
|
+
|
|
180
221
|
def complete?(data)
|
|
181
222
|
areas = data["areas"]
|
|
182
223
|
return false unless areas&.length&.>= 99
|
|
@@ -192,10 +233,8 @@ module Iev
|
|
|
192
233
|
end
|
|
193
234
|
|
|
194
235
|
def fetch_page_with_retry(url, retries: MAX_RETRIES)
|
|
195
|
-
require "iev/scraper/browser"
|
|
196
|
-
|
|
197
236
|
retries.times do |attempt|
|
|
198
|
-
html =
|
|
237
|
+
html = Scraper::Browser.fetch(url)
|
|
199
238
|
raise FetchError, "Failed to fetch #{url}" unless html
|
|
200
239
|
|
|
201
240
|
unless captcha_page?(html)
|
|
@@ -54,10 +54,9 @@ module Iev
|
|
|
54
54
|
def relation_from_match(match_data)
|
|
55
55
|
Glossarist::RelatedConcept.new(
|
|
56
56
|
type: "supersedes",
|
|
57
|
-
ref: Glossarist::
|
|
57
|
+
ref: Glossarist::ConceptRef.new(
|
|
58
58
|
source: "IEV",
|
|
59
59
|
id: match_data[:ref],
|
|
60
|
-
version: match_data[:version],
|
|
61
60
|
),
|
|
62
61
|
)
|
|
63
62
|
end
|
data/lib/iev/term_builder.rb
CHANGED
|
@@ -7,6 +7,7 @@ module Iev
|
|
|
7
7
|
class TermBuilder
|
|
8
8
|
include Cli::Ui
|
|
9
9
|
include Utilities
|
|
10
|
+
|
|
10
11
|
using DataConversions
|
|
11
12
|
|
|
12
13
|
def initialize(data)
|
|
@@ -102,18 +103,22 @@ module Iev
|
|
|
102
103
|
end
|
|
103
104
|
|
|
104
105
|
# Derives the domain (subject area section) from the IEVREF identifier.
|
|
105
|
-
#
|
|
106
|
-
# Returns
|
|
106
|
+
#
|
|
107
|
+
# Returns the section or area title text as a localized string.
|
|
108
|
+
# Per the concept model, ConceptData#domain is a LocalizedString
|
|
109
|
+
# (the domain name), not a URI. Structural membership is expressed
|
|
110
|
+
# via ManagedConceptData#domains[] with ConceptReference objects.
|
|
107
111
|
def extract_domain
|
|
108
112
|
return nil unless term_id
|
|
109
113
|
|
|
110
|
-
|
|
111
|
-
section = Iev.find_section(section_code)
|
|
112
|
-
return
|
|
114
|
+
code = IevCode.new(term_id)
|
|
115
|
+
section = Iev.find_section(code.section_code) if code.section_code
|
|
116
|
+
return section.title if section
|
|
113
117
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
rescue StandardError
|
|
118
|
+
area = Iev.find_subject_area(code.area_code)
|
|
119
|
+
area&.title
|
|
120
|
+
rescue StandardError => e
|
|
121
|
+
warn "IEV: extract_domain failed for #{term_id}: #{e.message}"
|
|
117
122
|
nil
|
|
118
123
|
end
|
|
119
124
|
|
|
@@ -142,7 +147,7 @@ module Iev
|
|
|
142
147
|
Note \d+\sto\sentry: |
|
|
143
148
|
Note\s*\d+\sto\sthe\sentry: |
|
|
144
149
|
Note\sto\sentry\s*\d+: |
|
|
145
|
-
Note\s*\d+?\sà\sl
|
|
150
|
+
Note\s*\d+?\sà\sl'article: |
|
|
146
151
|
<NOTE/?>?\s*\d?\s+[–-]\s* |
|
|
147
152
|
NOTE(?:\s+-)?\s* |
|
|
148
153
|
Note\s+\d+\s[–-]\s* |
|
data/lib/iev/utilities.rb
CHANGED
|
@@ -5,6 +5,10 @@ module Iev
|
|
|
5
5
|
IMAGE_PATH_PREFIX = "image::/assets/images/parts"
|
|
6
6
|
IEV_CODE_RE = /\A(IEV)?\s*(\d{2,3}-\d{2,3}-\d{2,3})\z/
|
|
7
7
|
|
|
8
|
+
# Pattern matching an anchor's inner text that is just an IEV code
|
|
9
|
+
# (not a meaningful term designation).
|
|
10
|
+
IEV_CODE_TEXT_RE = /\A\s*IEV\s*\d{2,3}-\d{2,3}-\d{2,3}\s*\z/
|
|
11
|
+
|
|
8
12
|
# SIMG/Figure patterns — custom IEV XML, pre-processed before Nokogiri.
|
|
9
13
|
# Uses [^>] and [^<] instead of . to avoid polynomial backtracking.
|
|
10
14
|
SIMG_PATH_REGEX = /<simg [^>]*\/\$file\/([\d\-\w.]+)>/
|
|
@@ -134,12 +138,36 @@ module Iev
|
|
|
134
138
|
|
|
135
139
|
if href.match?(IEV_CODE_RE)
|
|
136
140
|
iev_code = href.sub(/\AIEV\s*/, "")
|
|
137
|
-
|
|
141
|
+
display = render_term_for(inner, iev_code)
|
|
142
|
+
"{{urn:iec:std:iec:60050-#{iev_code}, #{display}}}"
|
|
138
143
|
elsif !href.empty?
|
|
139
144
|
"#{href}[#{inner}]"
|
|
140
145
|
else
|
|
141
146
|
inner
|
|
142
147
|
end
|
|
143
148
|
end
|
|
149
|
+
|
|
150
|
+
# Resolve the display (render) text for an IEV cross-reference.
|
|
151
|
+
#
|
|
152
|
+
# When the anchor text is already a meaningful term (e.g. "adjective"),
|
|
153
|
+
# use it directly. When it's just a bare IEV code (e.g. "IEV 102-01-10"),
|
|
154
|
+
# try to look up the actual term designation via DataSource.
|
|
155
|
+
#
|
|
156
|
+
# @param inner_text [String] the anchor element's inner text
|
|
157
|
+
# @param iev_code [String] the extracted numeric IEV code (e.g. "102-01-10")
|
|
158
|
+
# @return [String] the term designation to use as render text
|
|
159
|
+
def render_term_for(inner_text, iev_code)
|
|
160
|
+
stripped = inner_text.strip
|
|
161
|
+
return stripped unless iev_code_only?(stripped)
|
|
162
|
+
|
|
163
|
+
Iev.get(iev_code, "en") || stripped
|
|
164
|
+
rescue StandardError
|
|
165
|
+
stripped
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
# True when the anchor text is just a raw IEV code, not a term designation.
|
|
169
|
+
def iev_code_only?(text)
|
|
170
|
+
text.match?(IEV_CODE_TEXT_RE)
|
|
171
|
+
end
|
|
144
172
|
end
|
|
145
173
|
end
|
data/lib/iev/version.rb
CHANGED