iev 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +0 -2
- data/.github/workflows/release.yml +3 -1
- data/.gitignore +3 -1
- data/CLAUDE.md +50 -0
- data/Gemfile +3 -0
- data/README.adoc +65 -15
- data/exe/iev +11 -0
- data/iev.gemspec +5 -4
- data/lib/iev/cli/command.rb +119 -76
- data/lib/iev/cli/command_helper.rb +55 -36
- data/lib/iev/config.rb +31 -0
- data/lib/iev/converter/mathml_to_asciimath.rb +119 -158
- data/lib/iev/data_source.rb +124 -0
- data/lib/iev/exporter.rb +122 -0
- data/lib/iev/scraper/page_parser.rb +176 -0
- data/lib/iev/scraper.rb +135 -0
- data/lib/iev/source_parser.rb +31 -18
- data/lib/iev/supersession_parser.rb +9 -13
- data/lib/iev/term_attrs_parser.rb +21 -7
- data/lib/iev/term_builder.rb +100 -94
- data/lib/iev/utilities.rb +91 -42
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +47 -35
- metadata +34 -13
- data/lib/iev/db.rb +0 -82
- data/lib/iev/db_cache.rb +0 -124
data/lib/iev/term_builder.rb
CHANGED
|
@@ -44,44 +44,46 @@ module Iev
|
|
|
44
44
|
|
|
45
45
|
split_definition
|
|
46
46
|
|
|
47
|
-
|
|
47
|
+
concept_data = build_concept_data
|
|
48
|
+
|
|
49
|
+
concept = Glossarist::LocalizedConcept.new
|
|
50
|
+
concept.data = concept_data
|
|
51
|
+
concept.id = term_id
|
|
52
|
+
concept.entry_status = extract_entry_status
|
|
53
|
+
concept.classification = extract_classification
|
|
54
|
+
|
|
55
|
+
concept
|
|
48
56
|
end
|
|
49
57
|
|
|
50
|
-
def
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
type: :amended,
|
|
61
|
-
date: flesh_date(find_value_for("PUBLICATIONDATE")),
|
|
62
|
-
},
|
|
58
|
+
def build_concept_data
|
|
59
|
+
cd = Glossarist::ConceptData.new
|
|
60
|
+
cd.id = term_id
|
|
61
|
+
cd.language_code = term_language
|
|
62
|
+
|
|
63
|
+
pub_date = flesh_date(find_value_for("PUBLICATIONDATE"))
|
|
64
|
+
if pub_date
|
|
65
|
+
cd.dates = [
|
|
66
|
+
Glossarist::ConceptDate.new(type: "accepted", date: pub_date),
|
|
67
|
+
Glossarist::ConceptDate.new(type: "amended", date: pub_date),
|
|
63
68
|
]
|
|
69
|
+
cd.review_date = pub_date
|
|
70
|
+
cd.review_decision_date = pub_date
|
|
64
71
|
end
|
|
72
|
+
cd.review_decision_event = "published"
|
|
65
73
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
review_decision_event: "published",
|
|
80
|
-
language_code: term_language,
|
|
81
|
-
sources: extract_authoritative_source,
|
|
82
|
-
related: extract_superseded_concepts,
|
|
83
|
-
}.compact,
|
|
84
|
-
}.compact
|
|
74
|
+
definition = extract_definition_value
|
|
75
|
+
cd.definition = [definition] if definition
|
|
76
|
+
cd.examples = extract_examples
|
|
77
|
+
cd.notes = extract_notes
|
|
78
|
+
cd.terms = extract_terms
|
|
79
|
+
|
|
80
|
+
sources = extract_authoritative_source
|
|
81
|
+
cd.sources = sources if sources&.any?
|
|
82
|
+
|
|
83
|
+
related = extract_superseded_concepts
|
|
84
|
+
cd.related = related if related&.any?
|
|
85
|
+
|
|
86
|
+
cd
|
|
85
87
|
end
|
|
86
88
|
|
|
87
89
|
def term_id
|
|
@@ -121,10 +123,10 @@ module Iev
|
|
|
121
123
|
Note \d+\sto\sentry: |
|
|
122
124
|
Note\s*\d+\sto\sthe\sentry: |
|
|
123
125
|
Note\sto\sentry\s*\d+: |
|
|
124
|
-
Note\s*\d+?\sà\sl['
|
|
125
|
-
<NOTE/?>?\s*\d?\s
|
|
126
|
-
NOTE(?:\s+-)
|
|
127
|
-
Note\s+\d+\s
|
|
126
|
+
Note\s*\d+?\sà\sl['']article: |
|
|
127
|
+
<NOTE/?>?\s*\d?\s+[–-]\s* |
|
|
128
|
+
NOTE(?:\s+-)?\s* |
|
|
129
|
+
Note\s+\d+\s[–-]\s* |
|
|
128
130
|
Note \d+\s
|
|
129
131
|
)
|
|
130
132
|
)
|
|
@@ -140,28 +142,14 @@ module Iev
|
|
|
140
142
|
|
|
141
143
|
while (md = remaining_str&.match(slicer_rx))
|
|
142
144
|
next_part = md.pre_match
|
|
143
|
-
next_part.sub!(/^\[:Ex(a|e)mple\]/, 'Ex
|
|
145
|
+
next_part.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\1mple')
|
|
144
146
|
next_part_arr.push(next_part)
|
|
145
147
|
next_part_arr = md[:example] ? @examples : @notes
|
|
146
|
-
# 112-03-17
|
|
147
|
-
# supplements the name of a quantity, especially for a component in a
|
|
148
|
-
# system, to indicate the quotient of that quantity by the total
|
|
149
|
-
# volume
|
|
150
|
-
# <NOTE – Examples: amount-of-substance volume concentration of
|
|
151
|
-
# component B (or concentration of B, in particular, ion
|
|
152
|
-
# concentration), molecular concentration of B, electron concentration
|
|
153
|
-
# (or electron density).
|
|
154
|
-
#
|
|
155
|
-
# In the above case the `Example` is part of the note but the regex
|
|
156
|
-
# above will capture it as an example and will add an empty `Note`
|
|
157
|
-
# and put the rest in an `Example`. So In this case we will replace
|
|
158
|
-
# the `Example` with `[:Example]` and revert it in the next iteration
|
|
159
|
-
# so it will not be caught by the regex.
|
|
160
148
|
remaining_str = md.post_match
|
|
161
|
-
remaining_str.sub!(/^Ex(a|e)mple/, '[:Ex
|
|
149
|
+
remaining_str.sub!(/^Ex(a|e)mple/, '[:Ex\1mple]') if md[:note]
|
|
162
150
|
end
|
|
163
151
|
|
|
164
|
-
remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, 'Ex
|
|
152
|
+
remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\1mple')
|
|
165
153
|
next_part_arr.push(remaining_str)
|
|
166
154
|
@definition = definition_arr.first
|
|
167
155
|
@definition = nil if @definition&.empty?
|
|
@@ -211,28 +199,21 @@ module Iev
|
|
|
211
199
|
def extract_definition_value
|
|
212
200
|
return unless @definition
|
|
213
201
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
).strip
|
|
202
|
+
content = convert_content(@definition)
|
|
203
|
+
Glossarist::DetailedDefinition.new(content: content)
|
|
217
204
|
end
|
|
218
205
|
|
|
219
206
|
def extract_examples
|
|
220
207
|
@examples.map do |str|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
replace_newlines(parse_anchor_tag(str, term_domain)),
|
|
224
|
-
).strip,
|
|
225
|
-
}
|
|
208
|
+
content = convert_content(clean_extracted_text(str))
|
|
209
|
+
Glossarist::DetailedDefinition.new(content: content)
|
|
226
210
|
end
|
|
227
211
|
end
|
|
228
212
|
|
|
229
213
|
def extract_notes
|
|
230
214
|
@notes.map do |str|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
replace_newlines(parse_anchor_tag(str, term_domain)),
|
|
234
|
-
).strip,
|
|
235
|
-
}
|
|
215
|
+
content = convert_content(clean_extracted_text(str))
|
|
216
|
+
Glossarist::DetailedDefinition.new(content: content)
|
|
236
217
|
end
|
|
237
218
|
end
|
|
238
219
|
|
|
@@ -246,14 +227,14 @@ module Iev
|
|
|
246
227
|
classification_val = find_value_for("SYNONYM1STATUS")
|
|
247
228
|
|
|
248
229
|
case classification_val
|
|
249
|
-
when ""
|
|
250
|
-
|
|
230
|
+
when nil, ""
|
|
231
|
+
nil
|
|
251
232
|
when "认可的", "допустимый", "admitido"
|
|
252
233
|
"admitted"
|
|
253
234
|
when "首选的", "suositettava", "suositeltava", "рекомендуемый", "preferente"
|
|
254
235
|
"preferred"
|
|
255
236
|
else
|
|
256
|
-
classification_val
|
|
237
|
+
classification_val.downcase
|
|
257
238
|
end
|
|
258
239
|
end
|
|
259
240
|
|
|
@@ -261,12 +242,12 @@ module Iev
|
|
|
261
242
|
source_val = find_value_for("SOURCE")
|
|
262
243
|
return nil if source_val.nil?
|
|
263
244
|
|
|
264
|
-
SourceParser.new(source_val, term_domain)
|
|
245
|
+
sources = SourceParser.new(source_val, term_domain)
|
|
265
246
|
.parsed_sources
|
|
266
247
|
.compact
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
248
|
+
|
|
249
|
+
sources.each { |src| src.type = "authoritative" }
|
|
250
|
+
sources.empty? ? nil : sources
|
|
270
251
|
end
|
|
271
252
|
|
|
272
253
|
def extract_superseded_concepts
|
|
@@ -279,9 +260,7 @@ module Iev
|
|
|
279
260
|
private
|
|
280
261
|
|
|
281
262
|
def build_expression_designation(raw_term, attribute_data:, status:)
|
|
282
|
-
term =
|
|
283
|
-
parse_anchor_tag(raw_term, term_domain),
|
|
284
|
-
)
|
|
263
|
+
term = convert_content(raw_term)
|
|
285
264
|
term_attributes = TermAttrsParser.new(attribute_data.to_s)
|
|
286
265
|
|
|
287
266
|
statuses = {
|
|
@@ -289,29 +268,56 @@ module Iev
|
|
|
289
268
|
"напуштен" => "deprecated",
|
|
290
269
|
}
|
|
291
270
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
"gender" => term_attributes.gender,
|
|
301
|
-
"plurality" => term_attributes.plurality,
|
|
271
|
+
grammar_info = term_attributes.to_grammar_info
|
|
272
|
+
attrs = {
|
|
273
|
+
designation: term,
|
|
274
|
+
normative_status: statuses[status] || status,
|
|
275
|
+
geographical_area: term_attributes.geographical_area,
|
|
276
|
+
prefix: term_attributes.prefix,
|
|
277
|
+
usage_info: term_attributes.usage_info,
|
|
278
|
+
grammar_info: grammar_info ? [grammar_info] : nil,
|
|
302
279
|
}.compact
|
|
280
|
+
|
|
281
|
+
Glossarist::Designation::Expression.new(**attrs)
|
|
303
282
|
end
|
|
304
283
|
|
|
305
284
|
def build_symbol_designation(raw_term)
|
|
306
|
-
term =
|
|
307
|
-
|
|
285
|
+
term = convert_content(raw_term)
|
|
286
|
+
|
|
287
|
+
Glossarist::Designation::Symbol.new(
|
|
288
|
+
designation: term,
|
|
289
|
+
international: true,
|
|
308
290
|
)
|
|
291
|
+
end
|
|
309
292
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
293
|
+
# Runs a raw text fragment through the conversion pipeline.
#
# Steps, in order: coerce to String, remove HTML comments
# (strip_html_comments), rewrite markup via parse_anchor_tag for the
# current term_domain, normalise paragraph breaks (replace_newlines), then
# hand the result to Iev::Converter.mathml_to_asciimath and strip it.
#
# @param str [String, nil] raw fragment; nil is treated as ""
# @return [String] converted, whitespace-trimmed text
def convert_content(str)
  without_comments = strip_html_comments(str.to_s)
  adoc = parse_anchor_tag(without_comments, term_domain)
  paragraphs = replace_newlines(adoc)

  Iev::Converter.mathml_to_asciimath(paragraphs).strip
end
|
|
299
|
+
|
|
300
|
+
# Removes HTML comments (<!-- ... -->) from +str+.
#
# The string is parsed with Nokogiri to discover the comments, and each one
# is then deleted from the original text, so everything outside the
# comments is returned untouched (no re-serialisation of the markup).
#
# Comments at any nesting depth are removed, not only those that are direct
# children of the parsed fragment.
#
# @param str [String, nil] raw markup
# @return [String, nil] +str+ itself when it holds no comment, otherwise a
#   new String with the comments removed
def strip_html_comments(str)
  # Cheap pre-check: without a comment opener there is nothing to delete,
  # so skip the HTML parse entirely.
  return str unless str.to_s.include?("<!--")

  fragment = Nokogiri::HTML::DocumentFragment.parse(str)
  comments = []
  # traverse visits every descendant, so nested comments are found too.
  fragment.traverse { |node| comments << node if node.comment? }
  return str if comments.empty?

  comments.reduce(str) do |result, comment|
    result.gsub("<!--#{comment.content}-->", "")
  end
end
|
|
309
|
+
|
|
310
|
+
# Remove leading numbering artifacts from extracted notes/examples.
|
|
311
|
+
# The definition text sometimes duplicates note/example numbers:
|
|
312
|
+
# "1 A time interval comprises..." (note)
|
|
313
|
+
# "1: In a vending machine..." (example)
|
|
314
|
+
# "2 à l'article: ..." (French note)
|
|
315
|
+
# ": Par la réticulation..." (French note)
|
|
316
|
+
def clean_extracted_text(str)
  # Both patterns are anchored with \A, so at most one match is possible:
  # sub is sufficient. nil is coerced to "" like convert_content does.
  #
  # Leading number plus any run of separators. Ruby's \s is ASCII-only, so
  # the en-space (U+2002) and em-space (U+2003) are spelled out as escapes
  # instead of being embedded as invisible literal characters.
  without_number = str.to_s.sub(/\A\s*\d+[\s:\u2002\u2003]*/, "")

  # Leading standalone colon (French style: ": text").
  without_number.sub(/\A\s*:\s*/, "")
end
|
|
316
322
|
end
|
|
317
323
|
end
|
data/lib/iev/utilities.rb
CHANGED
|
@@ -2,57 +2,106 @@
|
|
|
2
2
|
|
|
3
3
|
module Iev
|
|
4
4
|
module Utilities
|
|
5
|
-
SIMG_PATH_REGEX = "<simg .*\\/\\$file\\/([\\d\\-\\w\.]+)>"
|
|
6
|
-
FIGURE_ONE_REGEX =
|
|
7
|
-
'<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+(.+)\\s*<\\/b>(<\\/p>)?'
|
|
8
|
-
FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}".freeze
|
|
9
5
|
IMAGE_PATH_PREFIX = "image::/assets/images/parts"
|
|
6
|
+
IEV_CODE_RE = /\A(IEV)?\s*(\d{2,3}-\d{2,3}-\d{2,3})\z/
|
|
7
|
+
|
|
8
|
+
# SIMG/Figure patterns — custom IEV XML, pre-processed before Nokogiri.
|
|
9
|
+
# Uses [^>] and [^<] instead of . to avoid polynomial backtracking.
|
|
10
|
+
SIMG_PATH_REGEX = /<simg [^>]*\/\$file\/([\d\-\w.]+)>/
|
|
11
|
+
FIGURE_ONE_REGEX = '<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+([^<]+)\\s*<\\/b>(<\\/p>)?'
|
|
12
|
+
FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}".freeze
|
|
10
13
|
|
|
11
14
|
def parse_anchor_tag(text, term_domain)
|
|
12
|
-
return
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
%r{<a href="?(IEV)\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)</?a>},
|
|
20
|
-
'{{\3, \1:\2}}',
|
|
21
|
-
).gsub(
|
|
22
|
-
%r{<a href="?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)</?a>},
|
|
23
|
-
'{{\3, IEV:\2}}',
|
|
24
|
-
).gsub(
|
|
25
|
-
# To handle <a> tags without ending tag like
|
|
26
|
-
# `Voir <a href=IEV103-05-21>IEV 103-05-21`
|
|
27
|
-
# for concept '702-03-11' in `fr`
|
|
28
|
-
/<a href="?(IEV)?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)$/,
|
|
29
|
-
'{{\3, IEV:\2}}',
|
|
30
|
-
).gsub(
|
|
31
|
-
%r{<a href="?([^<>]*?)"?>(.*?)</a>},
|
|
32
|
-
'\1[\2]',
|
|
33
|
-
).gsub(
|
|
34
|
-
Regexp.new([SIMG_PATH_REGEX, '\\s*', FIGURE_TWO_REGEX].join),
|
|
35
|
-
"#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6]",
|
|
36
|
-
).gsub(
|
|
37
|
-
Regexp.new([SIMG_PATH_REGEX, '\\s*', FIGURE_ONE_REGEX].join),
|
|
38
|
-
"#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
|
|
39
|
-
).gsub(
|
|
40
|
-
/<img\s+([^<>]+?)\s*>/,
|
|
41
|
-
"#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]",
|
|
42
|
-
).gsub(
|
|
43
|
-
/<br>/,
|
|
44
|
-
"\n",
|
|
45
|
-
).gsub(
|
|
46
|
-
%r{<b>(.*?)</b>},
|
|
47
|
-
'*\\1*',
|
|
48
|
-
)
|
|
15
|
+
return nil if text.nil?
|
|
16
|
+
|
|
17
|
+
text = process_simg_figures(text, term_domain)
|
|
18
|
+
text = fix_unquoted_href(text)
|
|
19
|
+
|
|
20
|
+
doc = Nokogiri::HTML::DocumentFragment.parse(text)
|
|
21
|
+
nodes_to_adoc(doc.children, term_domain)
|
|
49
22
|
end
|
|
50
23
|
|
|
51
24
|
# Normalises every kind of line break in +input+ to a blank-line paragraph
# separator ("\n\n") and trims the result.
#
# Handled forms:
# * the two-character sequence backslash + "n" (a literal, not a newline)
# * <p> and <br> tags, including <br/> and <br />
# * any run of whitespace that contains a newline followed by more
#   whitespace
#
# @param input [String]
# @return [String]
def replace_newlines(input)
  input
    .gsub('\n', "\n\n")
    # Alternation, not a character class: /<[pbr]+>/ would also swallow
    # <b>, <r>, <pbr>, ... which are not line breaks.
    .gsub(%r{<(?:p|br\s*/?)>}, "\n\n")
    .gsub(/\s*\n[\n\s]+/, "\n\n")
    .strip
end
|
|
32
|
+
|
|
33
|
+
private
|
|
34
|
+
|
|
35
|
+
# IEV data has unquoted href with spaces, e.g.
|
|
36
|
+
# <a href=IEV 102-01-10>...</a>
|
|
37
|
+
# Nokogiri stops at first space, so add quotes.
|
|
38
|
+
# Uses a specific IEV code pattern to avoid regex backtracking.
|
|
39
|
+
def fix_unquoted_href(text)
  # Wrap an unquoted "IEV <code>" href value in double quotes. The
  # lookahead keeps the delimiter (">" or whitespace) out of the match.
  # \s+ also covers values with more than one space after "IEV"; already
  # quoted values and values without a space are left alone.
  text.gsub(/href=(IEV\s+\d{2,3}-\d{2,3}-\d{2,3})(?=[>\s])/, 'href="\1"')
end
|
|
44
|
+
|
|
45
|
+
# Rewrites <simg> tags into image macro text under
# "#{IMAGE_PATH_PREFIX}/#{term_domain}/".
#
# Three passes, most specific first:
# 1. <simg> followed by two figure captions
# 2. <simg> followed by one figure caption
# 3. any remaining bare <simg>
#
# Capture groups of the combined two-caption pattern:
#   1 = file name
#   2, 3, 4 = first caption: number, title, optional </p>
#   5, 6, 7 = second caption: number, title, optional </p>
#
# @param text [String] raw markup
# @param term_domain [String] path segment inserted after the prefix
# @return [String]
def process_simg_figures(text, term_domain)
  image_prefix = "#{IMAGE_PATH_PREFIX}/#{term_domain}"
  simg = SIMG_PATH_REGEX.source
  two_captions = Regexp.new("#{simg}\\s*#{FIGURE_TWO_REGEX}")
  one_caption = Regexp.new("#{simg}\\s*#{FIGURE_ONE_REGEX}")

  text
    # Second caption is groups 5 (number) and 6 (title); group 7 is only
    # the optional closing </p> and must not be emitted.
    .gsub(two_captions, "#{image_prefix}/\\1[Figure \\2 - \\3; \\5 - \\6]")
    .gsub(one_caption, "#{image_prefix}/\\1[Figure \\2 - \\3]")
    .gsub(SIMG_PATH_REGEX, "#{image_prefix}/\\1[]")
end
|
|
56
|
+
|
|
57
|
+
# Converts each node with node_to_adoc and concatenates the results in
# document order.
#
# @param nodes [Enumerable] nodes to convert
# @param term_domain [String] forwarded unchanged to node_to_adoc
# @return [String]
def nodes_to_adoc(nodes, term_domain)
  converted = nodes.map do |child|
    node_to_adoc(child, term_domain)
  end

  converted.join
end
|
|
60
|
+
|
|
61
|
+
# Converts a single parsed node to text.
#
# * text nodes yield their text
# * element nodes are delegated to element_to_adoc
# * everything else, comments included, yields ""
#
# @param node [Nokogiri::XML::Node]
# @param term_domain [String] forwarded to element_to_adoc
# @return [String]
def node_to_adoc(node, term_domain)
  return node.text if node.is_a?(Nokogiri::XML::Text)
  return element_to_adoc(node, term_domain) if node.is_a?(Nokogiri::XML::Element)

  ""
end
|
|
73
|
+
|
|
74
|
+
# Converts one element node to text, dispatching on the tag name.
#
# * a               -> convert_link with the converted children
# * b               -> converted children wrapped in "*"
# * br              -> "\n"
# * img             -> image macro text built from the src attribute, or
#                      from the first attribute name when src is absent
# * p, div, span    -> converted children only (the wrapper is dropped)
# * any other tag   -> the node's own serialisation (node.to_s), unchanged
#
# Children are converted only in the branches that use the result, so
# void and unknown elements do not trigger a recursive conversion whose
# output would be thrown away.
#
# @param node [Nokogiri::XML::Element]
# @param term_domain [String] path segment used for image macros
# @return [String]
def element_to_adoc(node, term_domain)
  case node.name
  when "a"
    convert_link(node, nodes_to_adoc(node.children, term_domain))
  when "b"
    "*#{nodes_to_adoc(node.children, term_domain)}*"
  when "br"
    "\n"
  when "img"
    src = node["src"] || node.attributes.keys.first.to_s
    "#{IMAGE_PATH_PREFIX}/#{term_domain}/#{src}[]"
  when "p", "div", "span"
    nodes_to_adoc(node.children, term_domain)
  else
    node.to_s
  end
end
|
|
93
|
+
|
|
94
|
+
# Renders an anchor as text.
#
# * no href, or a blank one         -> just the link text
# * href matching IEV_CODE_RE       -> "{{<text>, IEV:<code>}}"
# * any other href                  -> "<href>[<text>]"
#
# @param node [#[]] anchor node; only node["href"] is read
# @param inner [String] already-converted link text
# @return [String]
def convert_link(node, inner)
  href = node["href"].to_s.strip
  return inner if href.empty?

  iev_match = IEV_CODE_RE.match(href)
  return "#{href}[#{inner}]" unless iev_match

  # Capture 2 of IEV_CODE_RE is the bare code, with any "IEV" prefix and
  # following whitespace already excluded.
  "{{#{inner}, IEV:#{iev_match[2]}}}"
end
|
|
57
106
|
end
|
|
58
107
|
end
|
data/lib/iev/version.rb
CHANGED
data/lib/iev.rb
CHANGED
|
@@ -1,68 +1,80 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "iev/version"
|
|
4
|
-
require "iev/
|
|
5
|
-
require "
|
|
6
|
-
require "nokogiri"
|
|
4
|
+
require "iev/config"
|
|
5
|
+
require "iev/data_source"
|
|
7
6
|
|
|
8
|
-
require "benchmark"
|
|
9
|
-
require "creek"
|
|
10
|
-
require "unitsml"
|
|
11
|
-
require "plurimath"
|
|
12
|
-
require "glossarist"
|
|
13
|
-
require "relaton"
|
|
14
|
-
require "relaton_bib"
|
|
15
|
-
require "sequel"
|
|
16
|
-
require "thor"
|
|
17
7
|
require "yaml"
|
|
18
8
|
|
|
9
|
+
# plurimath and unitsml both depend on mml, which has a transitive
|
|
10
|
+
# dependency version mismatch with lutaml-model in some environments.
|
|
11
|
+
# Load them when available; the DataSource APIs work without them.
|
|
12
|
+
begin
|
|
13
|
+
require "plurimath"
|
|
14
|
+
rescue LoadError
|
|
15
|
+
nil
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
begin
|
|
19
|
+
require "unitsml"
|
|
20
|
+
rescue LoadError
|
|
21
|
+
nil
|
|
22
|
+
end
|
|
23
|
+
|
|
19
24
|
module Iev
|
|
20
25
|
autoload :Cli, "iev/cli"
|
|
26
|
+
autoload :Config, "iev/config"
|
|
21
27
|
autoload :Converter, "iev/converter"
|
|
22
28
|
autoload :DataConversions, "iev/data_conversions"
|
|
23
|
-
autoload :
|
|
24
|
-
autoload :DbCache, "iev/db_cache"
|
|
29
|
+
autoload :DataSource, "iev/data_source"
|
|
25
30
|
autoload :DbWriter, "iev/db_writer"
|
|
31
|
+
autoload :Exporter, "iev/exporter"
|
|
26
32
|
autoload :Iso639Code, "iev/iso_639_code"
|
|
27
33
|
autoload :Profiler, "iev/profiler"
|
|
28
34
|
autoload :RelatonDb, "iev/relaton_db"
|
|
35
|
+
autoload :Scraper, "iev/scraper"
|
|
29
36
|
autoload :SourceParser, "iev/source_parser"
|
|
30
37
|
autoload :SupersessionParser, "iev/supersession_parser"
|
|
31
38
|
autoload :TermAttrsParser, "iev/term_attrs_parser"
|
|
32
39
|
autoload :TermBuilder, "iev/term_builder"
|
|
33
40
|
autoload :Utilities, "iev/utilities"
|
|
34
41
|
|
|
35
|
-
#
|
|
36
|
-
# Scrape Electropedia for term.
|
|
42
|
+
# Fetch term designation from IEV data.
|
|
37
43
|
#
|
|
38
44
|
# @param [String] code for example "103-01-02"
|
|
39
45
|
# @param [String] lang language code, for example "en"
|
|
40
46
|
#
|
|
41
|
-
# @return [String, nil] if found
|
|
42
|
-
# if code not found then
|
|
47
|
+
# @return [String, nil] if found then term,
|
|
48
|
+
# if code not found then nil,
|
|
43
49
|
# if language not found then nil.
|
|
44
50
|
#
|
|
45
51
|
def self.get(code, lang)
|
|
46
|
-
|
|
47
|
-
xpath = "//table/tr/td/div/font[.=\"#{lang}\"]/../../"\
|
|
48
|
-
"following-sibling::td[2]"
|
|
49
|
-
a = doc&.at(xpath)&.children&.to_xml
|
|
50
|
-
a&.sub(%r{<br/>.*$}, "")
|
|
51
|
-
&.sub(/, <.*$/, "")
|
|
52
|
-
&.gsub(/<[^<>]*>/, "")&.strip
|
|
52
|
+
DataSource.fetch_term_designation(code, lang)
|
|
53
53
|
end
|
|
54
54
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
55
|
+
# Fetch full concept data (all languages) for a given IEV code.
|
|
56
|
+
#
|
|
57
|
+
# @param [String] code IEV code, e.g. "103-01-02"
|
|
58
|
+
# @return [Hash, nil] concept data hash with all languages
|
|
59
|
+
def self.fetch_concept(code)
|
|
60
|
+
DataSource.fetch_concept(code)
|
|
61
|
+
end
|
|
58
62
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
63
|
+
# Fetch localized term data for a given IEV code and language.
|
|
64
|
+
#
|
|
65
|
+
# @param [String] code IEV code, e.g. "103-01-02"
|
|
66
|
+
# @param [String] lang language code, e.g. "en" or "eng"
|
|
67
|
+
# @return [Hash, nil] localized concept data
|
|
68
|
+
def self.fetch_term(code, lang)
|
|
69
|
+
DataSource.fetch_term(code, lang)
|
|
70
|
+
end
|
|
62
71
|
|
|
63
|
-
|
|
64
|
-
|
|
72
|
+
# Scrape concept data from Electropedia for a given IEV code.
|
|
73
|
+
# Uses Ferrum (headless Chrome) to handle AWS WAF challenge.
|
|
74
|
+
#
|
|
75
|
+
# @param code [String] IEV code, e.g. "103-01-02"
|
|
76
|
+
# @return [Hash, nil] concept data hash or nil if not found
|
|
77
|
+
def self.scrape_concept(code)
|
|
78
|
+
Scraper.new.fetch_concept(code)
|
|
65
79
|
end
|
|
66
80
|
end
|
|
67
|
-
|
|
68
|
-
require_relative "iev/cli"
|