iev 0.3.9 → 0.4.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between the package versions exactly as they appear in their respective public registries.
@@ -44,44 +44,46 @@ module Iev
44
44
 
45
45
  split_definition
46
46
 
47
- Glossarist::LocalizedConcept.from_hash(term_hash)
47
+ concept_data = build_concept_data
48
+
49
+ concept = Glossarist::LocalizedConcept.new
50
+ concept.data = concept_data
51
+ concept.id = term_id
52
+ concept.entry_status = extract_entry_status
53
+ concept.classification = extract_classification
54
+
55
+ concept
48
56
  end
49
57
 
50
- def term_hash
51
- dates = nil
52
-
53
- if flesh_date(find_value_for("PUBLICATIONDATE"))
54
- dates = [
55
- {
56
- type: :accepted,
57
- date: flesh_date(find_value_for("PUBLICATIONDATE")),
58
- },
59
- {
60
- type: :amended,
61
- date: flesh_date(find_value_for("PUBLICATIONDATE")),
62
- },
58
+ def build_concept_data
59
+ cd = Glossarist::ConceptData.new
60
+ cd.id = term_id
61
+ cd.language_code = term_language
62
+
63
+ pub_date = flesh_date(find_value_for("PUBLICATIONDATE"))
64
+ if pub_date
65
+ cd.dates = [
66
+ Glossarist::ConceptDate.new(type: "accepted", date: pub_date),
67
+ Glossarist::ConceptDate.new(type: "amended", date: pub_date),
63
68
  ]
69
+ cd.review_date = pub_date
70
+ cd.review_decision_date = pub_date
64
71
  end
72
+ cd.review_decision_event = "published"
65
73
 
66
- {
67
- id: term_id,
68
- classification: extract_classification,
69
- entry_status: extract_entry_status,
70
- data: {
71
- id: term_id,
72
- dates: dates,
73
- definition: [{ "content" => extract_definition_value }],
74
- examples: extract_examples,
75
- notes: extract_notes,
76
- terms: extract_terms,
77
- review_date: flesh_date(find_value_for("PUBLICATIONDATE")),
78
- review_decision_date: flesh_date(find_value_for("PUBLICATIONDATE")),
79
- review_decision_event: "published",
80
- language_code: term_language,
81
- sources: extract_authoritative_source,
82
- related: extract_superseded_concepts,
83
- }.compact,
84
- }.compact
74
+ definition = extract_definition_value
75
+ cd.definition = [definition] if definition
76
+ cd.examples = extract_examples
77
+ cd.notes = extract_notes
78
+ cd.terms = extract_terms
79
+
80
+ sources = extract_authoritative_source
81
+ cd.sources = sources if sources&.any?
82
+
83
+ related = extract_superseded_concepts
84
+ cd.related = related if related&.any?
85
+
86
+ cd
85
87
  end
86
88
 
87
89
  def term_id
@@ -121,10 +123,10 @@ module Iev
121
123
  Note \d+\sto\sentry: |
122
124
  Note\s*\d+\sto\sthe\sentry: |
123
125
  Note\sto\sentry\s*\d+: |
124
- Note\s*\d+?\sà\sl[']article: |
125
- <NOTE/?>?\s*\d?\s+.*?– |
126
- NOTE(?:\s+-)? |
127
- Note\s+\d+\s |
126
+ Note\s*\d+?\sà\sl['']article: |
127
+ <NOTE/?>?\s*\d?\s+[–-]\s* |
128
+ NOTE(?:\s+-)?\s* |
129
+ Note\s+\d+\s[–-]\s* |
128
130
  Note&nbsp;\d+\s
129
131
  )
130
132
  )
@@ -140,28 +142,14 @@ module Iev
140
142
 
141
143
  while (md = remaining_str&.match(slicer_rx))
142
144
  next_part = md.pre_match
143
- next_part.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\\1mple')
145
+ next_part.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\1mple')
144
146
  next_part_arr.push(next_part)
145
147
  next_part_arr = md[:example] ? @examples : @notes
146
- # 112-03-17
147
- # supplements the name of a quantity, especially for a component in a
148
- # system, to indicate the quotient of that quantity by the total
149
- # volume
150
- # <NOTE – Examples: amount-of-substance volume concentration of
151
- # component B (or concentration of B, in particular, ion
152
- # concentration), molecular concentration of B, electron concentration
153
- # (or electron density).
154
- #
155
- # In the above case the `Example` is part of the note but the regex
156
- # above will capture it as an example and will add an empty `Note`
157
- # and put the rest in an `Example`. So In this case we will replace
158
- # the `Example` with `[:Example]` and revert it in the next iteration
159
- # so it will not be caught by the regex.
160
148
  remaining_str = md.post_match
161
- remaining_str.sub!(/^Ex(a|e)mple/, '[:Ex\\1mple]') if md[:note]
149
+ remaining_str.sub!(/^Ex(a|e)mple/, '[:Ex\1mple]') if md[:note]
162
150
  end
163
151
 
164
- remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\\1mple')
152
+ remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\1mple')
165
153
  next_part_arr.push(remaining_str)
166
154
  @definition = definition_arr.first
167
155
  @definition = nil if @definition&.empty?
@@ -211,28 +199,21 @@ module Iev
211
199
  def extract_definition_value
212
200
  return unless @definition
213
201
 
214
- Iev::Converter.mathml_to_asciimath(
215
- replace_newlines(parse_anchor_tag(@definition, term_domain)),
216
- ).strip
202
+ content = convert_content(@definition)
203
+ Glossarist::DetailedDefinition.new(content: content)
217
204
  end
218
205
 
219
206
  def extract_examples
220
207
  @examples.map do |str|
221
- {
222
- content: Iev::Converter.mathml_to_asciimath(
223
- replace_newlines(parse_anchor_tag(str, term_domain)),
224
- ).strip,
225
- }
208
+ content = convert_content(clean_extracted_text(str))
209
+ Glossarist::DetailedDefinition.new(content: content)
226
210
  end
227
211
  end
228
212
 
229
213
  def extract_notes
230
214
  @notes.map do |str|
231
- {
232
- content: Iev::Converter.mathml_to_asciimath(
233
- replace_newlines(parse_anchor_tag(str, term_domain)),
234
- ).strip,
235
- }
215
+ content = convert_content(clean_extracted_text(str))
216
+ Glossarist::DetailedDefinition.new(content: content)
236
217
  end
237
218
  end
238
219
 
@@ -246,14 +227,14 @@ module Iev
246
227
  classification_val = find_value_for("SYNONYM1STATUS")
247
228
 
248
229
  case classification_val
249
- when ""
250
- "admitted"
230
+ when nil, ""
231
+ nil
251
232
  when "认可的", "допустимый", "admitido"
252
233
  "admitted"
253
234
  when "首选的", "suositettava", "suositeltava", "рекомендуемый", "preferente"
254
235
  "preferred"
255
236
  else
256
- classification_val
237
+ classification_val.downcase
257
238
  end
258
239
  end
259
240
 
@@ -261,12 +242,12 @@ module Iev
261
242
  source_val = find_value_for("SOURCE")
262
243
  return nil if source_val.nil?
263
244
 
264
- SourceParser.new(source_val, term_domain)
245
+ sources = SourceParser.new(source_val, term_domain)
265
246
  .parsed_sources
266
247
  .compact
267
- .map do |source|
268
- source.merge({ "type" => "authoritative" })
269
- end
248
+
249
+ sources.each { |src| src.type = "authoritative" }
250
+ sources.empty? ? nil : sources
270
251
  end
271
252
 
272
253
  def extract_superseded_concepts
@@ -279,9 +260,7 @@ module Iev
279
260
  private
280
261
 
281
262
  def build_expression_designation(raw_term, attribute_data:, status:)
282
- term = Iev::Converter.mathml_to_asciimath(
283
- parse_anchor_tag(raw_term, term_domain),
284
- )
263
+ term = convert_content(raw_term)
285
264
  term_attributes = TermAttrsParser.new(attribute_data.to_s)
286
265
 
287
266
  statuses = {
@@ -289,29 +268,56 @@ module Iev
289
268
  "напуштен" => "deprecated",
290
269
  }
291
270
 
292
- {
293
- "type" => "expression",
294
- "prefix" => term_attributes.prefix,
295
- "normative_status" => statuses[status] || status,
296
- "usage_info" => term_attributes.usage_info,
297
- "designation" => term,
298
- "part_of_speech" => term_attributes.part_of_speech,
299
- "geographical_area" => term_attributes.geographical_area,
300
- "gender" => term_attributes.gender,
301
- "plurality" => term_attributes.plurality,
271
+ grammar_info = term_attributes.to_grammar_info
272
+ attrs = {
273
+ designation: term,
274
+ normative_status: statuses[status] || status,
275
+ geographical_area: term_attributes.geographical_area,
276
+ prefix: term_attributes.prefix,
277
+ usage_info: term_attributes.usage_info,
278
+ grammar_info: grammar_info ? [grammar_info] : nil,
302
279
  }.compact
280
+
281
+ Glossarist::Designation::Expression.new(**attrs)
303
282
  end
304
283
 
305
284
  def build_symbol_designation(raw_term)
306
- term = Iev::Converter.mathml_to_asciimath(
307
- parse_anchor_tag(raw_term, term_domain),
285
+ term = convert_content(raw_term)
286
+
287
+ Glossarist::Designation::Symbol.new(
288
+ designation: term,
289
+ international: true,
308
290
  )
291
+ end
309
292
 
310
- {
311
- "type" => "symbol",
312
- "designation" => term,
313
- "international" => true,
314
- }.compact
293
+ def convert_content(str)
294
+ stripped = strip_html_comments(str.to_s)
295
+ Iev::Converter.mathml_to_asciimath(
296
+ replace_newlines(parse_anchor_tag(stripped, term_domain)),
297
+ ).strip
298
+ end
299
+
300
+ def strip_html_comments(str)
301
+ doc = Nokogiri::HTML::DocumentFragment.parse(str)
302
+ comments = doc.children.select(&:comment?)
303
+ return str if comments.empty?
304
+
305
+ result = str.dup
306
+ comments.each { |c| result = result.gsub("<!--#{c.content}-->", "") }
307
+ result
308
+ end
309
+
310
+ # Remove leading numbering artifacts from extracted notes/examples.
311
+ # The definition text sometimes duplicates note/example numbers:
312
+ # "1 A time interval comprises..." (note)
313
+ # "1: In a vending machine..." (example)
314
+ # "2 à l'article: ..." (French note)
315
+ # ": Par la réticulation..." (French note)
316
+ def clean_extracted_text(str)
317
+ # Strip leading number + optional separator (colon, em-space, etc.)
318
+ str.gsub(/\A\s*\d+[\s: ]*\s*/, "")
319
+ # Strip leading standalone colon (French style: ": text")
320
+ .gsub(/\A\s*:\s*/, "")
315
321
  end
316
322
  end
317
323
  end
data/lib/iev/utilities.rb CHANGED
@@ -2,57 +2,106 @@
2
2
 
3
3
  module Iev
4
4
  module Utilities
5
- SIMG_PATH_REGEX = "<simg .*\\/\\$file\\/([\\d\\-\\w\.]+)>"
6
- FIGURE_ONE_REGEX =
7
- '<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+(.+)\\s*<\\/b>(<\\/p>)?'
8
- FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}".freeze
9
5
  IMAGE_PATH_PREFIX = "image::/assets/images/parts"
6
+ IEV_CODE_RE = /\A(IEV)?\s*(\d{2,3}-\d{2,3}-\d{2,3})\z/
7
+
8
+ # SIMG/Figure patterns — custom IEV XML, pre-processed before Nokogiri.
9
+ # Uses [^>] and [^<] instead of . to avoid polynomial backtracking.
10
+ SIMG_PATH_REGEX = /<simg [^>]*\/\$file\/([\d\-\w.]+)>/
11
+ FIGURE_ONE_REGEX = '<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+([^<]+)\\s*<\\/b>(<\\/p>)?'
12
+ FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}".freeze
10
13
 
11
14
  def parse_anchor_tag(text, term_domain)
12
- return unless text
13
-
14
- # Convert IEV term references
15
- # Convert href links
16
- # Need to take care of this pattern:
17
- # `inverse de la <a href="IEV103-06-01">période<a>`
18
- text.gsub(
19
- %r{<a href="?(IEV)\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)</?a>},
20
- '{{\3, \1:\2}}',
21
- ).gsub(
22
- %r{<a href="?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)</?a>},
23
- '{{\3, IEV:\2}}',
24
- ).gsub(
25
- # To handle <a> tags without ending tag like
26
- # `Voir <a href=IEV103-05-21>IEV 103-05-21`
27
- # for concept '702-03-11' in `fr`
28
- /<a href="?(IEV)?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)$/,
29
- '{{\3, IEV:\2}}',
30
- ).gsub(
31
- %r{<a href="?([^<>]*?)"?>(.*?)</a>},
32
- '\1[\2]',
33
- ).gsub(
34
- Regexp.new([SIMG_PATH_REGEX, '\\s*', FIGURE_TWO_REGEX].join),
35
- "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6]",
36
- ).gsub(
37
- Regexp.new([SIMG_PATH_REGEX, '\\s*', FIGURE_ONE_REGEX].join),
38
- "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
39
- ).gsub(
40
- /<img\s+([^<>]+?)\s*>/,
41
- "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]",
42
- ).gsub(
43
- /<br>/,
44
- "\n",
45
- ).gsub(
46
- %r{<b>(.*?)</b>},
47
- '*\\1*',
48
- )
15
+ return nil if text.nil?
16
+
17
+ text = process_simg_figures(text, term_domain)
18
+ text = fix_unquoted_href(text)
19
+
20
+ doc = Nokogiri::HTML::DocumentFragment.parse(text)
21
+ nodes_to_adoc(doc.children, term_domain)
49
22
  end
50
23
 
51
24
  def replace_newlines(input)
52
- input.gsub('\n', "\n\n")
25
+ input
26
+ .gsub('\n', "\n\n")
53
27
  .gsub(/<[pbr]+>/, "\n\n")
28
+ .gsub(/<br\s*\/?>/, "\n\n")
54
29
  .gsub(/\s*\n[\n\s]+/, "\n\n")
55
30
  .strip
56
31
  end
32
+
33
+ private
34
+
35
+ # IEV data has unquoted href with spaces, e.g.
36
+ # <a href=IEV 102-01-10>...</a>
37
+ # Nokogiri stops at first space, so add quotes.
38
+ # Uses a specific IEV code pattern to avoid regex backtracking.
39
+ def fix_unquoted_href(text)
40
+ text.gsub(/href=(IEV\s\d{2,3}-\d{2,3}-\d{2,3})(?=[>\s])/) do
41
+ "href=\"#{Regexp.last_match(1)}\""
42
+ end
43
+ end
44
+
45
+ def process_simg_figures(text, term_domain)
46
+ text = text.gsub(
47
+ Regexp.new([SIMG_PATH_REGEX.source, '\s*', FIGURE_TWO_REGEX].join),
48
+ "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6 - \\7]",
49
+ )
50
+ text = text.gsub(
51
+ Regexp.new([SIMG_PATH_REGEX.source, '\s*', FIGURE_ONE_REGEX].join),
52
+ "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
53
+ )
54
+ text.gsub(SIMG_PATH_REGEX, "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]")
55
+ end
56
+
57
+ def nodes_to_adoc(nodes, term_domain)
58
+ nodes.map { |n| node_to_adoc(n, term_domain) }.join
59
+ end
60
+
61
+ def node_to_adoc(node, term_domain)
62
+ case node
63
+ when Nokogiri::XML::Text
64
+ node.text
65
+ when Nokogiri::XML::Comment
66
+ ""
67
+ when Nokogiri::XML::Element
68
+ element_to_adoc(node, term_domain)
69
+ else
70
+ ""
71
+ end
72
+ end
73
+
74
+ def element_to_adoc(node, term_domain)
75
+ inner = nodes_to_adoc(node.children, term_domain)
76
+
77
+ case node.name
78
+ when "a"
79
+ convert_link(node, inner)
80
+ when "b"
81
+ "*#{inner}*"
82
+ when "br"
83
+ "\n"
84
+ when "img"
85
+ src = node["src"] || node.attributes.keys.first.to_s
86
+ "#{IMAGE_PATH_PREFIX}/#{term_domain}/#{src}[]"
87
+ when "p", "div", "span"
88
+ inner
89
+ else
90
+ node.to_s
91
+ end
92
+ end
93
+
94
+ def convert_link(node, inner)
95
+ href = (node["href"] || "").to_s.strip
96
+
97
+ if href.match?(IEV_CODE_RE)
98
+ iev_code = href.sub(/\AIEV\s*/, "")
99
+ "{{#{inner}, IEV:#{iev_code}}}"
100
+ elsif !href.empty?
101
+ "#{href}[#{inner}]"
102
+ else
103
+ inner
104
+ end
105
+ end
57
106
  end
58
107
  end
data/lib/iev/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iev
4
- VERSION = "0.3.9"
4
+ VERSION = "0.4.0"
5
5
  end
data/lib/iev.rb CHANGED
@@ -1,68 +1,80 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "iev/version"
4
- require "iev/db"
5
- require "mechanize"
6
- require "nokogiri"
4
+ require "iev/config"
5
+ require "iev/data_source"
7
6
 
8
- require "benchmark"
9
- require "creek"
10
- require "unitsml"
11
- require "plurimath"
12
- require "glossarist"
13
- require "relaton"
14
- require "relaton_bib"
15
- require "sequel"
16
- require "thor"
17
7
  require "yaml"
18
8
 
9
+ # plurimath and unitsml both depend on mml, which has a transitive
10
+ # dependency version mismatch with lutaml-model in some environments.
11
+ # Load them when available; the DataSource APIs work without them.
12
+ begin
13
+ require "plurimath"
14
+ rescue LoadError
15
+ nil
16
+ end
17
+
18
+ begin
19
+ require "unitsml"
20
+ rescue LoadError
21
+ nil
22
+ end
23
+
19
24
  module Iev
20
25
  autoload :Cli, "iev/cli"
26
+ autoload :Config, "iev/config"
21
27
  autoload :Converter, "iev/converter"
22
28
  autoload :DataConversions, "iev/data_conversions"
23
- autoload :Db, "iev/db"
24
- autoload :DbCache, "iev/db_cache"
29
+ autoload :DataSource, "iev/data_source"
25
30
  autoload :DbWriter, "iev/db_writer"
31
+ autoload :Exporter, "iev/exporter"
26
32
  autoload :Iso639Code, "iev/iso_639_code"
27
33
  autoload :Profiler, "iev/profiler"
28
34
  autoload :RelatonDb, "iev/relaton_db"
35
+ autoload :Scraper, "iev/scraper"
29
36
  autoload :SourceParser, "iev/source_parser"
30
37
  autoload :SupersessionParser, "iev/supersession_parser"
31
38
  autoload :TermAttrsParser, "iev/term_attrs_parser"
32
39
  autoload :TermBuilder, "iev/term_builder"
33
40
  autoload :Utilities, "iev/utilities"
34
41
 
35
- #
36
- # Scrape Electropedia for term.
42
+ # Fetch term designation from IEV data.
37
43
  #
38
44
  # @param [String] code for example "103-01-02"
39
45
  # @param [String] lang language code, for example "en"
40
46
  #
41
- # @return [String, nil] if found than term,
42
- # if code not found then empty string,
47
+ # @return [String, nil] if found then term,
48
+ # if code not found then nil,
43
49
  # if language not found then nil.
44
50
  #
45
51
  def self.get(code, lang)
46
- doc = get_doc(code)
47
- xpath = "//table/tr/td/div/font[.=\"#{lang}\"]/../../"\
48
- "following-sibling::td[2]"
49
- a = doc&.at(xpath)&.children&.to_xml
50
- a&.sub(%r{<br/>.*$}, "")
51
- &.sub(/, &lt;.*$/, "")
52
- &.gsub(/<[^<>]*>/, "")&.strip
52
+ DataSource.fetch_term_designation(code, lang)
53
53
  end
54
54
 
55
- def self.get_doc(code)
56
- url = "https://www.electropedia.org/iev/iev.nsf/"\
57
- "display?openform&ievref=#{code}"
55
+ # Fetch full concept data (all languages) for a given IEV code.
56
+ #
57
+ # @param [String] code IEV code, e.g. "103-01-02"
58
+ # @return [Hash, nil] concept data hash with all languages
59
+ def self.fetch_concept(code)
60
+ DataSource.fetch_concept(code)
61
+ end
58
62
 
59
- # Use Mechanize with User-Agent to avoid 403 Forbidden errors from bot detection
60
- agent = Mechanize.new
61
- agent.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
63
+ # Fetch localized term data for a given IEV code and language.
64
+ #
65
+ # @param [String] code IEV code, e.g. "103-01-02"
66
+ # @param [String] lang language code, e.g. "en" or "eng"
67
+ # @return [Hash, nil] localized concept data
68
+ def self.fetch_term(code, lang)
69
+ DataSource.fetch_term(code, lang)
70
+ end
62
71
 
63
- page = agent.get(url)
64
- page.parser # Nokogiri document
72
+ # Scrape concept data from Electropedia for a given IEV code.
73
+ # Uses Ferrum (headless Chrome) to handle AWS WAF challenge.
74
+ #
75
+ # @param code [String] IEV code, e.g. "103-01-02"
76
+ # @return [Hash, nil] concept data hash or nil if not found
77
+ def self.scrape_concept(code)
78
+ Scraper.new.fetch_concept(code)
65
79
  end
66
80
  end
67
-
68
- require_relative "iev/cli"