iev 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 382780f439ab289a11d44053962bb7dbf043688fd68cb2328b8746429c19c6d3
4
- data.tar.gz: 72151b973c479bfc82e61b3d0da4648f11ac54e9344b8c71e3bbf5d91a3a3c3b
3
+ metadata.gz: 3d7d47000d44c3dcf02934083f9d05b0a195491d6ef130cdc9ca198feeb0d8ad
4
+ data.tar.gz: 84b2da3efd17c94b8278bdbe963f1d356433fb0d20ccc6e06e1c2bffd05e8851
5
5
  SHA512:
6
- metadata.gz: d36fad5c1d853d39ffa5d8d67e98ecaedd3c40206237706ccb177ff14957ebea9616420bd41878b84ac34cdab1e9c5c7890143024b0a3070fb6c8a994d769518
7
- data.tar.gz: 8c5e7f0a9510fd562fc92db4ba225e5f9b7580e0e703525ec062b3d404ab57c804da824f94af0e3047738033577aaae8394970d47c349b1ac6b1979f7fbfa02d
6
+ metadata.gz: ba7fbe8bbdcbdec627b43aa62288d2b282a5476a2616ae74a56afcc81c7b983c0190dd43e92378024ad11155c0c9f9b5cf2f2ae4042c4c2536c68cecb4380262
7
+ data.tar.gz: ee080f35c1d17e1309217def4db29acc019e58c5ad3d104efbfc351155addc43c38a1e0602033024d213e91ce13692084f868cc1f1b64521145d90a763ae2f85
@@ -1,7 +1,7 @@
1
1
  name: rake
2
2
 
3
3
  permissions:
4
- contents: read
4
+ contents: write
5
5
 
6
6
  on:
7
7
  push:
@@ -34,6 +34,8 @@ module Iev
34
34
  desc: "Enables debug messages about authoritative sources recognition"
35
35
  option :debug_relaton, type: :boolean, default: false,
36
36
  desc: "Enables debug messages about Relaton integration"
37
+ option :relaton, type: :boolean, default: false,
38
+ desc: "Fetch source URLs via Relaton (slow, makes network requests)"
37
39
  def export(file)
38
40
  handle_generic_options(options)
39
41
 
@@ -42,6 +44,7 @@ module Iev
42
44
  output_dir: options[:output],
43
45
  only_concepts: options[:only_concepts],
44
46
  only_languages: options[:only_languages],
47
+ fetch_relaton_links: options[:relaton],
45
48
  ).export
46
49
 
47
50
  info "Done!"
@@ -48,8 +48,17 @@ module Iev
48
48
  mathml_to_asciimath(input)
49
49
  end
50
50
 
51
+ # Clear the Plurimath expression cache. Call between export runs.
52
+ def clear_cache
53
+ @math_cache = nil
54
+ end
55
+
51
56
  private
52
57
 
58
+ def math_cache
59
+ @math_cache ||= {}
60
+ end
61
+
53
62
  def mathml_to_asciimath(input)
54
63
  return input unless input&.match?(/<|&/)
55
64
 
@@ -58,17 +67,18 @@ module Iev
58
67
  to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
59
68
 
60
69
  to_asciimath.css("math").each do |math_element|
61
- asciimath = Plurimath::Math.parse(
62
- math_element.to_xml, :mathml
63
- ).to_asciimath.strip
70
+ math_xml = math_element.to_xml
71
+ asciimath = math_cache[math_xml] ||= begin
72
+ Plurimath::Math.parse(math_xml, :mathml).to_asciimath.strip
73
+ rescue Plurimath::Math::ParseError
74
+ ""
75
+ end
64
76
 
65
77
  if asciimath.empty?
66
78
  math_element.remove
67
79
  else
68
80
  math_element.replace "stem:[#{asciimath}]"
69
81
  end
70
- rescue Plurimath::Math::ParseError
71
- math_element.remove
72
82
  end
73
83
 
74
84
  html_to_asciimath(
@@ -79,6 +89,13 @@ module Iev
79
89
  def html_to_asciimath(input)
80
90
  return input if input.nil? || input.empty?
81
91
 
92
+ # Fast path: if no HTML elements remain that need Nokogiri processing
93
+ # (after parse_anchor_tag handles <i>/<sub>/<sup>/<ol>/<ul>/<font>),
94
+ # just do the Greek entity replacement.
95
+ unless input.match?(/<([iI]|sub|sup|ol|ul|font)\b/)
96
+ return html_entities_to_stem(input)
97
+ end
98
+
82
99
  to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
83
100
 
84
101
  to_asciimath.css("i").each do |math_element|
data/lib/iev/exporter.rb CHANGED
@@ -27,12 +27,15 @@ module Iev
27
27
  # @param output_dir [String, Pathname] destination for YAML files
28
28
  # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
29
29
  # @param only_languages [String, nil] comma-separated language codes
30
+ # @param fetch_relaton_links [Boolean] whether to fetch source URLs via Relaton
30
31
  def initialize(input_path, output_dir: Dir.pwd,
31
- only_concepts: nil, only_languages: nil)
32
+ only_concepts: nil, only_languages: nil,
33
+ fetch_relaton_links: false)
32
34
  @input_path = Pathname.new(input_path)
33
35
  validate_input!
34
36
 
35
37
  @output_dir = Pathname.new(output_dir)
38
+ @fetch_relaton_links = fetch_relaton_links
36
39
  @filters = {
37
40
  only_concepts: only_concepts,
38
41
  only_languages: only_languages,
@@ -102,15 +105,28 @@ module Iev
102
105
  end
103
106
 
104
107
  def build_collection(dataset)
105
- Glossarist::ManagedConceptCollection.new.tap do |collection|
106
- dataset.each do |row|
107
- term = TermBuilder.build_from(row)
108
- next unless term
108
+ SourceParser.relaton_enabled = @fetch_relaton_links
109
109
 
110
- concept = collection.fetch_or_initialize(term.id)
111
- concept.add_l10n(term)
110
+ # Use a hash index for O(1) concept lookup instead of
111
+ # Glossarist's O(n) fetch_or_initialize which does linear scan.
112
+ concept_index = {}
113
+ collection = Glossarist::ManagedConceptCollection.new
114
+
115
+ dataset.each do |row|
116
+ term = TermBuilder.build_from(row)
117
+ next unless term
118
+
119
+ concept = concept_index[term.id] ||= begin
120
+ c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
121
+ collection.store(c)
122
+ c
112
123
  end
124
+ concept.add_l10n(term)
113
125
  end
126
+
127
+ collection
128
+ ensure
129
+ SourceParser.relaton_enabled = true
114
130
  end
115
131
 
116
132
  def save_collection(collection)
@@ -14,6 +14,12 @@ module Iev
14
14
  include Utilities
15
15
  using DataConversions
16
16
 
17
+ # When false, obtain_source_link skips Relaton network calls.
18
+ @relaton_enabled = true
19
+ class << self
20
+ attr_accessor :relaton_enabled
21
+ end
22
+
17
23
  attr_reader :src_split, :parsed_sources, :raw_str, :src_str
18
24
 
19
25
  def initialize(source_str, term_domain)
@@ -89,7 +95,7 @@ module Iev
89
95
  origin: origin,
90
96
  modification: relationship[:modification],
91
97
  )
92
- rescue ::RelatonBib::RequestError => e
98
+ rescue ::RelatonBib::RequestError, Socket::ResolutionError, SocketError => e
93
99
  warn e.message
94
100
  end
95
101
 
@@ -347,10 +353,11 @@ module Iev
347
353
 
348
354
  # Uses Relaton to obtain link for given source ref.
349
355
  def obtain_source_link(ref)
356
+ return nil unless self.class.relaton_enabled
350
357
  return nil unless defined?(RelatonDb)
351
358
 
352
359
  RelatonDb.instance.fetch(ref)&.url
353
- rescue ::RelatonBib::RequestError => e
360
+ rescue ::RelatonBib::RequestError, Socket::ResolutionError, SocketError => e
354
361
  warn e.message
355
362
  nil
356
363
  end
@@ -298,6 +298,8 @@ module Iev
298
298
  end
299
299
 
300
300
  def strip_html_comments(str)
301
+ return str unless str.include?("<!--")
302
+
301
303
  doc = Nokogiri::HTML::DocumentFragment.parse(str)
302
304
  comments = doc.children.select(&:comment?)
303
305
  return str if comments.empty?
data/lib/iev/utilities.rb CHANGED
@@ -13,10 +13,14 @@ module Iev
13
13
 
14
14
  def parse_anchor_tag(text, term_domain)
15
15
  return nil if text.nil?
16
+ return text unless text.include?("<")
16
17
 
17
18
  text = process_simg_figures(text, term_domain)
18
19
  text = fix_unquoted_href(text)
19
20
 
21
+ # Second check: regex substitutions may have consumed all tags
22
+ return text unless text.include?("<")
23
+
20
24
  doc = Nokogiri::HTML::DocumentFragment.parse(text)
21
25
  nodes_to_adoc(doc.children, term_domain)
22
26
  end
@@ -86,11 +90,45 @@ module Iev
86
90
  "#{IMAGE_PATH_PREFIX}/#{term_domain}/#{src}[]"
87
91
  when "p", "div", "span"
88
92
  inner
93
+ when "i"
94
+ convert_italic(inner)
95
+ when "sub"
96
+ inner.empty? ? "" : "~#{inner}~"
97
+ when "sup"
98
+ inner.empty? ? "" : "^#{inner}^"
99
+ when "ol"
100
+ convert_list(node, ". ")
101
+ when "ul"
102
+ convert_list(node, "* ")
103
+ when "li"
104
+ inner
105
+ when "font"
106
+ convert_font(node, inner)
89
107
  else
90
108
  node.to_s
91
109
  end
92
110
  end
93
111
 
112
+ def convert_italic(text)
113
+ case text.length
114
+ when 0
115
+ ""
116
+ when 1..12
117
+ "stem:[#{text}]"
118
+ else
119
+ "_#{text}_"
120
+ end
121
+ end
122
+
123
+ def convert_list(node, prefix)
124
+ node.css("li").map { |li| "#{prefix}#{li.text}" }.join
125
+ end
126
+
127
+ def convert_font(node, inner)
128
+ style = node["style"].to_s
129
+ style.include?("sans-serif") ? "`#{inner}`" : inner
130
+ end
131
+
94
132
  def convert_link(node, inner)
95
133
  href = (node["href"] || "").to_s.strip
96
134
 
data/lib/iev/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iev
4
- VERSION = "0.4.0"
4
+ VERSION = "0.4.1"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iev
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-27 00:00:00.000000000 Z
11
+ date: 2026-04-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: creek