iev 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -1
- data/lib/iev/cli/command.rb +3 -0
- data/lib/iev/converter/mathml_to_asciimath.rb +22 -5
- data/lib/iev/exporter.rb +23 -7
- data/lib/iev/source_parser.rb +9 -2
- data/lib/iev/term_builder.rb +2 -0
- data/lib/iev/utilities.rb +38 -0
- data/lib/iev/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3d7d47000d44c3dcf02934083f9d05b0a195491d6ef130cdc9ca198feeb0d8ad
|
|
4
|
+
data.tar.gz: 84b2da3efd17c94b8278bdbe963f1d356433fb0d20ccc6e06e1c2bffd05e8851
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ba7fbe8bbdcbdec627b43aa62288d2b282a5476a2616ae74a56afcc81c7b983c0190dd43e92378024ad11155c0c9f9b5cf2f2ae4042c4c2536c68cecb4380262
|
|
7
|
+
data.tar.gz: ee080f35c1d17e1309217def4db29acc019e58c5ad3d104efbfc351155addc43c38a1e0602033024d213e91ce13692084f868cc1f1b64521145d90a763ae2f85
|
data/.github/workflows/rake.yml
CHANGED
data/lib/iev/cli/command.rb
CHANGED
|
@@ -34,6 +34,8 @@ module Iev
|
|
|
34
34
|
desc: "Enables debug messages about authoritative sources recognition"
|
|
35
35
|
option :debug_relaton, type: :boolean, default: false,
|
|
36
36
|
desc: "Enables debug messages about Relaton integration"
|
|
37
|
+
option :relaton, type: :boolean, default: false,
|
|
38
|
+
desc: "Fetch source URLs via Relaton (slow, makes network requests)"
|
|
37
39
|
def export(file)
|
|
38
40
|
handle_generic_options(options)
|
|
39
41
|
|
|
@@ -42,6 +44,7 @@ module Iev
|
|
|
42
44
|
output_dir: options[:output],
|
|
43
45
|
only_concepts: options[:only_concepts],
|
|
44
46
|
only_languages: options[:only_languages],
|
|
47
|
+
fetch_relaton_links: options[:relaton],
|
|
45
48
|
).export
|
|
46
49
|
|
|
47
50
|
info "Done!"
|
|
@@ -48,8 +48,17 @@ module Iev
|
|
|
48
48
|
mathml_to_asciimath(input)
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
+
# Clear the Plurimath expression cache. Call between export runs.
|
|
52
|
+
def clear_cache
|
|
53
|
+
@math_cache = nil
|
|
54
|
+
end
|
|
55
|
+
|
|
51
56
|
private
|
|
52
57
|
|
|
58
|
+
def math_cache
|
|
59
|
+
@math_cache ||= {}
|
|
60
|
+
end
|
|
61
|
+
|
|
53
62
|
def mathml_to_asciimath(input)
|
|
54
63
|
return input unless input&.match?(/<|&/)
|
|
55
64
|
|
|
@@ -58,17 +67,18 @@ module Iev
|
|
|
58
67
|
to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
|
|
59
68
|
|
|
60
69
|
to_asciimath.css("math").each do |math_element|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
70
|
+
math_xml = math_element.to_xml
|
|
71
|
+
asciimath = math_cache[math_xml] ||= begin
|
|
72
|
+
Plurimath::Math.parse(math_xml, :mathml).to_asciimath.strip
|
|
73
|
+
rescue Plurimath::Math::ParseError
|
|
74
|
+
""
|
|
75
|
+
end
|
|
64
76
|
|
|
65
77
|
if asciimath.empty?
|
|
66
78
|
math_element.remove
|
|
67
79
|
else
|
|
68
80
|
math_element.replace "stem:[#{asciimath}]"
|
|
69
81
|
end
|
|
70
|
-
rescue Plurimath::Math::ParseError
|
|
71
|
-
math_element.remove
|
|
72
82
|
end
|
|
73
83
|
|
|
74
84
|
html_to_asciimath(
|
|
@@ -79,6 +89,13 @@ module Iev
|
|
|
79
89
|
def html_to_asciimath(input)
|
|
80
90
|
return input if input.nil? || input.empty?
|
|
81
91
|
|
|
92
|
+
# Fast path: if no HTML elements remain that need Nokogiri processing
|
|
93
|
+
# (after parse_anchor_tag handles <i>/<sub>/<sup>/<ol>/<ul>/<font>),
|
|
94
|
+
# just do the Greek entity replacement.
|
|
95
|
+
unless input.match?(/<([iI]|sub|sup|ol|ul|font)\b/)
|
|
96
|
+
return html_entities_to_stem(input)
|
|
97
|
+
end
|
|
98
|
+
|
|
82
99
|
to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
|
|
83
100
|
|
|
84
101
|
to_asciimath.css("i").each do |math_element|
|
data/lib/iev/exporter.rb
CHANGED
|
@@ -27,12 +27,15 @@ module Iev
|
|
|
27
27
|
# @param output_dir [String, Pathname] destination for YAML files
|
|
28
28
|
# @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
|
|
29
29
|
# @param only_languages [String, nil] comma-separated language codes
|
|
30
|
+
# @param fetch_relaton_links [Boolean] whether to fetch source URLs via Relaton
|
|
30
31
|
def initialize(input_path, output_dir: Dir.pwd,
|
|
31
|
-
only_concepts: nil, only_languages: nil
|
|
32
|
+
only_concepts: nil, only_languages: nil,
|
|
33
|
+
fetch_relaton_links: false)
|
|
32
34
|
@input_path = Pathname.new(input_path)
|
|
33
35
|
validate_input!
|
|
34
36
|
|
|
35
37
|
@output_dir = Pathname.new(output_dir)
|
|
38
|
+
@fetch_relaton_links = fetch_relaton_links
|
|
36
39
|
@filters = {
|
|
37
40
|
only_concepts: only_concepts,
|
|
38
41
|
only_languages: only_languages,
|
|
@@ -102,15 +105,28 @@ module Iev
|
|
|
102
105
|
end
|
|
103
106
|
|
|
104
107
|
def build_collection(dataset)
|
|
105
|
-
|
|
106
|
-
dataset.each do |row|
|
|
107
|
-
term = TermBuilder.build_from(row)
|
|
108
|
-
next unless term
|
|
108
|
+
SourceParser.relaton_enabled = @fetch_relaton_links
|
|
109
109
|
|
|
110
|
-
|
|
111
|
-
|
|
110
|
+
# Use a hash index for O(1) concept lookup instead of
|
|
111
|
+
# Glossarist's O(n) fetch_or_initialize which does linear scan.
|
|
112
|
+
concept_index = {}
|
|
113
|
+
collection = Glossarist::ManagedConceptCollection.new
|
|
114
|
+
|
|
115
|
+
dataset.each do |row|
|
|
116
|
+
term = TermBuilder.build_from(row)
|
|
117
|
+
next unless term
|
|
118
|
+
|
|
119
|
+
concept = concept_index[term.id] ||= begin
|
|
120
|
+
c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
|
|
121
|
+
collection.store(c)
|
|
122
|
+
c
|
|
112
123
|
end
|
|
124
|
+
concept.add_l10n(term)
|
|
113
125
|
end
|
|
126
|
+
|
|
127
|
+
collection
|
|
128
|
+
ensure
|
|
129
|
+
SourceParser.relaton_enabled = true
|
|
114
130
|
end
|
|
115
131
|
|
|
116
132
|
def save_collection(collection)
|
data/lib/iev/source_parser.rb
CHANGED
|
@@ -14,6 +14,12 @@ module Iev
|
|
|
14
14
|
include Utilities
|
|
15
15
|
using DataConversions
|
|
16
16
|
|
|
17
|
+
# When false, obtain_source_link skips Relaton network calls.
|
|
18
|
+
@relaton_enabled = true
|
|
19
|
+
class << self
|
|
20
|
+
attr_accessor :relaton_enabled
|
|
21
|
+
end
|
|
22
|
+
|
|
17
23
|
attr_reader :src_split, :parsed_sources, :raw_str, :src_str
|
|
18
24
|
|
|
19
25
|
def initialize(source_str, term_domain)
|
|
@@ -89,7 +95,7 @@ module Iev
|
|
|
89
95
|
origin: origin,
|
|
90
96
|
modification: relationship[:modification],
|
|
91
97
|
)
|
|
92
|
-
rescue ::RelatonBib::RequestError => e
|
|
98
|
+
rescue ::RelatonBib::RequestError, Socket::ResolutionError, SocketError => e
|
|
93
99
|
warn e.message
|
|
94
100
|
end
|
|
95
101
|
|
|
@@ -347,10 +353,11 @@ module Iev
|
|
|
347
353
|
|
|
348
354
|
# Uses Relaton to obtain link for given source ref.
|
|
349
355
|
def obtain_source_link(ref)
|
|
356
|
+
return nil unless self.class.relaton_enabled
|
|
350
357
|
return nil unless defined?(RelatonDb)
|
|
351
358
|
|
|
352
359
|
RelatonDb.instance.fetch(ref)&.url
|
|
353
|
-
rescue ::RelatonBib::RequestError => e
|
|
360
|
+
rescue ::RelatonBib::RequestError, Socket::ResolutionError, SocketError => e
|
|
354
361
|
warn e.message
|
|
355
362
|
nil
|
|
356
363
|
end
|
data/lib/iev/term_builder.rb
CHANGED
data/lib/iev/utilities.rb
CHANGED
|
@@ -13,10 +13,14 @@ module Iev
|
|
|
13
13
|
|
|
14
14
|
def parse_anchor_tag(text, term_domain)
|
|
15
15
|
return nil if text.nil?
|
|
16
|
+
return text unless text.include?("<")
|
|
16
17
|
|
|
17
18
|
text = process_simg_figures(text, term_domain)
|
|
18
19
|
text = fix_unquoted_href(text)
|
|
19
20
|
|
|
21
|
+
# Second check: regex substitutions may have consumed all tags
|
|
22
|
+
return text unless text.include?("<")
|
|
23
|
+
|
|
20
24
|
doc = Nokogiri::HTML::DocumentFragment.parse(text)
|
|
21
25
|
nodes_to_adoc(doc.children, term_domain)
|
|
22
26
|
end
|
|
@@ -86,11 +90,45 @@ module Iev
|
|
|
86
90
|
"#{IMAGE_PATH_PREFIX}/#{term_domain}/#{src}[]"
|
|
87
91
|
when "p", "div", "span"
|
|
88
92
|
inner
|
|
93
|
+
when "i"
|
|
94
|
+
convert_italic(inner)
|
|
95
|
+
when "sub"
|
|
96
|
+
inner.empty? ? "" : "~#{inner}~"
|
|
97
|
+
when "sup"
|
|
98
|
+
inner.empty? ? "" : "^#{inner}^"
|
|
99
|
+
when "ol"
|
|
100
|
+
convert_list(node, ". ")
|
|
101
|
+
when "ul"
|
|
102
|
+
convert_list(node, "* ")
|
|
103
|
+
when "li"
|
|
104
|
+
inner
|
|
105
|
+
when "font"
|
|
106
|
+
convert_font(node, inner)
|
|
89
107
|
else
|
|
90
108
|
node.to_s
|
|
91
109
|
end
|
|
92
110
|
end
|
|
93
111
|
|
|
112
|
+
def convert_italic(text)
|
|
113
|
+
case text.length
|
|
114
|
+
when 0
|
|
115
|
+
""
|
|
116
|
+
when 1..12
|
|
117
|
+
"stem:[#{text}]"
|
|
118
|
+
else
|
|
119
|
+
"_#{text}_"
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
def convert_list(node, prefix)
|
|
124
|
+
node.css("li").map { |li| "#{prefix}#{li.text}" }.join
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
def convert_font(node, inner)
|
|
128
|
+
style = node["style"].to_s
|
|
129
|
+
style.include?("sans-serif") ? "`#{inner}`" : inner
|
|
130
|
+
end
|
|
131
|
+
|
|
94
132
|
def convert_link(node, inner)
|
|
95
133
|
href = (node["href"] || "").to_s.strip
|
|
96
134
|
|
data/lib/iev/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: iev
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.0
|
|
4
|
+
version: 0.4.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: creek
|