iev 0.3.9 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -3
- data/.github/workflows/release.yml +3 -1
- data/.gitignore +3 -1
- data/CLAUDE.md +50 -0
- data/Gemfile +3 -0
- data/README.adoc +65 -15
- data/exe/iev +11 -0
- data/iev.gemspec +5 -4
- data/lib/iev/cli/command.rb +122 -76
- data/lib/iev/cli/command_helper.rb +55 -36
- data/lib/iev/config.rb +31 -0
- data/lib/iev/converter/mathml_to_asciimath.rb +137 -159
- data/lib/iev/data_source.rb +124 -0
- data/lib/iev/exporter.rb +138 -0
- data/lib/iev/scraper/page_parser.rb +176 -0
- data/lib/iev/scraper.rb +135 -0
- data/lib/iev/source_parser.rb +39 -19
- data/lib/iev/supersession_parser.rb +9 -13
- data/lib/iev/term_attrs_parser.rb +21 -7
- data/lib/iev/term_builder.rb +102 -94
- data/lib/iev/utilities.rb +129 -42
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +47 -35
- metadata +34 -13
- data/lib/iev/db.rb +0 -82
- data/lib/iev/db_cache.rb +0 -124
|
@@ -5,190 +5,168 @@ module Iev
|
|
|
5
5
|
class MathmlToAsciimath
|
|
6
6
|
using DataConversions
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
# Lookup table from a Greek letter (hash key) to the AsciiMath symbol name
# that replaces it (hash value). Iterated by html_entities_to_asciimath and
# html_entities_to_stem; frozen so the shared table cannot be mutated.
#
# NOTE(review): the keys appear as literal Greek characters in this view;
# confirm they are not HTML entities in the checked-in source before editing.
GREEK_ENTITIES = {
  # alpha .. epsilon
  "α" => "alpha",
  "β" => "beta",
  "γ" => "gamma",
  "Γ" => "Gamma",
  "δ" => "delta",
  "Δ" => "Delta",
  "ε" => "epsilon",
  "ϵ" => "varepsilon",
  # zeta .. lambda
  "ζ" => "zeta",
  "η" => "eta",
  "θ" => "theta",
  "Θ" => "Theta",
  "ϑ" => "vartheta",
  "ι" => "iota",
  "κ" => "kappa",
  "λ" => "lambda",
  "Λ" => "Lambda",
  # mu .. rho
  "μ" => "mu",
  "ν" => "nu",
  "ξ" => "xi",
  "Ξ" => "Xi",
  "π" => "pi",
  "Π" => "Pi",
  "ρ" => "rho",
  # sigma .. omega
  "σ" => "sigma",
  "Σ" => "Sigma",
  "τ" => "tau",
  "υ" => "upsilon",
  "φ" => "phi",
  "Φ" => "Phi",
  "ϕ" => "varphi",
  "χ" => "chi",
  "ψ" => "psi",
  "Ψ" => "Psi",
  "ω" => "omega",
}.freeze
|
|
45
|
+
|
|
46
|
+
class << self
|
|
47
|
+
# Public entry point of the converter: hands +input+ to
# mathml_to_asciimath and returns whatever that produces.
#
# @param input [String, nil] text that may contain MathML/HTML markup
# @return [String, nil] the converted text
def convert(input)
  mathml_to_asciimath(input)
end
|
|
11
50
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
51
|
+
# Forget every memoized MathML conversion by resetting the cache variable;
# math_cache builds a fresh Hash on its next use. Meant to be called
# between export runs.
#
# @return [nil]
def clear_cache
  @math_cache = nil
end
|
|
15
55
|
|
|
16
|
-
|
|
56
|
+
private
|
|
17
57
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
58
|
+
# Memo table keyed by serialized <math> markup, holding the AsciiMath text
# produced for it. Created on first access and reused afterwards.
#
# @return [Hash{String => String}]
def math_cache
  @math_cache = {} unless @math_cache
  @math_cache
end
|
|
61
|
+
|
|
62
|
+
# Replaces every MathML <math> element in +input+ with an AsciiDoc stem
# macro (stem:[...]) and passes the remaining markup to html_to_asciimath.
#
# @param input [String, nil] text that may embed MathML and/or HTML
# @return [String, nil] +input+ itself when it is nil or contains neither
#   "<" nor "&"; otherwise the converted text
def mathml_to_asciimath(input)
  # Without "<" or "&" there is no markup and no entity to convert.
  return input unless input&.match?(/<|&/)

  # Recognise the opening tag with or without attributes, e.g.
  # <math xmlns="...">. A literal "<math>" test misses those and would
  # leave the MathML unconverted in the output.
  return html_to_asciimath(input) unless input.match?(%r{<math[\s/>]})

  fragment = Nokogiri::HTML.fragment(input, "UTF-8")

  fragment.css("math").each do |math_element|
    math_xml = math_element.to_xml
    # Identical expressions are parsed only once; a parse failure is
    # memoized as "" so it is not retried.
    asciimath = math_cache[math_xml] ||= begin
      Plurimath::Math.parse(math_xml, :mathml).to_asciimath.strip
    rescue Plurimath::Math::ParseError
      ""
    end

    if asciimath.empty?
      math_element.remove
    else
      math_element.replace "stem:[#{asciimath}]"
    end
  end

  html_to_asciimath(fragment.children.to_s)
end
|
|
43
88
|
|
|
44
|
-
html_to_asciimath(
|
|
45
|
-
|
|
46
|
-
)
|
|
47
|
-
end
|
|
89
|
+
# Rewrites inline HTML formatting in +input+ as AsciiDoc/AsciiMath text:
# <i> becomes a stem macro (or _italic_ when the decoded text is longer
# than 12 characters), <sub>/<sup> become ~...~ / ^...^, list items get a
# ". " or "* " prefix, and sans-serif <font> runs are wrapped in backticks.
# Greek letters are finally turned into stem macros.
#
# @param input [String, nil]
# @return [String, nil] +input+ unchanged when nil or empty
def html_to_asciimath(input)
  return input if input.nil? || input.empty?

  # Fast path: none of the tags handled below occur, so skip DOM parsing
  # and only substitute the Greek letters.
  needs_dom = input.match?(/<([iI]|sub|sup|ol|ul|font)\b/)
  return html_entities_to_stem(input) unless needs_dom

  fragment = Nokogiri::HTML.fragment(input, "UTF-8")

  fragment.css("i").each do |node|
    symbol = text_to_asciimath(node.text)
    if symbol.empty?
      node.remove
    elsif symbol.length <= 12
      # Short italic runs are treated as math symbols.
      node.replace "stem:[#{symbol}]"
    else
      # Longer runs are kept as emphasised prose.
      node.replace "_#{symbol}_"
    end
  end

  fragment.css("sub").each do |node|
    if node.text.empty?
      node.remove
    else
      node.replace "~#{text_to_asciimath(node.text)}~"
    end
  end

  fragment.css("sup").each do |node|
    if node.text.empty?
      node.remove
    else
      node.replace "^#{text_to_asciimath(node.text)}^"
    end
  end

  fragment.css("ol").each do |list|
    list.css("li").each { |item| item.replace ". #{item.text}" }
  end

  fragment.css("ul").each do |list|
    list.css("li").each { |item| item.replace "* #{item.text}" }
  end

  fragment.css('font[style*="sans-serif"]').each do |node|
    node.replace "`#{node.text}`"
  end

  serialized = fragment.children.to_s
  # Adjacent stem macros are merged into one.
  serialized = serialized.gsub("]stem:[", "")
  # The list wrappers themselves carry no content once items are rewritten.
  serialized = serialized.gsub(%r{</?[uo]l>}, "")
  html_entities_to_stem(serialized)
end
|
|
115
154
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
.
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
.gsub("ζ", "zeta")
|
|
126
|
-
.gsub("η", "eta")
|
|
127
|
-
.gsub("θ", "theta")
|
|
128
|
-
.gsub("Θ", "Theta")
|
|
129
|
-
.gsub("ϑ", "vartheta")
|
|
130
|
-
.gsub("ι", "iota")
|
|
131
|
-
.gsub("κ", "kappa")
|
|
132
|
-
.gsub("λ", "lambda")
|
|
133
|
-
.gsub("Λ", "Lambda")
|
|
134
|
-
.gsub("μ", "mu")
|
|
135
|
-
.gsub("ν", "nu")
|
|
136
|
-
.gsub("ξ", "xi")
|
|
137
|
-
.gsub("Ξ", "Xi")
|
|
138
|
-
.gsub("π", "pi")
|
|
139
|
-
.gsub("Π", "Pi")
|
|
140
|
-
.gsub("ρ", "rho")
|
|
141
|
-
.gsub("β", "beta")
|
|
142
|
-
.gsub("σ", "sigma")
|
|
143
|
-
.gsub("Σ", "Sigma")
|
|
144
|
-
.gsub("τ", "tau")
|
|
145
|
-
.gsub("υ", "upsilon")
|
|
146
|
-
.gsub("φ", "phi")
|
|
147
|
-
.gsub("Φ", "Phi")
|
|
148
|
-
.gsub("ϕ", "varphi")
|
|
149
|
-
.gsub("χ", "chi")
|
|
150
|
-
.gsub("ψ", "psi")
|
|
151
|
-
.gsub("Ψ", "Psi")
|
|
152
|
-
.gsub("ω", "omega")
|
|
153
|
-
end
|
|
155
|
+
# Decodes +text+ with String#decode_html and then replaces the Greek
# letters listed in GREEK_ENTITIES by their AsciiMath names.
#
# @param text [String]
# @return [String]
def text_to_asciimath(text)
  decoded = text.decode_html
  html_entities_to_asciimath(decoded)
end
|
|
158
|
+
|
|
159
|
+
# Replaces each key of GREEK_ENTITIES found in +input+ with its bare
# AsciiMath name (e.g. "alpha"), one table entry at a time.
#
# @param input [String]
# @return [String] a new string; +input+ itself is not modified
def html_entities_to_asciimath(input)
  result = input
  GREEK_ENTITIES.each do |entity, name|
    result = result.gsub(entity, name)
  end
  result
end
|
|
154
164
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
.gsub("δ", "stem:[delta]")
|
|
161
|
-
.gsub("Δ", "stem:[Delta]")
|
|
162
|
-
.gsub("ε", "stem:[epsilon]")
|
|
163
|
-
.gsub("ϵ", "stem:[varepsilon]")
|
|
164
|
-
.gsub("ζ", "stem:[zeta]")
|
|
165
|
-
.gsub("η", "stem:[eta]")
|
|
166
|
-
.gsub("θ", "stem:[theta]")
|
|
167
|
-
.gsub("Θ", "stem:[Theta]")
|
|
168
|
-
.gsub("ϑ", "stem:[vartheta]")
|
|
169
|
-
.gsub("ι", "stem:[iota]")
|
|
170
|
-
.gsub("κ", "stem:[kappa]")
|
|
171
|
-
.gsub("λ", "stem:[lambda]")
|
|
172
|
-
.gsub("Λ", "stem:[Lambda]")
|
|
173
|
-
.gsub("μ", "stem:[mu]")
|
|
174
|
-
.gsub("ν", "stem:[nu]")
|
|
175
|
-
.gsub("ξ", "stem:[xi]")
|
|
176
|
-
.gsub("Ξ", "stem:[Xi]")
|
|
177
|
-
.gsub("π", "stem:[pi]")
|
|
178
|
-
.gsub("Π", "stem:[Pi]")
|
|
179
|
-
.gsub("ρ", "stem:[rho]")
|
|
180
|
-
.gsub("β", "stem:[beta]")
|
|
181
|
-
.gsub("σ", "stem:[sigma]")
|
|
182
|
-
.gsub("Σ", "stem:[Sigma]")
|
|
183
|
-
.gsub("τ", "stem:[tau]")
|
|
184
|
-
.gsub("υ", "stem:[upsilon]")
|
|
185
|
-
.gsub("φ", "stem:[phi]")
|
|
186
|
-
.gsub("Φ", "stem:[Phi]")
|
|
187
|
-
.gsub("ϕ", "stem:[varphi]")
|
|
188
|
-
.gsub("χ", "stem:[chi]")
|
|
189
|
-
.gsub("ψ", "stem:[psi]")
|
|
190
|
-
.gsub("Ψ", "stem:[Psi]")
|
|
191
|
-
.gsub("ω", "stem:[omega]")
|
|
165
|
+
# Replaces each key of GREEK_ENTITIES found in +input+ with a stem macro
# around its AsciiMath name (e.g. "stem:[alpha]"), one table entry at a
# time.
#
# @param input [String]
# @return [String] a new string; +input+ itself is not modified
def html_entities_to_stem(input)
  result = input
  GREEK_ENTITIES.each do |entity, name|
    result = result.gsub(entity, "stem:[#{name}]")
  end
  result
end
|
|
192
170
|
end
|
|
193
171
|
end
|
|
194
172
|
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "date"
require "fileutils"
require "net/http"
require "uri"
require "yaml"
|
|
6
|
+
|
|
7
|
+
module Iev
  # Resolves IEV concept data. A concept is looked up in the configured
  # local data directory first; when it is not there, the remote YAML file
  # is used and its parsed content is cached under the configured cache
  # directory.
  module DataSource
    class NotFoundError < StandardError; end

    class << self
      # Fetch full concept data (all languages) for a given IEV code.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @return [Hash, nil] concept data hash or nil if not found
      def fetch_concept(code)
        fetch_concept_data(code)
      end

      # Fetch localized term data for a given IEV code and language.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @param lang [String] language code, e.g. "en" or "eng"
      # @return [Hash, nil] localized concept data or nil
      def fetch_term(code, lang)
        concept = fetch_concept(code)
        return nil unless concept

        concept[normalize_lang(lang)]
      end

      # Fetch the term designation string for a given IEV code and language.
      # The entry whose "normative_status" is "preferred" wins; without one,
      # the first listed term is used.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @param lang [String] language code, e.g. "en"
      # @return [String, nil] term designation or nil
      def fetch_term_designation(code, lang)
        term_data = fetch_term(code, lang)
        return nil unless term_data

        terms = term_data["terms"]
        return nil unless terms&.any?

        preferred = terms.find { |t| t["normative_status"] == "preferred" }
        (preferred || terms.first)["designation"]
      end

      private

      # Local data takes precedence; the remote source is only consulted
      # when the local lookup yields nothing.
      def fetch_concept_data(code)
        from_local(code) || from_remote(code)
      end

      # Reads "concept-<code>.yaml" from Iev.config.data_path.
      #
      # @return [Object, nil] parsed YAML, or nil when no data path is
      #   configured or the file does not exist
      def from_local(code)
        data_path = Iev.config.data_path
        return nil unless data_path

        path = File.join(data_path, "concept-#{code}.yaml")
        return nil unless File.exist?(path)

        load_yaml(File.read(path, encoding: "utf-8"))
      end

      # Returns the cached copy when there is a usable one; otherwise
      # downloads the concept file and stores it in the cache.
      #
      # @return [Object, nil] parsed YAML or nil when nothing was fetched
      def from_remote(code)
        cache_key = "concept-#{code}.yaml"
        cached = read_cache(cache_key)
        return cached if cached

        url = "#{Iev.config.remote_base_url}/#{cache_key}"
        data = http_get_yaml(url)
        return nil unless data

        write_cache(cache_key, data)
        data
      end

      # GETs +url+ and parses the body as YAML.
      #
      # Every failure mode is non-fatal: a 404 yields nil silently, any
      # other non-200 status, a transport error, or an unparsable body
      # emits a warning and yields nil.
      #
      # @param url [String]
      # @return [Object, nil]
      def http_get_yaml(url)
        response = Net::HTTP.get_response(URI(url))

        case response.code
        when "200"
          load_yaml(response.body)
        when "404"
          nil
        else
          warn "IEV: Failed to fetch #{url}: HTTP #{response.code}"
          nil
        end
      rescue SocketError, SystemCallError, IOError, Timeout::Error,
             OpenSSL::SSL::SSLError => e
        # SystemCallError covers the Errno::* family (connection refused or
        # reset, host unreachable); IOError covers EOFError.
        warn "IEV: Network error fetching #{url}: #{e.message}"
        nil
      rescue Psych::Exception => e
        warn "IEV: Invalid YAML received from #{url}: #{e.message}"
        nil
      end

      # Loads a previously cached document.
      #
      # A cache file that cannot be parsed is reported and treated as a
      # miss, so the caller downloads the data again and overwrites it.
      #
      # @return [Object, nil]
      def read_cache(filename)
        cache_path = cache_file_path(filename)
        return nil unless File.exist?(cache_path)

        load_yaml(File.read(cache_path, encoding: "utf-8"))
      rescue Psych::Exception => e
        warn "IEV: Ignoring unreadable cache file #{cache_path}: #{e.message}"
        nil
      end

      # Serializes +data+ as YAML into the cache directory, creating the
      # directory when needed.
      def write_cache(filename, data)
        cache_path = cache_file_path(filename)
        FileUtils.mkdir_p(File.dirname(cache_path))
        File.write(cache_path, YAML.dump(data), encoding: "utf-8")
      end

      def cache_file_path(filename)
        File.join(Iev.config.cache_dir, filename)
      end

      # Single place for the safe-load policy shared by local files, cache
      # files and HTTP bodies: only Date and Time are permitted in addition
      # to safe_load's default classes.
      def load_yaml(text)
        YAML.safe_load(text, permitted_classes: [Date, Time])
      end

      # Normalize language code: "en" → "eng", "de" → "deu", etc.
      # Three-character codes are returned untouched. Any error raised
      # during the lookup makes the method return +lang+ unchanged.
      def normalize_lang(lang)
        return lang if lang.length == 3

        Iso639Code.three_char_code(lang).first
      rescue StandardError
        lang
      end
    end
  end
end
|
data/lib/iev/exporter.rb
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Exports IEV data to Glossarist YAML format.
  #
  # The input format is detected from the file extension:
  #   .xlsx / .xls             → Excel IEV export
  #   .sqlite3 / .sqlite / .db → SQLite database
  #
  # @example Programmatic usage
  #   exporter = Iev::Exporter.new("data.xlsx", output_dir: "/tmp/output")
  #   collection = exporter.export
  #
  # @example With filters
  #   Iev::Exporter.new("data.sqlite3",
  #     output_dir: "/tmp/output",
  #     only_concepts: "103-%",
  #     only_languages: "en,fr",
  #   ).export
  class Exporter
    XLSX_EXTENSIONS = %w[.xlsx .xls].freeze
    SQLITE_EXTENSIONS = %w[.sqlite3 .sqlite .db].freeze

    attr_reader :input_path, :output_dir, :filters

    # @param input_path [String, Pathname] path to Excel or SQLite file
    # @param output_dir [String, Pathname] destination for YAML files
    # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
    # @param only_languages [String, nil] comma-separated language codes;
    #   whitespace around each code is ignored
    # @param fetch_relaton_links [Boolean] whether to fetch source URLs via Relaton
    # @raise [ArgumentError] when the input is not an existing file or its
    #   extension is not supported
    def initialize(input_path, output_dir: Dir.pwd,
                   only_concepts: nil, only_languages: nil,
                   fetch_relaton_links: false)
      @input_path = Pathname.new(input_path)
      validate_input!

      @output_dir = Pathname.new(output_dir)
      @fetch_relaton_links = fetch_relaton_links
      # Filters that were not supplied are left out of the hash entirely.
      @filters = {
        only_concepts: only_concepts,
        only_languages: only_languages,
      }.compact
    end

    # Run the export pipeline: load → transform → save.
    # @return [Glossarist::ManagedConceptCollection]
    def export
      dataset = load_dataset
      collection = build_collection(dataset)
      save_collection(collection)
      collection
    end

    private

    def supported_format?
      ext = input_path.extname.downcase
      XLSX_EXTENSIONS.include?(ext) || SQLITE_EXTENSIONS.include?(ext)
    end

    # @raise [ArgumentError] see #initialize
    def validate_input!
      # file? rather than exist?: a directory whose name merely ends in a
      # supported extension is rejected here instead of failing later in
      # the spreadsheet/SQLite loader.
      unless input_path.file?
        raise ArgumentError, "Input file not found: #{input_path}"
      end

      return if supported_format?

      exts = (XLSX_EXTENSIONS + SQLITE_EXTENSIONS).join(", ")
      raise ArgumentError,
            "Unsupported format: #{input_path.extname}. Supported: #{exts}"
    end

    # @return [Symbol] :xlsx for Excel extensions, :sqlite for anything else
    #   (the extension was already validated in #initialize)
    def input_format
      ext = input_path.extname.downcase
      XLSX_EXTENSIONS.include?(ext) ? :xlsx : :sqlite
    end

    def load_dataset
      case input_format
      when :xlsx then load_from_xlsx
      when :sqlite then load_from_sqlite
      end
    end

    # Imports the spreadsheet into a Sequel SQLite database opened without
    # a file path, then filters it.
    def load_from_xlsx
      require "creek" # loaded only when an Excel import is requested
      db = Sequel.sqlite
      DbWriter.new(db).import_spreadsheet(input_path.to_s)
      apply_filters(db)
    end

    def load_from_sqlite
      apply_filters(Sequel.sqlite(input_path.to_s))
    end

    # Builds the :concepts query, narrowed by whichever filters were given.
    #
    # @param db [#[]] database handle that returns a dataset for :concepts
    # @return [Object] the (possibly filtered) dataset
    def apply_filters(db)
      query = db[:concepts]
      if filters[:only_concepts]
        query = query.where(Sequel.ilike(:ievref, filters[:only_concepts]))
      end
      if filters[:only_languages]
        query = query.where(language: requested_languages)
      end
      query
    end

    # Splits the comma-separated language filter, tolerating spaces around
    # the codes and stray commas ("en, fr," → ["en", "fr"]).
    def requested_languages
      filters[:only_languages].split(",").map(&:strip).reject(&:empty?)
    end

    # Turns dataset rows into a concept collection, one managed concept per
    # term id with one localization added per row.
    #
    # SourceParser.relaton_enabled is process-wide state: it is switched to
    # the requested value for the duration of the build and put back to
    # what it was before, even when a row raises.
    def build_collection(dataset)
      previous_flag = current_relaton_flag

      begin
        SourceParser.relaton_enabled = @fetch_relaton_links

        # Use a hash index for O(1) concept lookup instead of
        # Glossarist's O(n) fetch_or_initialize which does linear scan.
        concept_index = {}
        collection = Glossarist::ManagedConceptCollection.new

        dataset.each do |row|
          term = TermBuilder.build_from(row)
          next unless term

          concept = concept_index[term.id] ||= begin
            c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
            collection.store(c)
            c
          end
          concept.add_l10n(term)
        end

        collection
      ensure
        SourceParser.relaton_enabled = previous_flag
      end
    end

    # Value of the Relaton switch before the export touches it. Falls back
    # to true when SourceParser exposes no reader.
    # NOTE(review): assumes a `relaton_enabled` reader accompanies the
    # writer used in build_collection — confirm against SourceParser.
    def current_relaton_flag
      return true unless SourceParser.respond_to?(:relaton_enabled)

      SourceParser.relaton_enabled
    end

    # Writes the collection into "<output_dir>/concepts", creating the
    # directory when needed.
    def save_collection(collection)
      concepts_dir = output_dir.expand_path.join("concepts")
      FileUtils.mkdir_p(concepts_dir)
      collection.save_to_files(concepts_dir.to_s)
    end
  end
end
|