iev 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +0 -2
- data/.github/workflows/release.yml +3 -1
- data/.gitignore +3 -1
- data/CLAUDE.md +50 -0
- data/Gemfile +3 -0
- data/README.adoc +65 -15
- data/exe/iev +11 -0
- data/iev.gemspec +5 -4
- data/lib/iev/cli/command.rb +119 -76
- data/lib/iev/cli/command_helper.rb +55 -36
- data/lib/iev/config.rb +31 -0
- data/lib/iev/converter/mathml_to_asciimath.rb +119 -158
- data/lib/iev/data_source.rb +124 -0
- data/lib/iev/exporter.rb +122 -0
- data/lib/iev/scraper/page_parser.rb +176 -0
- data/lib/iev/scraper.rb +135 -0
- data/lib/iev/source_parser.rb +31 -18
- data/lib/iev/supersession_parser.rb +9 -13
- data/lib/iev/term_attrs_parser.rb +21 -7
- data/lib/iev/term_builder.rb +100 -94
- data/lib/iev/utilities.rb +91 -42
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +47 -35
- metadata +34 -13
- data/lib/iev/db.rb +0 -82
- data/lib/iev/db_cache.rb +0 -124
|
@@ -5,190 +5,151 @@ module Iev
|
|
|
5
5
|
# Rewrites MathML and simple HTML formatting found in a string into plain
# text carrying inline `stem:[...]` AsciiMath spans and lightweight markup.
class MathmlToAsciimath
  # NOTE(review): presumably this refinement supplies String#decode_html,
  # which text_to_asciimath relies on -- confirm against DataConversions.
  using DataConversions

  # Greek characters and the AsciiMath names substituted for them.
  GREEK_ENTITIES = {
    "α" => "alpha",
    "β" => "beta",
    "γ" => "gamma",
    "Γ" => "Gamma",
    "δ" => "delta",
    "Δ" => "Delta",
    "ε" => "epsilon",
    "ϵ" => "varepsilon",
    "ζ" => "zeta",
    "η" => "eta",
    "θ" => "theta",
    "Θ" => "Theta",
    "ϑ" => "vartheta",
    "ι" => "iota",
    "κ" => "kappa",
    "λ" => "lambda",
    "Λ" => "Lambda",
    "μ" => "mu",
    "ν" => "nu",
    "ξ" => "xi",
    "Ξ" => "Xi",
    "π" => "pi",
    "Π" => "Pi",
    "ρ" => "rho",
    "σ" => "sigma",
    "Σ" => "Sigma",
    "τ" => "tau",
    "υ" => "upsilon",
    "φ" => "phi",
    "Φ" => "Phi",
    "ϕ" => "varphi",
    "χ" => "chi",
    "ψ" => "psi",
    "Ψ" => "Psi",
    "ω" => "omega",
  }.freeze

  class << self
    # Public entry point.
    #
    # @param input [String, nil] text that may contain MathML or HTML markup
    # @return [String, nil] converted text; the input itself when it is nil
    #   or contains neither "<" nor "&"
    def convert(input)
      mathml_to_asciimath(input)
    end

    private

    # Replaces every <math> element with a `stem:[...]` span produced by
    # Plurimath, then hands the remaining markup to html_to_asciimath.
    # Formulas that convert to an empty string, or that Plurimath cannot
    # parse, are removed.
    def mathml_to_asciimath(input)
      # Without "<" or "&" there is no markup or entity to convert.
      return input unless input&.match?(/<|&/)

      # NOTE(review): this literal check does not match a <math> tag that
      # carries attributes (e.g. xmlns) -- confirm that is intended.
      return html_to_asciimath(input) unless input.include?("<math>")

      fragment = Nokogiri::HTML.fragment(input, "UTF-8")

      fragment.css("math").each do |node|
        formula = Plurimath::Math.parse(node.to_xml, :mathml).to_asciimath.strip
        formula.empty? ? node.remove : node.replace("stem:[#{formula}]")
      rescue Plurimath::Math::ParseError
        node.remove
      end

      html_to_asciimath(fragment.children.to_s)
    end

    # Converts <i>, <sub>, <sup>, list items and sans-serif <font> elements
    # into their text equivalents, then wraps remaining Greek characters in
    # `stem:[...]`.
    def html_to_asciimath(input)
      return input if input.nil? || input.empty?

      fragment = Nokogiri::HTML.fragment(input, "UTF-8")

      # Italic runs of up to 12 characters become stem spans; longer runs
      # are kept as emphasised text.
      fragment.css("i").each do |node|
        decoded = text_to_asciimath(node.text)
        if decoded.empty?
          node.remove
        elsif decoded.length <= 12
          node.replace "stem:[#{decoded}]"
        else
          node.replace "_#{decoded}_"
        end
      end

      wrap_elements(fragment, "sub", "~")
      wrap_elements(fragment, "sup", "^")

      prefix_list_items(fragment, "ol", ".")
      prefix_list_items(fragment, "ul", "*")

      fragment.css('font[style*="sans-serif"]').each do |node|
        node.replace "`#{node.text}`"
      end

      markup = fragment.children.to_s
      # Join directly adjacent stem spans into a single span.
      markup = markup.gsub("]stem:[", "")
      # The list items were already flattened; drop the list wrappers.
      markup = markup.gsub(%r{</?[uo]l>}, "")
      html_entities_to_stem(markup)
    end

    # Replaces each +tag+ element with its converted text surrounded by
    # +marker+; elements without text are removed.
    def wrap_elements(fragment, tag, marker)
      fragment.css(tag).each do |node|
        if node.text.empty?
          node.remove
        else
          node.replace "#{marker}#{text_to_asciimath(node.text)}#{marker}"
        end
      end
    end

    # Replaces every <li> below each +list_tag+ element with its text,
    # prefixed by +bullet+ and a space.
    def prefix_list_items(fragment, list_tag, bullet)
      fragment.css(list_tag).each do |list|
        list.css("li").each do |item|
          item.replace "#{bullet} #{item.text}"
        end
      end
    end

    # Decodes the text via decode_html and spells out Greek characters.
    def text_to_asciimath(text)
      html_entities_to_asciimath(text.decode_html)
    end

    # Replaces each Greek character with its bare AsciiMath name.
    def html_entities_to_asciimath(input)
      substitute_greek(input) { |name| name }
    end

    # Replaces each Greek character with `stem:[name]`.
    def html_entities_to_stem(input)
      substitute_greek(input) { |name| "stem:[#{name}]" }
    end

    # Applies one substitution per GREEK_ENTITIES entry, in table order,
    # using the block's result for +name+ as the replacement string.
    def substitute_greek(input)
      GREEK_ENTITIES.each_pair.inject(input) do |text, (letter, name)|
        text.gsub(letter, yield(name))
      end
    end
  end
end
|
|
194
155
|
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "date"
require "fileutils"
require "net/http"
require "uri"
require "yaml"
|
|
6
|
+
|
|
7
|
+
module Iev
  # Looks up IEV concept data stored as one YAML document per concept
  # ("concept-<code>.yaml").
  #
  # Sources are consulted in this order:
  # 1. the local data directory (Iev.config.data_path), when configured;
  # 2. the on-disk cache (Iev.config.cache_dir);
  # 3. an HTTP GET below Iev.config.remote_base_url, whose successful
  #    result is written back to the cache.
  module DataSource
    class NotFoundError < StandardError; end

    # The code is interpolated into file names and URLs, so only word
    # characters, dots and dashes are accepted (no path separators).
    CODE_PATTERN = /\A[\w.-]+\z/
    private_constant :CODE_PATTERN

    class << self
      # Fetch full concept data (all languages) for a given IEV code.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @return [Hash, nil] concept data hash or nil if not found
      def fetch_concept(code)
        fetch_concept_data(code)
      end

      # Fetch localized term data for a given IEV code and language.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @param lang [String] language code, e.g. "en" or "eng"
      # @return [Hash, nil] localized concept data or nil
      def fetch_term(code, lang)
        concept = fetch_concept(code)
        return nil unless concept

        concept[normalize_lang(lang)]
      end

      # Fetch the term designation string for a given IEV code and language.
      # This is the backward-compatible replacement for the scraping-based Iev.get.
      #
      # The designation of the first term whose "normative_status" is
      # "preferred" is returned; without such a term the first term is used.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @param lang [String] language code, e.g. "en"
      # @return [String, nil] term designation or nil
      def fetch_term_designation(code, lang)
        terms = fetch_term(code, lang)&.[]("terms")
        return nil unless terms&.any?

        preferred = terms.find { |term| term["normative_status"] == "preferred" }
        (preferred || terms.first)["designation"]
      end

      private

      # Resolves a concept from the local data directory, falling back to
      # the cache/remote source. Codes that could not name a single file
      # are rejected before any file or network access happens.
      def fetch_concept_data(code)
        return nil unless code.to_s.match?(CODE_PATTERN)

        from_local(code) || from_remote(code)
      end

      # Reads the concept from Iev.config.data_path; nil when no data path
      # is configured or the file does not exist.
      def from_local(code)
        data_path = Iev.config.data_path
        return nil unless data_path

        path = File.join(data_path, "concept-#{code}.yaml")
        return nil unless File.exist?(path)

        load_yaml(File.read(path, encoding: "utf-8"))
      end

      # Returns the cached concept when present, otherwise downloads it and
      # stores it in the cache.
      def from_remote(code)
        cache_key = "concept-#{code}.yaml"
        cached = read_cache(cache_key)
        return cached if cached

        data = http_get_yaml("#{Iev.config.remote_base_url}/#{cache_key}")
        return nil unless data

        write_cache(cache_key, data)
        data
      end

      # GETs +url+ and parses the body as YAML.
      #
      # @return [Object, nil] parsed document; nil on 404, on any other
      #   non-200 status, on a network failure or on a malformed body (the
      #   last three are reported through +warn+)
      def http_get_yaml(url)
        response = Net::HTTP.get_response(URI(url))

        case response.code
        when "200"
          load_yaml(response.body)
        when "404"
          nil
        else
          warn "IEV: Failed to fetch #{url}: HTTP #{response.code}"
          nil
        end
      rescue Psych::Exception => e
        warn "IEV: Invalid YAML received from #{url}: #{e.message}"
        nil
      rescue SocketError, SystemCallError, Timeout::Error, IOError,
             Net::ProtocolError, Net::HTTPBadResponse,
             OpenSSL::SSL::SSLError => e
        # SystemCallError covers Errno::ECONNREFUSED and friends, which are
        # not SocketErrors and would otherwise abort the whole lookup.
        warn "IEV: Network error fetching #{url}: #{e.message}"
        nil
      end

      # Reads a cached document. An unreadable (corrupt) cache entry is
      # treated as a miss so that the concept is fetched again.
      def read_cache(filename)
        cache_path = cache_file_path(filename)
        return nil unless File.exist?(cache_path)

        load_yaml(File.read(cache_path, encoding: "utf-8"))
      rescue Psych::Exception => e
        warn "IEV: Ignoring unreadable cache file #{cache_path}: #{e.message}"
        nil
      end

      # Stores +data+ in the cache. Caching is best-effort: a failure to
      # write must not discard data that was fetched successfully.
      def write_cache(filename, data)
        cache_path = cache_file_path(filename)
        FileUtils.mkdir_p(File.dirname(cache_path))
        File.write(cache_path, YAML.dump(data), encoding: "utf-8")
      rescue SystemCallError => e
        warn "IEV: Could not write cache file #{cache_path}: #{e.message}"
        nil
      end

      def cache_file_path(filename)
        File.join(Iev.config.cache_dir, filename)
      end

      # Parses a YAML document, permitting only Date and Time beyond the
      # basic types.
      def load_yaml(text)
        YAML.safe_load(text, permitted_classes: [Date, Time])
      end

      # Normalize language code: "en" → "eng", "de" → "deu", etc.
      # Three-letter codes are returned as they are; if the lookup fails or
      # yields nothing, the given code is returned as a String.
      def normalize_lang(lang)
        key = lang.to_s
        return key if key.length == 3

        Iso639Code.three_char_code(key).first || key
      rescue StandardError
        key
      end
    end
  end
end
|
data/lib/iev/exporter.rb
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Exports IEV data to Glossarist YAML format.
  #
  # Automatically detects input format from file extension:
  #   .xlsx / .xls           → Excel IEV export
  #   .sqlite3 / .sqlite / .db → SQLite database
  #
  # @example Programmatic usage
  #   exporter = Iev::Exporter.new("data.xlsx", output_dir: "/tmp/output")
  #   collection = exporter.export
  #
  # @example With filters
  #   Iev::Exporter.new("data.sqlite3",
  #     output_dir: "/tmp/output",
  #     only_concepts: "103-%",
  #     only_languages: "en,fr",
  #   ).export
  class Exporter
    XLSX_EXTENSIONS = %w[.xlsx .xls].freeze
    SQLITE_EXTENSIONS = %w[.sqlite3 .sqlite .db].freeze

    attr_reader :input_path, :output_dir, :filters

    # @param input_path [String, Pathname] path to Excel or SQLite file
    # @param output_dir [String, Pathname] destination for YAML files
    # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
    # @param only_languages [String, nil] comma-separated language codes
    # @raise [ArgumentError] if the input file does not exist or its
    #   extension is not one of the supported ones
    def initialize(input_path, output_dir: Dir.pwd,
                   only_concepts: nil, only_languages: nil)
      @input_path = Pathname.new(input_path)
      validate_input!

      @output_dir = Pathname.new(output_dir)
      # Filters that were not given are left out entirely.
      @filters = {
        only_concepts: only_concepts,
        only_languages: only_languages,
      }.compact
    end

    # Run the export pipeline: load → transform → save.
    # @return [Glossarist::ManagedConceptCollection]
    def export
      dataset = load_dataset
      collection = build_collection(dataset)
      save_collection(collection)
      collection
    end

    private

    # Lower-cased file extension, so that "DATA.XLSX" is recognised too.
    def input_extension
      input_path.extname.downcase
    end

    def supported_format?
      XLSX_EXTENSIONS.include?(input_extension) ||
        SQLITE_EXTENSIONS.include?(input_extension)
    end

    def validate_input!
      unless input_path.exist?
        raise ArgumentError, "Input file not found: #{input_path}"
      end

      return if supported_format?

      exts = (XLSX_EXTENSIONS + SQLITE_EXTENSIONS).join(", ")
      raise ArgumentError,
            "Unsupported format: #{input_path.extname}. Supported: #{exts}"
    end

    # @return [Symbol] :xlsx for Excel files, :sqlite otherwise (the
    #   extension has already been validated by the constructor)
    def input_format
      XLSX_EXTENSIONS.include?(input_extension) ? :xlsx : :sqlite
    end

    def load_dataset
      case input_format
      when :xlsx then load_from_xlsx
      when :sqlite then load_from_sqlite
      end
    end

    # Imports the spreadsheet into an in-memory SQLite database and returns
    # the filtered concepts dataset.
    def load_from_xlsx
      # Loaded lazily: only needed for spreadsheet input.
      require "creek"
      db = Sequel.sqlite
      DbWriter.new(db).import_spreadsheet(input_path.to_s)
      apply_filters(db)
    end

    def load_from_sqlite
      apply_filters(Sequel.sqlite(input_path.to_s))
    end

    # Narrows the :concepts dataset by the configured filters.
    def apply_filters(db)
      query = db[:concepts]
      if filters[:only_concepts]
        query = query.where(Sequel.ilike(:ievref, filters[:only_concepts]))
      end
      if filters[:only_languages]
        query = query.where(language: language_codes)
      end
      query
    end

    # Splits the comma-separated language filter, dropping surrounding
    # whitespace and blank entries so that "en, fr" selects "en" and "fr".
    def language_codes
      filters[:only_languages].split(",").map(&:strip).reject(&:empty?)
    end

    # Builds one localized term per row and groups the terms into concepts
    # by term id; rows for which no term can be built are skipped.
    def build_collection(dataset)
      Glossarist::ManagedConceptCollection.new.tap do |collection|
        dataset.each do |row|
          term = TermBuilder.build_from(row)
          next unless term

          concept = collection.fetch_or_initialize(term.id)
          concept.add_l10n(term)
        end
      end
    end

    # Writes the collection below "<output_dir>/concepts", creating the
    # directory when necessary.
    def save_collection(collection)
      concepts_dir = output_dir.expand_path.join("concepts")
      FileUtils.mkdir_p(concepts_dir)
      collection.save_to_files(concepts_dir.to_s)
    end
  end
end
|