iev 0.3.9 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,190 +5,168 @@ module Iev
5
5
  class MathmlToAsciimath
6
6
  using DataConversions
7
7
 
8
- def self.convert(input)
9
- new.convert(input)
10
- end
8
+ GREEK_ENTITIES = {
9
+ "α" => "alpha",
10
+ "β" => "beta",
11
+ "γ" => "gamma",
12
+ "Γ" => "Gamma",
13
+ "δ" => "delta",
14
+ "Δ" => "Delta",
15
+ "ε" => "epsilon",
16
+ "ϵ" => "varepsilon",
17
+ "ζ" => "zeta",
18
+ "η" => "eta",
19
+ "θ" => "theta",
20
+ "Θ" => "Theta",
21
+ "ϑ" => "vartheta",
22
+ "ι" => "iota",
23
+ "κ" => "kappa",
24
+ "λ" => "lambda",
25
+ "Λ" => "Lambda",
26
+ "μ" => "mu",
27
+ "ν" => "nu",
28
+ "ξ" => "xi",
29
+ "Ξ" => "Xi",
30
+ "π" => "pi",
31
+ "Π" => "Pi",
32
+ "ρ" => "rho",
33
+ "σ" => "sigma",
34
+ "Σ" => "Sigma",
35
+ "τ" => "tau",
36
+ "υ" => "upsilon",
37
+ "φ" => "phi",
38
+ "Φ" => "Phi",
39
+ "ϕ" => "varphi",
40
+ "χ" => "chi",
41
+ "ψ" => "psi",
42
+ "Ψ" => "Psi",
43
+ "ω" => "omega",
44
+ }.freeze
45
+
46
+ class << self
47
+ def convert(input)
48
+ mathml_to_asciimath(input)
49
+ end
11
50
 
12
- def convert(input)
13
- mathml_to_asciimath(input)
14
- end
51
+ # Clear the Plurimath expression cache. Call between export runs.
52
+ def clear_cache
53
+ @math_cache = nil
54
+ end
15
55
 
16
- private
56
+ private
17
57
 
18
- def mathml_to_asciimath(input)
19
- # If given string does not include '<' (for elements) nor '&'
20
- # (for entities), then it's certain that it doesn't contain
21
- # any MathML or HTML formula.
22
- return input unless input&.match?(/<|&/)
58
+ def math_cache
59
+ @math_cache ||= {}
60
+ end
61
+
62
+ def mathml_to_asciimath(input)
63
+ return input unless input&.match?(/<|&/)
23
64
 
24
- return html_to_asciimath(input) unless input.include?("<math>")
65
+ return html_to_asciimath(input) unless input.include?("<math>")
25
66
 
26
- # puts "GOING TO MATHML MATH"
27
- # puts input
28
- to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
29
- # to_asciimath.remove_namespaces!
67
+ to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
30
68
 
31
- to_asciimath.css("math").each do |math_element|
32
- asciimath = Plurimath::Math.parse(
33
- text_to_asciimath(math_element.to_xml), :mathml
34
- ).to_asciimath.strip
35
- # puts"ASCIIMATH!! #{asciimath}"
69
+ to_asciimath.css("math").each do |math_element|
70
+ math_xml = math_element.to_xml
71
+ asciimath = math_cache[math_xml] ||= begin
72
+ Plurimath::Math.parse(math_xml, :mathml).to_asciimath.strip
73
+ rescue Plurimath::Math::ParseError
74
+ ""
75
+ end
36
76
 
37
- if asciimath.empty?
38
- math_element.remove
39
- else
40
- math_element.replace "stem:[#{asciimath}]"
77
+ if asciimath.empty?
78
+ math_element.remove
79
+ else
80
+ math_element.replace "stem:[#{asciimath}]"
81
+ end
41
82
  end
83
+
84
+ html_to_asciimath(
85
+ to_asciimath.children.to_s,
86
+ )
42
87
  end
43
88
 
44
- html_to_asciimath(
45
- to_asciimath.children.to_s,
46
- )
47
- end
89
+ def html_to_asciimath(input)
90
+ return input if input.nil? || input.empty?
48
91
 
49
- def html_to_asciimath(input)
50
- return input if input.nil? || input.empty?
51
-
52
- to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
53
-
54
- to_asciimath.css("i").each do |math_element|
55
- # puts "HTML MATH!! #{math_element.to_xml}"
56
- # puts "HTML MATH!! #{math_element.text}"
57
- decoded = text_to_asciimath(math_element.text)
58
- case decoded.length
59
- when 1..12
60
- # puts "(#{math_element.text} to => #{decoded})"
61
- math_element.replace "stem:[#{decoded}]"
62
- when 0
63
- math_element.remove
64
- else
65
- math_element.replace "_#{decoded}_"
92
+ # Fast path: if no HTML elements remain that need Nokogiri processing
93
+ # (after parse_anchor_tag handles <i>/<sub>/<sup>/<ol>/<ul>/<font>),
94
+ # just do the Greek entity replacement.
95
+ unless input.match?(/<([iI]|sub|sup|ol|ul|font)\b/)
96
+ return html_entities_to_stem(input)
66
97
  end
67
- end
68
98
 
69
- to_asciimath.css("sub").each do |math_element|
70
- case math_element.text.length
71
- when 0
72
- math_element.remove
73
- else
74
- math_element.replace "~#{text_to_asciimath(math_element.text)}~"
99
+ to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
100
+
101
+ to_asciimath.css("i").each do |math_element|
102
+ decoded = text_to_asciimath(math_element.text)
103
+ case decoded.length
104
+ when 1..12
105
+ math_element.replace "stem:[#{decoded}]"
106
+ when 0
107
+ math_element.remove
108
+ else
109
+ math_element.replace "_#{decoded}_"
110
+ end
75
111
  end
76
- end
77
112
 
78
- to_asciimath.css("sup").each do |math_element|
79
- case math_element.text.length
80
- when 0
81
- math_element.remove
82
- else
83
- math_element.replace "^#{text_to_asciimath(math_element.text)}^"
113
+ to_asciimath.css("sub").each do |math_element|
114
+ case math_element.text.length
115
+ when 0
116
+ math_element.remove
117
+ else
118
+ math_element.replace "~#{text_to_asciimath(math_element.text)}~"
119
+ end
84
120
  end
85
- end
86
121
 
87
- to_asciimath.css("ol").each do |element|
88
- element.css("li").each do |li|
89
- li.replace ". #{li.text}"
122
+ to_asciimath.css("sup").each do |math_element|
123
+ case math_element.text.length
124
+ when 0
125
+ math_element.remove
126
+ else
127
+ math_element.replace "^#{text_to_asciimath(math_element.text)}^"
128
+ end
90
129
  end
91
- end
92
130
 
93
- to_asciimath.css("ul").each do |element|
94
- element.css("li").each do |li|
95
- li.replace "* #{li.text}"
131
+ to_asciimath.css("ol").each do |element|
132
+ element.css("li").each do |li|
133
+ li.replace ". #{li.text}"
134
+ end
96
135
  end
97
- end
98
136
 
99
- # Replace sans-serif font with monospace
100
- to_asciimath.css('font[style*="sans-serif"]').each do |x|
101
- x.replace "`#{x.text}`"
102
- end
137
+ to_asciimath.css("ul").each do |element|
138
+ element.css("li").each do |li|
139
+ li.replace "* #{li.text}"
140
+ end
141
+ end
103
142
 
104
- html_entities_to_stem(
105
- to_asciimath
106
- .children.to_s
107
- .gsub(/\]stem:\[/, "")
108
- .gsub(%r{</?[uo]l>}, ""),
109
- )
110
- end
143
+ to_asciimath.css('font[style*="sans-serif"]').each do |x|
144
+ x.replace "`#{x.text}`"
145
+ end
111
146
 
112
- def text_to_asciimath(text)
113
- html_entities_to_asciimath(text.decode_html)
114
- end
147
+ html_entities_to_stem(
148
+ to_asciimath
149
+ .children.to_s
150
+ .gsub("]stem:[", "")
151
+ .gsub(%r{</?[uo]l>}, ""),
152
+ )
153
+ end
115
154
 
116
- def html_entities_to_asciimath(input)
117
- input.gsub("&alpha;", "alpha")
118
- .gsub("&beta;", "beta")
119
- .gsub("&gamma;", "gamma")
120
- .gsub("&Gamma;", "Gamma")
121
- .gsub("&delta;", "delta")
122
- .gsub("&Delta;", "Delta")
123
- .gsub("&epsilon;", "epsilon")
124
- .gsub("&varepsilon;", "varepsilon")
125
- .gsub("&zeta;", "zeta")
126
- .gsub("&eta;", "eta")
127
- .gsub("&theta;", "theta")
128
- .gsub("&Theta;", "Theta")
129
- .gsub("&vartheta;", "vartheta")
130
- .gsub("&iota;", "iota")
131
- .gsub("&kappa;", "kappa")
132
- .gsub("&lambda;", "lambda")
133
- .gsub("&Lambda;", "Lambda")
134
- .gsub("&mu;", "mu")
135
- .gsub("&nu;", "nu")
136
- .gsub("&xi;", "xi")
137
- .gsub("&Xi;", "Xi")
138
- .gsub("&pi;", "pi")
139
- .gsub("&Pi;", "Pi")
140
- .gsub("&rho;", "rho")
141
- .gsub("&beta;", "beta")
142
- .gsub("&sigma;", "sigma")
143
- .gsub("&Sigma;", "Sigma")
144
- .gsub("&tau;", "tau")
145
- .gsub("&upsilon;", "upsilon")
146
- .gsub("&phi;", "phi")
147
- .gsub("&Phi;", "Phi")
148
- .gsub("&varphi;", "varphi")
149
- .gsub("&chi;", "chi")
150
- .gsub("&psi;", "psi")
151
- .gsub("&Psi;", "Psi")
152
- .gsub("&omega;", "omega")
153
- end
155
+ def text_to_asciimath(text)
156
+ html_entities_to_asciimath(text.decode_html)
157
+ end
158
+
159
+ def html_entities_to_asciimath(input)
160
+ GREEK_ENTITIES.reduce(input) do |str, (entity, name)|
161
+ str.gsub(entity, name)
162
+ end
163
+ end
154
164
 
155
- def html_entities_to_stem(input)
156
- input.gsub("&alpha;", "stem:[alpha]")
157
- .gsub("&beta;", "stem:[beta]")
158
- .gsub("&gamma;", "stem:[gamma]")
159
- .gsub("&Gamma;", "stem:[Gamma]")
160
- .gsub("&delta;", "stem:[delta]")
161
- .gsub("&Delta;", "stem:[Delta]")
162
- .gsub("&epsilon;", "stem:[epsilon]")
163
- .gsub("&varepsilon;", "stem:[varepsilon]")
164
- .gsub("&zeta;", "stem:[zeta]")
165
- .gsub("&eta;", "stem:[eta]")
166
- .gsub("&theta;", "stem:[theta]")
167
- .gsub("&Theta;", "stem:[Theta]")
168
- .gsub("&vartheta;", "stem:[vartheta]")
169
- .gsub("&iota;", "stem:[iota]")
170
- .gsub("&kappa;", "stem:[kappa]")
171
- .gsub("&lambda;", "stem:[lambda]")
172
- .gsub("&Lambda;", "stem:[Lambda]")
173
- .gsub("&mu;", "stem:[mu]")
174
- .gsub("&nu;", "stem:[nu]")
175
- .gsub("&xi;", "stem:[xi]")
176
- .gsub("&Xi;", "stem:[Xi]")
177
- .gsub("&pi;", "stem:[pi]")
178
- .gsub("&Pi;", "stem:[Pi]")
179
- .gsub("&rho;", "stem:[rho]")
180
- .gsub("&beta;", "stem:[beta]")
181
- .gsub("&sigma;", "stem:[sigma]")
182
- .gsub("&Sigma;", "stem:[Sigma]")
183
- .gsub("&tau;", "stem:[tau]")
184
- .gsub("&upsilon;", "stem:[upsilon]")
185
- .gsub("&phi;", "stem:[phi]")
186
- .gsub("&Phi;", "stem:[Phi]")
187
- .gsub("&varphi;", "stem:[varphi]")
188
- .gsub("&chi;", "stem:[chi]")
189
- .gsub("&psi;", "stem:[psi]")
190
- .gsub("&Psi;", "stem:[Psi]")
191
- .gsub("&omega;", "stem:[omega]")
165
+ def html_entities_to_stem(input)
166
+ GREEK_ENTITIES.reduce(input) do |str, (entity, name)|
167
+ str.gsub(entity, "stem:[#{name}]")
168
+ end
169
+ end
192
170
  end
193
171
  end
194
172
  end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
require "date"
require "fileutils"
require "net/http"
require "uri"
require "yaml"
6
+
7
module Iev
  # Looks up IEV concept data stored as `concept-<code>.yaml` documents.
  #
  # A concept is searched in `Iev.config.data_path` first (when configured);
  # otherwise it is read from the on-disk cache in `Iev.config.cache_dir` or
  # downloaded from `Iev.config.remote_base_url` and then cached.
  module DataSource
    # Not raised by any method visible in this module.
    class NotFoundError < StandardError; end

    class << self
      # Fetch full concept data (all languages) for a given IEV code.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @return [Hash, nil] concept data hash or nil if not found
      def fetch_concept(code)
        # A blank code can never name a concept; avoid a pointless request.
        return nil if code.to_s.strip.empty?

        fetch_concept_data(code)
      end

      # Fetch localized term data for a given IEV code and language.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @param lang [String, Symbol] language code, e.g. "en" or "eng"
      # @return [Hash, nil] localized concept data or nil
      def fetch_term(code, lang)
        concept = fetch_concept(code)
        # A YAML document may be a scalar or a list; only a mapping can be
        # indexed by language (String#[] would return a substring instead).
        return nil unless concept.is_a?(Hash)

        concept[normalize_lang(lang)]
      end

      # Fetch the term designation string for a given IEV code and language.
      # This is the backward-compatible replacement for the scraping-based Iev.get.
      #
      # The entry whose "normative_status" is "preferred" wins; otherwise the
      # first listed term is used.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @param lang [String, Symbol] language code, e.g. "en"
      # @return [String, nil] term designation or nil
      def fetch_term_designation(code, lang)
        term_data = fetch_term(code, lang)
        return nil unless term_data

        terms = term_data["terms"]
        return nil unless terms&.any?

        preferred = terms.find { |t| t["normative_status"] == "preferred" }
        (preferred || terms.first)["designation"]
      end

      private

      # Local data takes precedence over the cache and the remote source.
      def fetch_concept_data(code)
        from_local(code) || from_remote(code)
      end

      # Read the concept from `Iev.config.data_path`, if one is configured and
      # the file exists.
      def from_local(code)
        data_path = Iev.config.data_path
        return nil unless data_path

        path = File.join(data_path, "concept-#{code}.yaml")
        return nil unless File.exist?(path)

        load_yaml(File.read(path, encoding: "utf-8"))
      end

      # Read the concept from the cache, or download and cache it.
      def from_remote(code)
        cache_key = "concept-#{code}.yaml"
        cached = read_cache(cache_key)
        return cached if cached

        url = "#{Iev.config.remote_base_url}/#{cache_key}"
        data = http_get_yaml(url)
        return nil unless data

        write_cache(cache_key, data)
        data
      end

      # GET `url` and parse the body as YAML.
      #
      # @return [Object, nil] parsed document; nil on 404, on any other
      #   non-200 status, on a transport failure, or on an invalid body
      #   (all but the 404 case emit a warning)
      def http_get_yaml(url)
        response = Net::HTTP.get_response(URI(url))

        case response.code
        when "200"
          load_yaml(response.body)
        when "404"
          nil
        else
          warn "IEV: Failed to fetch #{url}: HTTP #{response.code}"
          nil
        end
      rescue URI::InvalidURIError => e
        warn "IEV: Invalid URL #{url}: #{e.message}"
        nil
      rescue Psych::Exception => e
        warn "IEV: Invalid YAML received from #{url}: #{e.message}"
        nil
      rescue SocketError, SystemCallError, IOError, Timeout::Error,
             Net::ProtocolError, OpenSSL::SSL::SSLError => e
        # SystemCallError covers the Errno::* family (connection refused,
        # reset, unreachable host, ...), which SocketError does not.
        warn "IEV: Network error fetching #{url}: #{e.message}"
        nil
      end

      # Load a cached document. An unreadable or unparsable cache entry is
      # reported and treated as a miss so the caller fetches a fresh copy.
      def read_cache(filename)
        cache_path = cache_file_path(filename)
        return nil unless File.exist?(cache_path)

        load_yaml(File.read(cache_path, encoding: "utf-8"))
      rescue Psych::Exception, SystemCallError => e
        warn "IEV: Ignoring unreadable cache file #{cache_path}: #{e.message}"
        nil
      end

      # Persist `data` in the cache directory. Caching is best-effort: a
      # filesystem failure is reported but does not discard fetched data.
      def write_cache(filename, data)
        cache_path = cache_file_path(filename)
        FileUtils.mkdir_p(File.dirname(cache_path))
        File.write(cache_path, YAML.dump(data), encoding: "utf-8")
      rescue SystemCallError => e
        warn "IEV: Could not write cache file #{cache_path}: #{e.message}"
        nil
      end

      def cache_file_path(filename)
        File.join(Iev.config.cache_dir, filename)
      end

      # Parse YAML text, allowing only Date and Time beyond the basic types.
      def load_yaml(text)
        YAML.safe_load(text, permitted_classes: [Date, Time])
      end

      # Normalize language code: "en" → "eng", "de" → "deu", etc.
      # Three-letter codes are returned as strings unchanged; if the lookup
      # raises, the original argument is returned as given.
      def normalize_lang(lang)
        code = lang.to_s
        return code if code.length == 3

        Iso639Code.three_char_code(code).first
      rescue StandardError
        lang
      end
    end
  end
end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
module Iev
  # Exports IEV data to Glossarist YAML format.
  #
  # Automatically detects input format from file extension:
  #   .xlsx / .xls              → Excel IEV export
  #   .sqlite3 / .sqlite / .db  → SQLite database
  #
  # @example Programmatic usage
  #   exporter = Iev::Exporter.new("data.xlsx", output_dir: "/tmp/output")
  #   collection = exporter.export
  #
  # @example With filters
  #   Iev::Exporter.new("data.sqlite3",
  #     output_dir: "/tmp/output",
  #     only_concepts: "103-%",
  #     only_languages: "en,fr",
  #   ).export
  class Exporter
    XLSX_EXTENSIONS = %w[.xlsx .xls].freeze
    SQLITE_EXTENSIONS = %w[.sqlite3 .sqlite .db].freeze

    attr_reader :input_path, :output_dir, :filters

    # @param input_path [String, Pathname] path to Excel or SQLite file
    # @param output_dir [String, Pathname] destination for YAML files
    # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
    # @param only_languages [String, Array<String>, nil] comma-separated
    #   language codes (whitespace around codes is ignored) or a list of codes
    # @param fetch_relaton_links [Boolean] whether to fetch source URLs via Relaton
    # @raise [ArgumentError] if the input is not an existing file or has an
    #   unsupported extension
    def initialize(input_path, output_dir: Dir.pwd,
                   only_concepts: nil, only_languages: nil,
                   fetch_relaton_links: false)
      @input_path = Pathname.new(input_path)
      validate_input!

      @output_dir = Pathname.new(output_dir)
      @fetch_relaton_links = fetch_relaton_links
      # Filters that were not supplied are dropped, not stored as nil.
      @filters = {
        only_concepts: only_concepts,
        only_languages: only_languages,
      }.compact
    end

    # Run the export pipeline: load → transform → save.
    # @return [Glossarist::ManagedConceptCollection]
    def export
      dataset = load_dataset
      collection = build_collection(dataset)
      save_collection(collection)
      collection
    end

    private

    def supported_format?
      ext = input_path.extname.downcase
      XLSX_EXTENSIONS.include?(ext) || SQLITE_EXTENSIONS.include?(ext)
    end

    # Reject missing paths, directories and unknown extensions up front.
    def validate_input!
      # `file?` (rather than `exist?`) also rejects a directory that merely
      # carries a supported extension.
      unless input_path.file?
        raise ArgumentError, "Input file not found: #{input_path}"
      end

      return if supported_format?

      exts = (XLSX_EXTENSIONS + SQLITE_EXTENSIONS).join(", ")
      raise ArgumentError,
            "Unsupported format: #{input_path.extname}. Supported: #{exts}"
    end

    # @return [Symbol] :xlsx or :sqlite (the extension was validated in
    #   #initialize, so anything that is not Excel is SQLite)
    def input_format
      ext = input_path.extname.downcase
      XLSX_EXTENSIONS.include?(ext) ? :xlsx : :sqlite
    end

    def load_dataset
      case input_format
      when :xlsx then load_from_xlsx
      when :sqlite then load_from_sqlite
      end
    end

    # Import the spreadsheet into a Sequel SQLite database created without a
    # file path, then query it like a regular SQLite input.
    def load_from_xlsx
      require "creek" # loaded lazily: only needed for Excel input
      db = Sequel.sqlite
      DbWriter.new(db).import_spreadsheet(input_path.to_s)
      apply_filters(db)
    end

    def load_from_sqlite
      apply_filters(Sequel.sqlite(input_path.to_s))
    end

    # Build the `concepts` dataset, narrowed by the configured filters.
    def apply_filters(db)
      query = db[:concepts]
      if filters[:only_concepts]
        query = query.where(Sequel.ilike(:ievref, filters[:only_concepts]))
      end
      if filters[:only_languages]
        query = query.where(language: language_codes)
      end
      query
    end

    # Language filter as a clean list: "en, fr" → ["en", "fr"].
    def language_codes
      Array(filters[:only_languages])
        .flat_map { |entry| entry.to_s.split(",") }
        .map(&:strip)
        .reject(&:empty?)
    end

    # Turn dataset rows into a concept collection, one managed concept per
    # term id with one localization added per row.
    def build_collection(dataset)
      previous_relaton = relaton_setting
      SourceParser.relaton_enabled = @fetch_relaton_links

      # Use a hash index for O(1) concept lookup instead of
      # Glossarist's O(n) fetch_or_initialize which does linear scan.
      concept_index = {}
      collection = Glossarist::ManagedConceptCollection.new

      dataset.each do |row|
        term = TermBuilder.build_from(row)
        next unless term

        concept = concept_index[term.id] ||= begin
          c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
          collection.store(c)
          c
        end
        concept.add_l10n(term)
      end

      collection
    ensure
      # Put the global switch back the way the caller left it instead of
      # unconditionally forcing it on.
      SourceParser.relaton_enabled = previous_relaton
    end

    # Current value of the global Relaton switch. Falls back to `true` when
    # SourceParser exposes no reader for it.
    def relaton_setting
      return true unless SourceParser.respond_to?(:relaton_enabled)

      SourceParser.relaton_enabled
    end

    # Write the collection below `<output_dir>/concepts`, creating the
    # directory if needed.
    def save_collection(collection)
      concepts_dir = output_dir.expand_path.join("concepts")
      FileUtils.mkdir_p(concepts_dir)
      collection.save_to_files(concepts_dir.to_s)
    end
  end
end