iev 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,190 +5,151 @@ module Iev
5
5
  class MathmlToAsciimath
6
6
  using DataConversions
7
7
 
8
- def self.convert(input)
9
- new.convert(input)
10
- end
11
-
12
- def convert(input)
13
- mathml_to_asciimath(input)
14
- end
8
+ GREEK_ENTITIES = {
9
+ "α" => "alpha",
10
+ "β" => "beta",
11
+ "γ" => "gamma",
12
+ "Γ" => "Gamma",
13
+ "δ" => "delta",
14
+ "Δ" => "Delta",
15
+ "ε" => "epsilon",
16
+ "ϵ" => "varepsilon",
17
+ "ζ" => "zeta",
18
+ "η" => "eta",
19
+ "θ" => "theta",
20
+ "Θ" => "Theta",
21
+ "ϑ" => "vartheta",
22
+ "ι" => "iota",
23
+ "κ" => "kappa",
24
+ "λ" => "lambda",
25
+ "Λ" => "Lambda",
26
+ "μ" => "mu",
27
+ "ν" => "nu",
28
+ "ξ" => "xi",
29
+ "Ξ" => "Xi",
30
+ "π" => "pi",
31
+ "Π" => "Pi",
32
+ "ρ" => "rho",
33
+ "σ" => "sigma",
34
+ "Σ" => "Sigma",
35
+ "τ" => "tau",
36
+ "υ" => "upsilon",
37
+ "φ" => "phi",
38
+ "Φ" => "Phi",
39
+ "ϕ" => "varphi",
40
+ "χ" => "chi",
41
+ "ψ" => "psi",
42
+ "Ψ" => "Psi",
43
+ "ω" => "omega",
44
+ }.freeze
45
+
46
+ class << self
47
+ def convert(input)
48
+ mathml_to_asciimath(input)
49
+ end
15
50
 
16
- private
51
+ private
17
52
 
18
- def mathml_to_asciimath(input)
19
- # If given string does not include '<' (for elements) nor '&'
20
- # (for entities), then it's certain that it doesn't contain
21
- # any MathML or HTML formula.
22
- return input unless input&.match?(/<|&/)
53
+ def mathml_to_asciimath(input)
54
+ return input unless input&.match?(/<|&/)
23
55
 
24
- return html_to_asciimath(input) unless input.include?("<math>")
56
+ return html_to_asciimath(input) unless input.include?("<math>")
25
57
 
26
- # puts "GOING TO MATHML MATH"
27
- # puts input
28
- to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
29
- # to_asciimath.remove_namespaces!
58
+ to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
30
59
 
31
- to_asciimath.css("math").each do |math_element|
32
- asciimath = Plurimath::Math.parse(
33
- text_to_asciimath(math_element.to_xml), :mathml
34
- ).to_asciimath.strip
35
- # puts"ASCIIMATH!! #{asciimath}"
60
+ to_asciimath.css("math").each do |math_element|
61
+ asciimath = Plurimath::Math.parse(
62
+ math_element.to_xml, :mathml
63
+ ).to_asciimath.strip
36
64
 
37
- if asciimath.empty?
65
+ if asciimath.empty?
66
+ math_element.remove
67
+ else
68
+ math_element.replace "stem:[#{asciimath}]"
69
+ end
70
+ rescue Plurimath::Math::ParseError
38
71
  math_element.remove
39
- else
40
- math_element.replace "stem:[#{asciimath}]"
41
72
  end
73
+
74
+ html_to_asciimath(
75
+ to_asciimath.children.to_s,
76
+ )
42
77
  end
43
78
 
44
- html_to_asciimath(
45
- to_asciimath.children.to_s,
46
- )
47
- end
79
+ def html_to_asciimath(input)
80
+ return input if input.nil? || input.empty?
81
+
82
+ to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
83
+
84
+ to_asciimath.css("i").each do |math_element|
85
+ decoded = text_to_asciimath(math_element.text)
86
+ case decoded.length
87
+ when 1..12
88
+ math_element.replace "stem:[#{decoded}]"
89
+ when 0
90
+ math_element.remove
91
+ else
92
+ math_element.replace "_#{decoded}_"
93
+ end
94
+ end
48
95
 
49
- def html_to_asciimath(input)
50
- return input if input.nil? || input.empty?
96
+ to_asciimath.css("sub").each do |math_element|
97
+ case math_element.text.length
98
+ when 0
99
+ math_element.remove
100
+ else
101
+ math_element.replace "~#{text_to_asciimath(math_element.text)}~"
102
+ end
103
+ end
51
104
 
52
- to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
105
+ to_asciimath.css("sup").each do |math_element|
106
+ case math_element.text.length
107
+ when 0
108
+ math_element.remove
109
+ else
110
+ math_element.replace "^#{text_to_asciimath(math_element.text)}^"
111
+ end
112
+ end
53
113
 
54
- to_asciimath.css("i").each do |math_element|
55
- # puts "HTML MATH!! #{math_element.to_xml}"
56
- # puts "HTML MATH!! #{math_element.text}"
57
- decoded = text_to_asciimath(math_element.text)
58
- case decoded.length
59
- when 1..12
60
- # puts "(#{math_element.text} to => #{decoded})"
61
- math_element.replace "stem:[#{decoded}]"
62
- when 0
63
- math_element.remove
64
- else
65
- math_element.replace "_#{decoded}_"
114
+ to_asciimath.css("ol").each do |element|
115
+ element.css("li").each do |li|
116
+ li.replace ". #{li.text}"
117
+ end
66
118
  end
67
- end
68
119
 
69
- to_asciimath.css("sub").each do |math_element|
70
- case math_element.text.length
71
- when 0
72
- math_element.remove
73
- else
74
- math_element.replace "~#{text_to_asciimath(math_element.text)}~"
120
+ to_asciimath.css("ul").each do |element|
121
+ element.css("li").each do |li|
122
+ li.replace "* #{li.text}"
123
+ end
75
124
  end
76
- end
77
125
 
78
- to_asciimath.css("sup").each do |math_element|
79
- case math_element.text.length
80
- when 0
81
- math_element.remove
82
- else
83
- math_element.replace "^#{text_to_asciimath(math_element.text)}^"
126
+ to_asciimath.css('font[style*="sans-serif"]').each do |x|
127
+ x.replace "`#{x.text}`"
84
128
  end
129
+
130
+ html_entities_to_stem(
131
+ to_asciimath
132
+ .children.to_s
133
+ .gsub("]stem:[", "")
134
+ .gsub(%r{</?[uo]l>}, ""),
135
+ )
85
136
  end
86
137
 
87
- to_asciimath.css("ol").each do |element|
88
- element.css("li").each do |li|
89
- li.replace ". #{li.text}"
90
- end
138
+ def text_to_asciimath(text)
139
+ html_entities_to_asciimath(text.decode_html)
91
140
  end
92
141
 
93
- to_asciimath.css("ul").each do |element|
94
- element.css("li").each do |li|
95
- li.replace "* #{li.text}"
142
+ def html_entities_to_asciimath(input)
143
+ GREEK_ENTITIES.reduce(input) do |str, (entity, name)|
144
+ str.gsub(entity, name)
96
145
  end
97
146
  end
98
147
 
99
- # Replace sans-serif font with monospace
100
- to_asciimath.css('font[style*="sans-serif"]').each do |x|
101
- x.replace "`#{x.text}`"
148
+ def html_entities_to_stem(input)
149
+ GREEK_ENTITIES.reduce(input) do |str, (entity, name)|
150
+ str.gsub(entity, "stem:[#{name}]")
151
+ end
102
152
  end
103
-
104
- html_entities_to_stem(
105
- to_asciimath
106
- .children.to_s
107
- .gsub(/\]stem:\[/, "")
108
- .gsub(%r{</?[uo]l>}, ""),
109
- )
110
- end
111
-
112
- def text_to_asciimath(text)
113
- html_entities_to_asciimath(text.decode_html)
114
- end
115
-
116
- def html_entities_to_asciimath(input)
117
- input.gsub("&alpha;", "alpha")
118
- .gsub("&beta;", "beta")
119
- .gsub("&gamma;", "gamma")
120
- .gsub("&Gamma;", "Gamma")
121
- .gsub("&delta;", "delta")
122
- .gsub("&Delta;", "Delta")
123
- .gsub("&epsilon;", "epsilon")
124
- .gsub("&varepsilon;", "varepsilon")
125
- .gsub("&zeta;", "zeta")
126
- .gsub("&eta;", "eta")
127
- .gsub("&theta;", "theta")
128
- .gsub("&Theta;", "Theta")
129
- .gsub("&vartheta;", "vartheta")
130
- .gsub("&iota;", "iota")
131
- .gsub("&kappa;", "kappa")
132
- .gsub("&lambda;", "lambda")
133
- .gsub("&Lambda;", "Lambda")
134
- .gsub("&mu;", "mu")
135
- .gsub("&nu;", "nu")
136
- .gsub("&xi;", "xi")
137
- .gsub("&Xi;", "Xi")
138
- .gsub("&pi;", "pi")
139
- .gsub("&Pi;", "Pi")
140
- .gsub("&rho;", "rho")
141
- .gsub("&beta;", "beta")
142
- .gsub("&sigma;", "sigma")
143
- .gsub("&Sigma;", "Sigma")
144
- .gsub("&tau;", "tau")
145
- .gsub("&upsilon;", "upsilon")
146
- .gsub("&phi;", "phi")
147
- .gsub("&Phi;", "Phi")
148
- .gsub("&varphi;", "varphi")
149
- .gsub("&chi;", "chi")
150
- .gsub("&psi;", "psi")
151
- .gsub("&Psi;", "Psi")
152
- .gsub("&omega;", "omega")
153
- end
154
-
155
- def html_entities_to_stem(input)
156
- input.gsub("&alpha;", "stem:[alpha]")
157
- .gsub("&beta;", "stem:[beta]")
158
- .gsub("&gamma;", "stem:[gamma]")
159
- .gsub("&Gamma;", "stem:[Gamma]")
160
- .gsub("&delta;", "stem:[delta]")
161
- .gsub("&Delta;", "stem:[Delta]")
162
- .gsub("&epsilon;", "stem:[epsilon]")
163
- .gsub("&varepsilon;", "stem:[varepsilon]")
164
- .gsub("&zeta;", "stem:[zeta]")
165
- .gsub("&eta;", "stem:[eta]")
166
- .gsub("&theta;", "stem:[theta]")
167
- .gsub("&Theta;", "stem:[Theta]")
168
- .gsub("&vartheta;", "stem:[vartheta]")
169
- .gsub("&iota;", "stem:[iota]")
170
- .gsub("&kappa;", "stem:[kappa]")
171
- .gsub("&lambda;", "stem:[lambda]")
172
- .gsub("&Lambda;", "stem:[Lambda]")
173
- .gsub("&mu;", "stem:[mu]")
174
- .gsub("&nu;", "stem:[nu]")
175
- .gsub("&xi;", "stem:[xi]")
176
- .gsub("&Xi;", "stem:[Xi]")
177
- .gsub("&pi;", "stem:[pi]")
178
- .gsub("&Pi;", "stem:[Pi]")
179
- .gsub("&rho;", "stem:[rho]")
180
- .gsub("&beta;", "stem:[beta]")
181
- .gsub("&sigma;", "stem:[sigma]")
182
- .gsub("&Sigma;", "stem:[Sigma]")
183
- .gsub("&tau;", "stem:[tau]")
184
- .gsub("&upsilon;", "stem:[upsilon]")
185
- .gsub("&phi;", "stem:[phi]")
186
- .gsub("&Phi;", "stem:[Phi]")
187
- .gsub("&varphi;", "stem:[varphi]")
188
- .gsub("&chi;", "stem:[chi]")
189
- .gsub("&psi;", "stem:[psi]")
190
- .gsub("&Psi;", "stem:[Psi]")
191
- .gsub("&omega;", "stem:[omega]")
192
153
  end
193
154
  end
194
155
  end
@@ -0,0 +1,124 @@
# frozen_string_literal: true

require "date"
require "fileutils"
require "net/http"
require "uri"
require "yaml"

module Iev
  # Looks up IEV concept data stored as one YAML file per concept.
  #
  # A concept is read from the directory +Iev.config.data_path+ when the file
  # exists there. Otherwise it comes from the cache directory
  # +Iev.config.cache_dir+, or is downloaded from
  # +Iev.config.remote_base_url+ and written to that cache.
  module DataSource
    class NotFoundError < StandardError; end

    # Classes, in addition to the YAML core types, that concept files may
    # contain.
    PERMITTED_YAML_CLASSES = [Date, Time].freeze
    private_constant :PERMITTED_YAML_CLASSES

    # A code is interpolated into file names and URLs, so it must not
    # contain path separators or other unexpected characters.
    VALID_CODE = /\A[\w.-]+\z/
    private_constant :VALID_CODE

    class << self
      # Fetch full concept data (all languages) for a given IEV code.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @return [Hash, nil] concept data hash or nil if not found
      def fetch_concept(code)
        fetch_concept_data(code)
      end

      # Fetch localized term data for a given IEV code and language.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @param lang [String] language code, e.g. "en" or "eng"
      # @return [Hash, nil] localized concept data or nil
      def fetch_term(code, lang)
        concept = fetch_concept(code)
        return nil unless concept

        concept[normalize_lang(lang)]
      end

      # Fetch the term designation string for a given IEV code and language.
      # This is the backward-compatible replacement for the scraping-based
      # Iev.get.
      #
      # The designation of the first term whose "normative_status" is
      # "preferred" is returned; without such a term, the first term is used.
      #
      # @param code [String] IEV code, e.g. "103-01-02"
      # @param lang [String] language code, e.g. "en"
      # @return [String, nil] term designation or nil
      def fetch_term_designation(code, lang)
        term_data = fetch_term(code, lang)
        return nil unless term_data

        terms = term_data["terms"]
        return nil unless terms&.any?

        preferred = terms.find { |t| t["normative_status"] == "preferred" }
        (preferred || terms.first)["designation"]
      end

      private

      # Local data wins over cached/remote data. Codes that could escape the
      # configured directories (e.g. "../x") are treated as not found.
      def fetch_concept_data(code)
        return nil unless VALID_CODE.match?(code.to_s)

        from_local(code) || from_remote(code)
      end

      # Reads the concept from Iev.config.data_path, if one is configured.
      def from_local(code)
        data_path = Iev.config.data_path
        return nil unless data_path

        path = File.join(data_path, "concept-#{code}.yaml")
        return nil unless File.exist?(path)

        load_yaml(File.read(path, encoding: "utf-8"))
      end

      # Returns the cached concept, or downloads and caches it.
      def from_remote(code)
        cache_key = "concept-#{code}.yaml"
        cached = read_cache(cache_key)
        return cached if cached

        url = "#{Iev.config.remote_base_url}/#{cache_key}"
        data = http_get_yaml(url)
        return nil unless data

        write_cache(cache_key, data)
        data
      end

      # Downloads and parses a YAML document.
      #
      # @return [Object, nil] parsed document; nil on HTTP 404, on any other
      #   non-200 status (after a warning) or on a network error (after a
      #   warning)
      def http_get_yaml(url)
        response = Net::HTTP.get_response(URI(url))

        case response.code
        when "200"
          load_yaml(response.body)
        when "404"
          nil
        else
          warn "IEV: Failed to fetch #{url}: HTTP #{response.code}"
          nil
        end
      rescue SocketError, SystemCallError, IOError, Timeout::Error => e
        # SystemCallError covers refused/reset connections (Errno::*) and
        # IOError covers a connection closed mid-response (EOFError).
        warn "IEV: Network error fetching #{url}: #{e.message}"
        nil
      end

      # Reads a cached document; a cache file that cannot be parsed counts
      # as a miss so that it is fetched again and overwritten.
      def read_cache(filename)
        cache_path = cache_file_path(filename)
        return nil unless File.exist?(cache_path)

        load_yaml(File.read(cache_path, encoding: "utf-8"))
      rescue Psych::Exception => e
        warn "IEV: Ignoring unreadable cache file #{cache_path}: #{e.message}"
        nil
      end

      # Stores +data+ as YAML in the cache directory, creating it if needed.
      def write_cache(filename, data)
        cache_path = cache_file_path(filename)
        FileUtils.mkdir_p(File.dirname(cache_path))
        File.write(cache_path, YAML.dump(data), encoding: "utf-8")
      end

      def cache_file_path(filename)
        File.join(Iev.config.cache_dir, filename)
      end

      def load_yaml(text)
        YAML.safe_load(text, permitted_classes: PERMITTED_YAML_CLASSES)
      end

      # Normalize language code: "en" → "eng", "de" → "deu", etc.
      # Three-character codes are returned as given; when the lookup fails
      # for any reason the original value is returned unchanged.
      def normalize_lang(lang)
        return lang if lang.length == 3

        Iso639Code.three_char_code(lang).first
      rescue StandardError
        lang
      end
    end
  end
end
@@ -0,0 +1,122 @@
# frozen_string_literal: true

require "fileutils"
require "pathname"

module Iev
  # Exports IEV data to Glossarist YAML format.
  #
  # Automatically detects input format from file extension:
  #   .xlsx / .xls              → Excel IEV export
  #   .sqlite3 / .sqlite / .db  → SQLite database
  #
  # @example Programmatic usage
  #   exporter = Iev::Exporter.new("data.xlsx", output_dir: "/tmp/output")
  #   collection = exporter.export
  #
  # @example With filters
  #   Iev::Exporter.new("data.sqlite3",
  #     output_dir: "/tmp/output",
  #     only_concepts: "103-%",
  #     only_languages: "en,fr",
  #   ).export
  class Exporter
    XLSX_EXTENSIONS = %w[.xlsx .xls].freeze
    SQLITE_EXTENSIONS = %w[.sqlite3 .sqlite .db].freeze

    attr_reader :input_path, :output_dir, :filters

    # @param input_path [String, Pathname] path to Excel or SQLite file
    # @param output_dir [String, Pathname] destination for YAML files
    # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
    # @param only_languages [String, nil] comma-separated language codes;
    #   whitespace around each code is ignored
    # @raise [ArgumentError] if the input file does not exist or its
    #   extension is not supported
    def initialize(input_path, output_dir: Dir.pwd,
                   only_concepts: nil, only_languages: nil)
      @input_path = Pathname.new(input_path)
      validate_input!

      @output_dir = Pathname.new(output_dir)
      # Filters that were not given are left out of the hash entirely.
      @filters = {
        only_concepts: only_concepts,
        only_languages: only_languages,
      }.compact
    end

    # Run the export pipeline: load → transform → save.
    # @return [Glossarist::ManagedConceptCollection]
    def export
      dataset = load_dataset
      collection = build_collection(dataset)
      save_collection(collection)
      collection
    end

    private

    def supported_format?
      ext = input_path.extname.downcase
      XLSX_EXTENSIONS.include?(ext) || SQLITE_EXTENSIONS.include?(ext)
    end

    def validate_input!
      # file? (not exist?) so that a directory named like "x.db" is rejected.
      unless input_path.file?
        raise ArgumentError, "Input file not found: #{input_path}"
      end

      return if supported_format?

      exts = (XLSX_EXTENSIONS + SQLITE_EXTENSIONS).join(", ")
      raise ArgumentError,
            "Unsupported format: #{input_path.extname}. Supported: #{exts}"
    end

    # validate_input! has already rejected every other extension, so
    # anything that is not an Excel file is a SQLite database.
    def input_format
      ext = input_path.extname.downcase
      XLSX_EXTENSIONS.include?(ext) ? :xlsx : :sqlite
    end

    def load_dataset
      case input_format
      when :xlsx then load_from_xlsx
      when :sqlite then load_from_sqlite
      end
    end

    # Imports the spreadsheet into an in-memory SQLite database and queries
    # that database.
    def load_from_xlsx
      require "creek" # loaded lazily: only needed for spreadsheet input
      db = Sequel.sqlite
      DbWriter.new(db).import_spreadsheet(input_path.to_s)
      apply_filters(db)
    end

    def load_from_sqlite
      apply_filters(Sequel.sqlite(input_path.to_s))
    end

    # Builds the query over the concepts table, narrowed by the filters
    # that were given.
    def apply_filters(db)
      query = db[:concepts]
      if filters[:only_concepts]
        query = query.where(Sequel.ilike(:ievref, filters[:only_concepts]))
      end
      if filters[:only_languages]
        query = query.where(language: language_codes)
      end
      query
    end

    # Splits the only_languages filter into codes, so that "en, fr" selects
    # "en" and "fr" rather than "en" and " fr".
    def language_codes
      filters[:only_languages].split(",").map(&:strip).reject(&:empty?)
    end

    # Groups the localized terms built from each row under their concept;
    # rows for which no term can be built are skipped.
    def build_collection(dataset)
      Glossarist::ManagedConceptCollection.new.tap do |collection|
        dataset.each do |row|
          term = TermBuilder.build_from(row)
          next unless term

          concept = collection.fetch_or_initialize(term.id)
          concept.add_l10n(term)
        end
      end
    end

    # Writes the collection to "<output_dir>/concepts", creating the
    # directory if needed.
    def save_collection(collection)
      concepts_dir = output_dir.expand_path.join("concepts")
      FileUtils.mkdir_p(concepts_dir)
      collection.save_to_files(concepts_dir.to_s)
    end
  end
end