iev 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 382780f439ab289a11d44053962bb7dbf043688fd68cb2328b8746429c19c6d3
4
- data.tar.gz: 72151b973c479bfc82e61b3d0da4648f11ac54e9344b8c71e3bbf5d91a3a3c3b
3
+ metadata.gz: 5100fcef07e984496be43178ab8ebaf48fb6b77a2144ccd722a1d7acc1135e57
4
+ data.tar.gz: 724f02e6431f8a3534a1f28e6894dd5462d65ad67ce736695b8e5f8c1b7f04ad
5
5
  SHA512:
6
- metadata.gz: d36fad5c1d853d39ffa5d8d67e98ecaedd3c40206237706ccb177ff14957ebea9616420bd41878b84ac34cdab1e9c5c7890143024b0a3070fb6c8a994d769518
7
- data.tar.gz: 8c5e7f0a9510fd562fc92db4ba225e5f9b7580e0e703525ec062b3d404ab57c804da824f94af0e3047738033577aaae8394970d47c349b1ac6b1979f7fbfa02d
6
+ metadata.gz: c239187ba2d7d06645dbebca3478bda513dd5e6da8937ff4c9fc51be1647bf2866338d6b79f865e9b3462359b45dce67999e78e8d0cd486c2264c3d3060b6abc
7
+ data.tar.gz: 88aded4d4dcedc76afb3160f7adb7f1746088d250a34750bb681bb69bc8c4f3d025b8d2f4c2d50df4aa66dfb4543e8699c9261ef5a3fd31ee84a6f5600828d52
@@ -1,7 +1,7 @@
1
1
  name: rake
2
2
 
3
3
  permissions:
4
- contents: read
4
+ contents: write
5
5
 
6
6
  on:
7
7
  push:
data/README.adoc CHANGED
@@ -85,19 +85,21 @@ Electropedia administrator.
85
85
  Iev.get("103-01-02", "en")
86
86
  => "functional"
87
87
 
88
- # If code not found
88
+ # If code not found, returns nil (does not raise)
89
89
  Iev.get("111-11-11", "en")
90
- => ""
90
+ => nil
91
91
 
92
- # If language not found
92
+ # If language not found, returns nil
93
93
  Iev.get("103-01-02", "eee")
94
94
  => nil
95
95
 
96
96
  # Fetch full concept data (all languages)
97
+ # Raises Iev::DataSource::NotFoundError if code not found
97
98
  Iev.fetch_concept("103-01-02")
98
99
  => { "id" => "103-01-02", "data" => { ... } }
99
100
 
100
101
  # Fetch localized term data
102
+ # Raises Iev::DataSource::NotFoundError if code not found
101
103
  Iev.fetch_term("103-01-02", "en")
102
104
  => { "term" => "functional", ... }
103
105
  ----
@@ -8,6 +8,11 @@ module Iev
8
8
  class Command < Thor
9
9
  include CommandHelper
10
10
 
11
+ desc "version", "Show iev gem version"
12
+ def version
13
+ puts "iev #{Iev::VERSION}"
14
+ end
15
+
11
16
  desc "export FILE", "Export IEV data to Glossarist YAML format"
12
17
  long_desc <<~DESC
13
18
  Exports IEV data from an Excel (.xlsx/.xls) or SQLite (.sqlite3/.sqlite/.db)
@@ -34,20 +39,30 @@ module Iev
34
39
  desc: "Enables debug messages about authoritative sources recognition"
35
40
  option :debug_relaton, type: :boolean, default: false,
36
41
  desc: "Enables debug messages about Relaton integration"
42
+ option :relaton, type: :boolean, default: false,
43
+ desc: "Fetch source URLs via Relaton (slow, makes network requests)"
37
44
  def export(file)
38
45
  handle_generic_options(options)
39
46
 
40
- Iev::Exporter.new(
47
+ exporter = Iev::Exporter.new(
41
48
  file,
42
49
  output_dir: options[:output],
43
50
  only_concepts: options[:only_concepts],
44
51
  only_languages: options[:only_languages],
45
- ).export
46
-
47
- info "Done!"
52
+ fetch_relaton_links: options[:relaton],
53
+ on_progress: method(:export_progress),
54
+ )
55
+ exporter.export
56
+ print_export_summary(exporter.stats)
57
+ rescue ArgumentError => e
58
+ error e.message
59
+ exit 1
60
+ rescue Sequel::Error => e
61
+ error "Database error: #{e.message}"
62
+ exit 1
48
63
  end
49
64
 
50
- desc "xlsx2yaml FILE", "Converts Excel IEV exports to YAMLs."
65
+ desc "xlsx2yaml FILE", "[DEPRECATED] Use 'export' instead."
51
66
  option :output, desc: "Output directory", aliases: :o, default: Dir.pwd
52
67
  option :only_concepts,
53
68
  desc: "Only process concepts with IEVREF matching this argument, " \
@@ -66,6 +81,7 @@ module Iev
66
81
  option :debug_sources, type: :boolean, default: false
67
82
  option :debug_relaton, type: :boolean, default: false
68
83
  def xlsx2yaml(file)
84
+ warn "[DEPRECATED] 'xlsx2yaml' is deprecated. Use 'export' instead."
69
85
  handle_generic_options(options)
70
86
 
71
87
  Iev::Exporter.new(
@@ -78,7 +94,7 @@ module Iev
78
94
  summary
79
95
  end
80
96
 
81
- desc "db2yaml DB_FILE", "Exports SQLite to IEV YAMLs."
97
+ desc "db2yaml DB_FILE", "[DEPRECATED] Use 'export' instead."
82
98
  option :output, desc: "Output directory", aliases: :o, default: Dir.pwd
83
99
  option :only_concepts,
84
100
  desc: "Only process concepts with IEVREF matching this argument, " \
@@ -97,6 +113,7 @@ module Iev
97
113
  option :debug_sources, type: :boolean, default: false
98
114
  option :debug_relaton, type: :boolean, default: false
99
115
  def db2yaml(dbfile)
116
+ warn "[DEPRECATED] 'db2yaml' is deprecated. Use 'export' instead."
100
117
  handle_generic_options(options)
101
118
 
102
119
  Iev::Exporter.new(
@@ -135,13 +152,14 @@ module Iev
135
152
  DataSource.fetch_concept(code)
136
153
  end
137
154
 
138
- unless raw
139
- warn "IEV: concept #{code} not found."
140
- exit 1
141
- end
142
-
143
155
  concept = build_concept_from_raw(code, raw)
144
156
  print_concept_grouped_yaml(concept)
157
+ rescue Iev::DataSource::NotFoundError
158
+ error "IEV concept not found: #{code}"
159
+ exit 1
160
+ rescue Ferrum::Error => e
161
+ error "Scraping failed: #{e.message}"
162
+ exit 1
145
163
  end
146
164
 
147
165
  def self.exit_on_failure?
@@ -24,6 +24,36 @@ module Iev
24
24
  info "Done!"
25
25
  end
26
26
 
27
+ def export_progress(current, total)
28
+ return unless $IEV_PROGRESS
29
+ return if total <= 1 # single-row dataset, skip progress
30
+
31
+ if current == total
32
+ Ui.info "" # clear progress line
33
+ else
34
+ Ui.progress "Processing #{current}/#{total}..."
35
+ end
36
+ end
37
+
38
+ def print_export_summary(stats)
39
+ return unless stats
40
+
41
+ s = stats
42
+ elapsed = format_elapsed(s[:elapsed_seconds])
43
+ info "Exported #{s[:concept_count]} concepts " \
44
+ "(#{s[:localized_count]} localized) in #{elapsed}"
45
+ end
46
+
47
+ def format_elapsed(seconds)
48
+ if seconds < 60
49
+ "%.1fs" % seconds
50
+ else
51
+ mins = (seconds / 60).to_i
52
+ secs = (seconds % 60).round
53
+ "#{mins}m #{secs}s"
54
+ end
55
+ end
56
+
27
57
  def handle_generic_options(options)
28
58
  $IEV_PROFILE = options[:profile]
29
59
  $IEV_PROGRESS = options.fetch(:progress, !ENV["CI"])
data/lib/iev/cli/ui.rb CHANGED
@@ -32,6 +32,11 @@ module Iev
32
32
  print "#{Helper.clear_progress}#{message}\n"
33
33
  end
34
34
 
35
+ # Prints error message to stderr.
36
+ def error(message)
37
+ Kernel.warn "Error: #{message}"
38
+ end
39
+
35
40
  # Sets an UI tag which will be prepended to messages printed with
36
41
  # #debug and #warn.
37
42
  def set_ui_tag(str)
@@ -48,8 +48,17 @@ module Iev
48
48
  mathml_to_asciimath(input)
49
49
  end
50
50
 
51
+ # Clear the Plurimath expression cache. Call between export runs.
52
+ def clear_cache
53
+ @math_cache = nil
54
+ end
55
+
51
56
  private
52
57
 
58
+ def math_cache
59
+ @math_cache ||= {}
60
+ end
61
+
53
62
  def mathml_to_asciimath(input)
54
63
  return input unless input&.match?(/<|&/)
55
64
 
@@ -58,17 +67,18 @@ module Iev
58
67
  to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
59
68
 
60
69
  to_asciimath.css("math").each do |math_element|
61
- asciimath = Plurimath::Math.parse(
62
- math_element.to_xml, :mathml
63
- ).to_asciimath.strip
70
+ math_xml = math_element.to_xml
71
+ asciimath = math_cache[math_xml] ||= begin
72
+ Plurimath::Math.parse(math_xml, :mathml).to_asciimath.strip
73
+ rescue Plurimath::Math::ParseError
74
+ ""
75
+ end
64
76
 
65
77
  if asciimath.empty?
66
78
  math_element.remove
67
79
  else
68
80
  math_element.replace "stem:[#{asciimath}]"
69
81
  end
70
- rescue Plurimath::Math::ParseError
71
- math_element.remove
72
82
  end
73
83
 
74
84
  html_to_asciimath(
@@ -79,6 +89,13 @@ module Iev
79
89
  def html_to_asciimath(input)
80
90
  return input if input.nil? || input.empty?
81
91
 
92
+ # Fast path: if no HTML elements remain that need Nokogiri processing
93
+ # (after parse_anchor_tag handles <i>/<sub>/<sup>/<ol>/<ul>/<font>),
94
+ # just do the Greek entity replacement.
95
+ unless input.match?(/<([iI]|sub|sup|ol|ul|font)\b/)
96
+ return html_entities_to_stem(input)
97
+ end
98
+
82
99
  to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
83
100
 
84
101
  to_asciimath.css("i").each do |math_element|
@@ -12,19 +12,21 @@ module Iev
12
12
  # Fetch full concept data (all languages) for a given IEV code.
13
13
  #
14
14
  # @param code [String] IEV code, e.g. "103-01-02"
15
- # @return [Hash, nil] concept data hash or nil if not found
15
+ # @return [Hash] concept data hash
16
+ # @raise [NotFoundError] if the concept does not exist
16
17
  def fetch_concept(code)
17
- fetch_concept_data(code)
18
+ fetch_concept_data(code) ||
19
+ raise(NotFoundError, "IEV concept not found: #{code}")
18
20
  end
19
21
 
20
22
  # Fetch localized term data for a given IEV code and language.
21
23
  #
22
24
  # @param code [String] IEV code, e.g. "103-01-02"
23
25
  # @param lang [String] language code, e.g. "en" or "eng"
24
- # @return [Hash, nil] localized concept data or nil
26
+ # @return [Hash, nil] localized concept data or nil if language not found
27
+ # @raise [NotFoundError] if the concept does not exist
25
28
  def fetch_term(code, lang)
26
29
  concept = fetch_concept(code)
27
- return nil unless concept
28
30
 
29
31
  lang_key = normalize_lang(lang)
30
32
  concept[lang_key]
@@ -35,7 +37,8 @@ module Iev
35
37
  #
36
38
  # @param code [String] IEV code, e.g. "103-01-02"
37
39
  # @param lang [String] language code, e.g. "en"
38
- # @return [String, nil] term designation or nil
40
+ # @return [String, nil] term designation or nil if not found
41
+ # @raise [NotFoundError] if the concept does not exist
39
42
  def fetch_term_designation(code, lang)
40
43
  term_data = fetch_term(code, lang)
41
44
  return nil unless term_data
data/lib/iev/exporter.rb CHANGED
@@ -27,12 +27,18 @@ module Iev
27
27
  # @param output_dir [String, Pathname] destination for YAML files
28
28
  # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
29
29
  # @param only_languages [String, nil] comma-separated language codes
30
+ # @param fetch_relaton_links [Boolean] fetch source URLs via Relaton
31
+ # @param on_progress [Proc, nil] callback (current, total) during build
30
32
  def initialize(input_path, output_dir: Dir.pwd,
31
- only_concepts: nil, only_languages: nil)
33
+ only_concepts: nil, only_languages: nil,
34
+ fetch_relaton_links: false,
35
+ on_progress: nil)
32
36
  @input_path = Pathname.new(input_path)
33
37
  validate_input!
34
38
 
35
39
  @output_dir = Pathname.new(output_dir)
40
+ @fetch_relaton_links = fetch_relaton_links
41
+ @on_progress = on_progress
36
42
  @filters = {
37
43
  only_concepts: only_concepts,
38
44
  only_languages: only_languages,
@@ -42,12 +48,23 @@ module Iev
42
48
  # Run the export pipeline: load → transform → save.
43
49
  # @return [Glossarist::ManagedConceptCollection]
44
50
  def export
51
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
45
52
  dataset = load_dataset
46
53
  collection = build_collection(dataset)
47
54
  save_collection(collection)
55
+ elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
56
+
57
+ @stats = {
58
+ concept_count: collection.count,
59
+ localized_count: localized_count(collection),
60
+ elapsed_seconds: elapsed,
61
+ }
48
62
  collection
49
63
  end
50
64
 
65
+ # @return [Hash, nil] stats from last export, or nil if export hasn't run
66
+ attr_reader :stats
67
+
51
68
  private
52
69
 
53
70
  def supported_format?
@@ -102,15 +119,33 @@ module Iev
102
119
  end
103
120
 
104
121
  def build_collection(dataset)
105
- Glossarist::ManagedConceptCollection.new.tap do |collection|
106
- dataset.each do |row|
107
- term = TermBuilder.build_from(row)
108
- next unless term
109
-
110
- concept = collection.fetch_or_initialize(term.id)
111
- concept.add_l10n(term)
122
+ SourceParser.relaton_enabled = @fetch_relaton_links
123
+
124
+ # Use a hash index for O(1) concept lookup instead of
125
+ # Glossarist's O(n) fetch_or_initialize which does linear scan.
126
+ concept_index = {}
127
+ collection = Glossarist::ManagedConceptCollection.new
128
+ row_count = dataset.count
129
+ current = 0
130
+
131
+ dataset.each do |row|
132
+ current += 1
133
+ @on_progress&.call(current, row_count)
134
+
135
+ term = TermBuilder.build_from(row)
136
+ next unless term
137
+
138
+ concept = concept_index[term.id] ||= begin
139
+ c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
140
+ collection.store(c)
141
+ c
112
142
  end
143
+ concept.add_l10n(term)
113
144
  end
145
+
146
+ collection
147
+ ensure
148
+ SourceParser.relaton_enabled = true
114
149
  end
115
150
 
116
151
  def save_collection(collection)
@@ -118,5 +153,9 @@ module Iev
118
153
  FileUtils.mkdir_p(concepts_dir)
119
154
  collection.save_to_files(concepts_dir.to_s)
120
155
  end
156
+
157
+ def localized_count(collection)
158
+ collection.sum { |c| c.localized_concepts.count }
159
+ end
121
160
  end
122
161
  end
@@ -14,6 +14,12 @@ module Iev
14
14
  include Utilities
15
15
  using DataConversions
16
16
 
17
+ # When false, obtain_source_link skips Relaton network calls.
18
+ @relaton_enabled = true
19
+ class << self
20
+ attr_accessor :relaton_enabled
21
+ end
22
+
17
23
  attr_reader :src_split, :parsed_sources, :raw_str, :src_str
18
24
 
19
25
  def initialize(source_str, term_domain)
@@ -89,7 +95,7 @@ module Iev
89
95
  origin: origin,
90
96
  modification: relationship[:modification],
91
97
  )
92
- rescue ::RelatonBib::RequestError => e
98
+ rescue ::RelatonBib::RequestError, Socket::ResolutionError, SocketError => e
93
99
  warn e.message
94
100
  end
95
101
 
@@ -347,10 +353,11 @@ module Iev
347
353
 
348
354
  # Uses Relaton to obtain link for given source ref.
349
355
  def obtain_source_link(ref)
356
+ return nil unless self.class.relaton_enabled
350
357
  return nil unless defined?(RelatonDb)
351
358
 
352
359
  RelatonDb.instance.fetch(ref)&.url
353
- rescue ::RelatonBib::RequestError => e
360
+ rescue ::RelatonBib::RequestError, Socket::ResolutionError, SocketError => e
354
361
  warn e.message
355
362
  nil
356
363
  end
@@ -298,6 +298,8 @@ module Iev
298
298
  end
299
299
 
300
300
  def strip_html_comments(str)
301
+ return str unless str.include?("<!--")
302
+
301
303
  doc = Nokogiri::HTML::DocumentFragment.parse(str)
302
304
  comments = doc.children.select(&:comment?)
303
305
  return str if comments.empty?
data/lib/iev/utilities.rb CHANGED
@@ -13,10 +13,14 @@ module Iev
13
13
 
14
14
  def parse_anchor_tag(text, term_domain)
15
15
  return nil if text.nil?
16
+ return text unless text.include?("<")
16
17
 
17
18
  text = process_simg_figures(text, term_domain)
18
19
  text = fix_unquoted_href(text)
19
20
 
21
+ # Second check: regex substitutions may have consumed all tags
22
+ return text unless text.include?("<")
23
+
20
24
  doc = Nokogiri::HTML::DocumentFragment.parse(text)
21
25
  nodes_to_adoc(doc.children, term_domain)
22
26
  end
@@ -86,11 +90,45 @@ module Iev
86
90
  "#{IMAGE_PATH_PREFIX}/#{term_domain}/#{src}[]"
87
91
  when "p", "div", "span"
88
92
  inner
93
+ when "i"
94
+ convert_italic(inner)
95
+ when "sub"
96
+ inner.empty? ? "" : "~#{inner}~"
97
+ when "sup"
98
+ inner.empty? ? "" : "^#{inner}^"
99
+ when "ol"
100
+ convert_list(node, ". ")
101
+ when "ul"
102
+ convert_list(node, "* ")
103
+ when "li"
104
+ inner
105
+ when "font"
106
+ convert_font(node, inner)
89
107
  else
90
108
  node.to_s
91
109
  end
92
110
  end
93
111
 
112
+ def convert_italic(text)
113
+ case text.length
114
+ when 0
115
+ ""
116
+ when 1..12
117
+ "stem:[#{text}]"
118
+ else
119
+ "_#{text}_"
120
+ end
121
+ end
122
+
123
+ def convert_list(node, prefix)
124
+ node.css("li").map { |li| "#{prefix}#{li.text}" }.join
125
+ end
126
+
127
+ def convert_font(node, inner)
128
+ style = node["style"].to_s
129
+ style.include?("sans-serif") ? "`#{inner}`" : inner
130
+ end
131
+
94
132
  def convert_link(node, inner)
95
133
  href = (node["href"] || "").to_s.strip
96
134
 
data/lib/iev/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Iev
4
- VERSION = "0.4.0"
4
+ VERSION = "0.4.2"
5
5
  end
data/lib/iev.rb CHANGED
@@ -45,17 +45,19 @@ module Iev
45
45
  # @param [String] lang language code, for example "en"
46
46
  #
47
47
  # @return [String, nil] if found then term,
48
- # if code not found then nil,
49
- # if language not found then nil.
48
+ # if code or language not found then nil.
50
49
  #
51
50
  def self.get(code, lang)
52
51
  DataSource.fetch_term_designation(code, lang)
52
+ rescue DataSource::NotFoundError
53
+ nil
53
54
  end
54
55
 
55
56
  # Fetch full concept data (all languages) for a given IEV code.
56
57
  #
57
58
  # @param [String] code IEV code, e.g. "103-01-02"
58
- # @return [Hash, nil] concept data hash with all languages
59
+ # @return [Hash] concept data hash with all languages
60
+ # @raise [DataSource::NotFoundError] if concept not found
59
61
  def self.fetch_concept(code)
60
62
  DataSource.fetch_concept(code)
61
63
  end
@@ -64,7 +66,8 @@ module Iev
64
66
  #
65
67
  # @param [String] code IEV code, e.g. "103-01-02"
66
68
  # @param [String] lang language code, e.g. "en" or "eng"
67
- # @return [Hash, nil] localized concept data
69
+ # @return [Hash, nil] localized concept data or nil if not found
70
+ # @raise [DataSource::NotFoundError] if concept not found
68
71
  def self.fetch_term(code, lang)
69
72
  DataSource.fetch_term(code, lang)
70
73
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iev
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-27 00:00:00.000000000 Z
11
+ date: 2026-04-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: creek