iev 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +0 -2
- data/.github/workflows/release.yml +3 -1
- data/.gitignore +3 -1
- data/CLAUDE.md +50 -0
- data/Gemfile +3 -0
- data/README.adoc +65 -15
- data/exe/iev +11 -0
- data/iev.gemspec +5 -4
- data/lib/iev/cli/command.rb +119 -76
- data/lib/iev/cli/command_helper.rb +55 -36
- data/lib/iev/config.rb +31 -0
- data/lib/iev/converter/mathml_to_asciimath.rb +119 -158
- data/lib/iev/data_source.rb +124 -0
- data/lib/iev/exporter.rb +122 -0
- data/lib/iev/scraper/page_parser.rb +176 -0
- data/lib/iev/scraper.rb +135 -0
- data/lib/iev/source_parser.rb +31 -18
- data/lib/iev/supersession_parser.rb +9 -13
- data/lib/iev/term_attrs_parser.rb +21 -7
- data/lib/iev/term_builder.rb +100 -94
- data/lib/iev/utilities.rb +91 -42
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +47 -35
- metadata +34 -13
- data/lib/iev/db.rb +0 -82
- data/lib/iev/db_cache.rb +0 -124
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 382780f439ab289a11d44053962bb7dbf043688fd68cb2328b8746429c19c6d3
|
|
4
|
+
data.tar.gz: 72151b973c479bfc82e61b3d0da4648f11ac54e9344b8c71e3bbf5d91a3a3c3b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d36fad5c1d853d39ffa5d8d67e98ecaedd3c40206237706ccb177ff14957ebea9616420bd41878b84ac34cdab1e9c5c7890143024b0a3070fb6c8a994d769518
|
|
7
|
+
data.tar.gz: 8c5e7f0a9510fd562fc92db4ba225e5f9b7580e0e703525ec062b3d404ab57c804da824f94af0e3047738033577aaae8394970d47c349b1ac6b1979f7fbfa02d
|
data/.github/workflows/rake.yml
CHANGED
|
@@ -2,6 +2,8 @@ name: release
|
|
|
2
2
|
|
|
3
3
|
permissions:
|
|
4
4
|
contents: write
|
|
5
|
+
packages: write
|
|
6
|
+
id-token: write
|
|
5
7
|
|
|
6
8
|
on:
|
|
7
9
|
workflow_dispatch:
|
|
@@ -22,4 +24,4 @@ jobs:
|
|
|
22
24
|
next_version: ${{ github.event.inputs.next_version }}
|
|
23
25
|
secrets:
|
|
24
26
|
rubygems-api-key: ${{ secrets.GLOSSARIST_CI_RUBYGEMS_API_KEY }}
|
|
25
|
-
pat_token: ${{ secrets.
|
|
27
|
+
pat_token: ${{ secrets.GLOSSARIST_CI_PAT_TOKEN }}
|
data/.gitignore
CHANGED
data/CLAUDE.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Build and Test Commands
|
|
6
|
+
|
|
7
|
+
- Run all tests: `bundle exec rake` or `bundle exec rspec`
|
|
8
|
+
- Run a single test file: `bundle exec rspec spec/iev/term_builder_spec.rb`
|
|
9
|
+
- Run a single test example: `bundle exec rspec spec/iev/term_builder_spec.rb:42`
|
|
10
|
+
- Lint: `bundle exec rubocop`
|
|
11
|
+
- CI uses the shared `metanorma/ci` generic-rake workflow
|
|
12
|
+
|
|
13
|
+
## Architecture
|
|
14
|
+
|
|
15
|
+
This is a Ruby gem (`iev`) for working with the International Electrotechnical Vocabulary (IEV) from IEC Electropedia. It is part of the Glossarist ecosystem and depends on the `glossarist` gem for concept modeling.
|
|
16
|
+
|
|
17
|
+
### Two Main Data Flows
|
|
18
|
+
|
|
19
|
+
**1. Fetching terms (API usage):** `Iev.get(code, lang)` or `Iev.fetch_concept(code)`
|
|
20
|
+
- `DataSource` checks a local path (`IEV_DATA_PATH` env var) for YAML concept files, then falls back to fetching from a remote GitHub repo (`glossarist/glossarist-data-iev`), with file-based caching in `IEV_CACHE_DIR` (defaults to system tmpdir).
|
|
21
|
+
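- `Db` formerly wrapped `DataSource` with a two-tier cache (global + local) using `DbCache`, which stored versioned XML files on disk; `lib/iev/db.rb` and `lib/iev/db_cache.rb` are removed in 0.4.0, and the CLI `fetch` command calls `DataSource.fetch_concept` directly.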
|
|
22
|
+
|
|
23
|
+
**2. Converting Excel exports (CLI usage):** via `exe/iev` (Thor-based)
|
|
24
|
+
- `xlsx2yaml`: Excel → `DbWriter` (in-memory SQLite) → `TermBuilder` (row-by-row) → `Glossarist::LocalizedConcept` objects → YAML concept files
|
|
25
|
+
- `xlsx2db`: Excel → `DbWriter` → SQLite file
|
|
26
|
+
- `db2yaml`: SQLite → `TermBuilder` → YAML concept files
|
|
27
|
+
|
|
28
|
+
### Key Modules
|
|
29
|
+
|
|
30
|
+
- `TermBuilder` — the core converter that turns a spreadsheet row into a `Glossarist::LocalizedConcept`. Handles definition splitting (notes/examples extraction), term designation parsing, and source parsing.
|
|
31
|
+
- `SourceParser` — parses the SOURCE column from IEV exports, normalizing references (CEI→IEC, UIT→ITU, etc.) and extracting ref/clause/relationship using extensive regex matching.
|
|
32
|
+
- `TermAttrsParser` — parses the TERMATTRIBUTE field (gender, plurality, part of speech, geographical area, abbreviations).
|
|
33
|
+
- `SupersessionParser` — parses the REPLACES field for deprecated term relationships.
|
|
34
|
+
- `Converter::MathmlToAsciimath` — converts MathML markup to AsciiMath using Plurimath.
|
|
35
|
+
- `Utilities` — HTML processing: converts IEV cross-references (`<a href=IEV...>`) to `{{term, IEV:code}}` format, handles figures, images, bold tags, and newline normalization.
|
|
36
|
+
|
|
37
|
+
### Configuration
|
|
38
|
+
|
|
39
|
+
`Iev.configure` yields a `Config` object with:
|
|
40
|
+
- `data_path` — local path to YAML concept files (env: `IEV_DATA_PATH`)
|
|
41
|
+
- `cache_dir` — cache directory (env: `IEV_CACHE_DIR`, default: system tmpdir)
|
|
42
|
+
- `remote_base_url` — base URL for remote concept YAML fetching
|
|
43
|
+
|
|
44
|
+
## Key Conventions
|
|
45
|
+
|
|
46
|
+
- Ruby >= 3.2.0 required (see `required_ruby_version` in `iev.gemspec`)
|
|
47
|
+
- `plurimath` and `unitsml` are declared as gem dependencies in `iev.gemspec` but are loaded with `rescue LoadError`, so the `DataSource` API still works if they cannot be loaded
|
|
48
|
+
- The IEV Excel export format is specific to IEC-internal use; column structure is documented in README.adoc
|
|
49
|
+
- Language codes: the spreadsheet uses ISO 639-1 (2-char like "en"), internally converted to ISO 639-2/3 (3-char like "eng") via `Iso639Code` and `DataConversions`
|
|
50
|
+
- `DataConversions` is a refinement (`using DataConversions`) that adds `.sanitize` and `.decode_html` methods to String
|
data/Gemfile
CHANGED
data/README.adoc
CHANGED
|
@@ -6,6 +6,13 @@ image:https://github.com/glossarist/iev/workflows/rake/badge.svg["Build Status",
|
|
|
6
6
|
image:https://img.shields.io/github/issues-pr-raw/glossarist/iev.svg["Pull Requests", link="https://github.com/glossarist/iev/pulls"]
|
|
7
7
|
image:https://img.shields.io/github/commits-since/glossarist/iev/latest.svg["Commits since latest",link="https://github.com/glossarist/iev/releases"]
|
|
8
8
|
|
|
9
|
+
[WARNING]
|
|
10
|
+
--
|
|
11
|
+
As of 2026-02-10, Electropedia is behind AWS WAF, which blocks simple HTTP clients.
|
|
12
|
+
Scraping now uses Ferrum (headless Chrome via DevTools Protocol) to handle the WAF challenge.
|
|
13
|
+
Chrome/Chromium must be installed for scraping to work.
|
|
14
|
+
--
|
|
15
|
+
|
|
9
16
|
|
|
10
17
|
== Purpose
|
|
11
18
|
|
|
@@ -54,24 +61,26 @@ $ gem install iev
|
|
|
54
61
|
|
|
55
62
|
The gem comes with the `iev` executable, which provides the following commands:
|
|
56
63
|
|
|
57
|
-
`iev
|
|
58
|
-
|
|
64
|
+
`iev export FILE -o OUTPUT_DIR`::
|
|
65
|
+
Exports IEV data to Glossarist YAML format.
|
|
66
|
+
Supports both Excel (`.xlsx`/`.xls`) and SQLite (`.sqlite3`/`.sqlite`/`.db`) input files.
|
|
67
|
+
Format is detected automatically from the file extension.
|
|
59
68
|
|
|
60
69
|
`iev xlsx2db FILE`::
|
|
61
70
|
Imports Excel to SQLite database.
|
|
62
71
|
|
|
63
|
-
`iev
|
|
64
|
-
|
|
72
|
+
`iev fetch CODE`::
|
|
73
|
+
Fetches a single IEV concept and outputs YAML to stdout.
|
|
65
74
|
|
|
66
75
|
WARNING: The IEV XLSX export files can only be obtained from the IEC
|
|
67
76
|
Electropedia administrator.
|
|
68
77
|
|
|
69
78
|
|
|
70
|
-
=== Fetching IEV terms from
|
|
79
|
+
=== Fetching IEV terms from cached data
|
|
71
80
|
|
|
72
81
|
[source, ruby]
|
|
73
82
|
----
|
|
74
|
-
# Get term
|
|
83
|
+
# Get term designation (from local YAML or GitHub remote)
|
|
75
84
|
|
|
76
85
|
Iev.get("103-01-02", "en")
|
|
77
86
|
=> "functional"
|
|
@@ -83,26 +92,67 @@ Iev.get("111-11-11", "en")
|
|
|
83
92
|
# If language not found
|
|
84
93
|
Iev.get("103-01-02", "eee")
|
|
85
94
|
=> nil
|
|
95
|
+
|
|
96
|
+
# Fetch full concept data (all languages)
|
|
97
|
+
Iev.fetch_concept("103-01-02")
|
|
98
|
+
=> { "id" => "103-01-02", "data" => { ... } }
|
|
99
|
+
|
|
100
|
+
# Fetch localized term data
|
|
101
|
+
Iev.fetch_term("103-01-02", "en")
|
|
102
|
+
=> { "term" => "functional", ... }
|
|
103
|
+
----
|
|
104
|
+
|
|
105
|
+
=== Scraping IEV terms from Electropedia
|
|
106
|
+
|
|
107
|
+
Requires Chrome/Chromium to be installed. Uses Ferrum (headless Chrome) to bypass AWS WAF.
|
|
108
|
+
|
|
109
|
+
[source, ruby]
|
|
110
|
+
----
|
|
111
|
+
# Scrape concept data directly from electropedia.org
|
|
112
|
+
Iev.scrape_concept("103-01-02")
|
|
113
|
+
=> { "id" => "103-01-02", "data" => { "identifier" => "103-01-02", "localized_concepts" => { "eng" => { ... }, ... } } }
|
|
114
|
+
|
|
115
|
+
# Custom browser options (e.g., headless mode, window size)
|
|
116
|
+
scraper = Iev::Scraper.new(browser_opts: { headless: true, window_size: [1280, 800] })
|
|
117
|
+
concept = scraper.fetch_concept("103-01-02")
|
|
86
118
|
----
|
|
87
119
|
|
|
88
120
|
|
|
89
|
-
=== Converting IEV
|
|
121
|
+
=== Converting IEV data to a Glossarist dataset
|
|
90
122
|
|
|
91
|
-
|
|
123
|
+
The `export` command converts an IEV Excel export or SQLite database into
|
|
124
|
+
Glossarist YAML concept files:
|
|
92
125
|
|
|
93
126
|
[source,sh]
|
|
94
127
|
----
|
|
95
|
-
|
|
128
|
+
# From an Excel export
|
|
129
|
+
$ iev export termbase.xlsx -o /path/to/output
|
|
130
|
+
|
|
131
|
+
# From a SQLite database
|
|
132
|
+
$ iev export termbase.sqlite3 -o /path/to/output
|
|
133
|
+
|
|
134
|
+
# With filters
|
|
135
|
+
$ iev export termbase.xlsx -o /output --only-concepts "103-%" --only-languages "en,fr"
|
|
96
136
|
----
|
|
97
137
|
|
|
98
|
-
|
|
138
|
+
The output directory will contain a `concepts/` subdirectory with Glossarist
|
|
139
|
+
concept and localized concept YAML files.
|
|
99
140
|
|
|
100
|
-
|
|
101
|
-
the current working directory;
|
|
141
|
+
You can also use the `Iev::Exporter` class programmatically:
|
|
102
142
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
143
|
+
[source,ruby]
|
|
144
|
+
----
|
|
145
|
+
# Export from Excel
|
|
146
|
+
Iev::Exporter.new("termbase.xlsx", output_dir: "/path/to/output").export
|
|
147
|
+
|
|
148
|
+
# Export from SQLite with filters
|
|
149
|
+
collection = Iev::Exporter.new("termbase.sqlite3",
|
|
150
|
+
output_dir: "/path/to/output",
|
|
151
|
+
only_concepts: "103-%",
|
|
152
|
+
only_languages: "en,fr",
|
|
153
|
+
).export
|
|
154
|
+
# collection is a Glossarist::ManagedConceptCollection
|
|
155
|
+
----
|
|
106
156
|
|
|
107
157
|
|
|
108
158
|
== Structure of the IEV Excel export
|
data/exe/iev
CHANGED
|
@@ -2,6 +2,17 @@
|
|
|
2
2
|
# frozen_string_literal: true
|
|
3
3
|
|
|
4
4
|
require_relative "../lib/iev"
|
|
5
|
+
|
|
6
|
+
# CLI dependencies — loaded eagerly for the conversion pipeline
|
|
7
|
+
require "benchmark"
|
|
8
|
+
require "creek"
|
|
9
|
+
require "glossarist"
|
|
10
|
+
require "nokogiri"
|
|
11
|
+
require "relaton"
|
|
12
|
+
require "relaton_bib"
|
|
13
|
+
require "sequel"
|
|
14
|
+
require "thor"
|
|
15
|
+
|
|
5
16
|
require_relative "../lib/iev/cli"
|
|
6
17
|
|
|
7
18
|
Iev::Cli.start(ARGV)
|
data/iev.gemspec
CHANGED
|
@@ -19,16 +19,17 @@ Gem::Specification.new do |spec|
|
|
|
19
19
|
spec.bindir = "exe"
|
|
20
20
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
21
21
|
spec.require_paths = ["lib"]
|
|
22
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 3.
|
|
22
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 3.2.0")
|
|
23
23
|
|
|
24
24
|
spec.add_dependency "creek", "~> 2.6"
|
|
25
25
|
spec.add_dependency "glossarist", ">= 2.3.0"
|
|
26
|
-
spec.add_dependency "
|
|
27
|
-
spec.add_dependency "nokogiri", "~> 1.
|
|
26
|
+
spec.add_dependency "ferrum", "~> 0.15"
|
|
27
|
+
spec.add_dependency "nokogiri", "~> 1.19"
|
|
28
28
|
spec.add_dependency "plurimath"
|
|
29
|
+
spec.add_dependency "lutaml-model", "~> 0.8.0"
|
|
29
30
|
spec.add_dependency "relaton", "~> 1.18"
|
|
30
31
|
spec.add_dependency "sequel", "~> 5.40"
|
|
31
|
-
spec.add_dependency "sqlite3", "~> 1.7
|
|
32
|
+
spec.add_dependency "sqlite3", "~> 1.7"
|
|
32
33
|
spec.add_dependency "thor", "~> 1.0"
|
|
33
34
|
spec.add_dependency "unitsml"
|
|
34
35
|
end
|
data/lib/iev/cli/command.rb
CHANGED
|
@@ -8,102 +8,145 @@ module Iev
|
|
|
8
8
|
class Command < Thor
|
|
9
9
|
include CommandHelper
|
|
10
10
|
|
|
11
|
+
desc "export FILE", "Export IEV data to Glossarist YAML format"
|
|
12
|
+
long_desc <<~DESC
|
|
13
|
+
Exports IEV data from an Excel (.xlsx/.xls) or SQLite (.sqlite3/.sqlite/.db)
|
|
14
|
+
file to Glossarist YAML concept files.
|
|
15
|
+
|
|
16
|
+
The input format is detected automatically from the file extension.
|
|
17
|
+
DESC
|
|
18
|
+
option :output, desc: "Output directory", aliases: :o, default: Dir.pwd
|
|
19
|
+
option :only_concepts,
|
|
20
|
+
desc: "Only process concepts with IEVREF matching this pattern " \
|
|
21
|
+
"(SQL LIKE wildcards: % and _)"
|
|
22
|
+
option :only_languages,
|
|
23
|
+
desc: "Only export these languages, skip concepts which aren't " \
|
|
24
|
+
"translated to any of them (comma-separated list, language " \
|
|
25
|
+
"codes must be as in spreadsheet)"
|
|
26
|
+
option :progress, type: :boolean,
|
|
27
|
+
desc: "Enables or disables progress indicator. By default disabled " \
|
|
28
|
+
"when 'CI' environment variable is set and enabled otherwise"
|
|
29
|
+
option :profile, type: :boolean, default: false,
|
|
30
|
+
desc: "Generates profiler reports for this program, requires ruby-prof"
|
|
31
|
+
option :debug_term_attributes, type: :boolean, default: false,
|
|
32
|
+
desc: "Enables debug messages about term attributes recognition"
|
|
33
|
+
option :debug_sources, type: :boolean, default: false,
|
|
34
|
+
desc: "Enables debug messages about authoritative sources recognition"
|
|
35
|
+
option :debug_relaton, type: :boolean, default: false,
|
|
36
|
+
desc: "Enables debug messages about Relaton integration"
|
|
37
|
+
def export(file)
|
|
38
|
+
handle_generic_options(options)
|
|
39
|
+
|
|
40
|
+
Iev::Exporter.new(
|
|
41
|
+
file,
|
|
42
|
+
output_dir: options[:output],
|
|
43
|
+
only_concepts: options[:only_concepts],
|
|
44
|
+
only_languages: options[:only_languages],
|
|
45
|
+
).export
|
|
46
|
+
|
|
47
|
+
info "Done!"
|
|
48
|
+
end
|
|
49
|
+
|
|
11
50
|
desc "xlsx2yaml FILE", "Converts Excel IEV exports to YAMLs."
|
|
51
|
+
option :output, desc: "Output directory", aliases: :o, default: Dir.pwd
|
|
52
|
+
option :only_concepts,
|
|
53
|
+
desc: "Only process concepts with IEVREF matching this argument, " \
|
|
54
|
+
"'%' and '_' wildcards are supported and have meaning as in SQL " \
|
|
55
|
+
"LIKE operator"
|
|
56
|
+
option :only_languages,
|
|
57
|
+
desc: "Only export these languages, skip concepts which aren't " \
|
|
58
|
+
"translated to any of them (comma-separated list, language " \
|
|
59
|
+
"codes must be as in spreadsheet)"
|
|
60
|
+
option :progress, type: :boolean,
|
|
61
|
+
desc: "Enables or disables progress indicator. By default disabled " \
|
|
62
|
+
"when 'CI' environment variable is set and enabled otherwise"
|
|
63
|
+
option :profile, type: :boolean, default: false,
|
|
64
|
+
desc: "Generates profiler reports for this program, requires ruby-prof"
|
|
65
|
+
option :debug_term_attributes, type: :boolean, default: false
|
|
66
|
+
option :debug_sources, type: :boolean, default: false
|
|
67
|
+
option :debug_relaton, type: :boolean, default: false
|
|
12
68
|
def xlsx2yaml(file)
|
|
13
69
|
handle_generic_options(options)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
70
|
+
|
|
71
|
+
Iev::Exporter.new(
|
|
72
|
+
file,
|
|
73
|
+
output_dir: options[:output],
|
|
74
|
+
only_concepts: options[:only_concepts],
|
|
75
|
+
only_languages: options[:only_languages],
|
|
76
|
+
).export
|
|
77
|
+
|
|
78
|
+
summary
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
desc "db2yaml DB_FILE", "Exports SQLite to IEV YAMLs."
|
|
82
|
+
option :output, desc: "Output directory", aliases: :o, default: Dir.pwd
|
|
83
|
+
option :only_concepts,
|
|
84
|
+
desc: "Only process concepts with IEVREF matching this argument, " \
|
|
85
|
+
"'%' and '_' wildcards are supported and have meaning as in SQL " \
|
|
86
|
+
"LIKE operator"
|
|
87
|
+
option :only_languages,
|
|
88
|
+
desc: "Only export these languages, skip concepts which aren't " \
|
|
89
|
+
"translated to any of them (comma-separated list, language " \
|
|
90
|
+
"codes must be as in spreadsheet)"
|
|
91
|
+
option :progress, type: :boolean,
|
|
92
|
+
desc: "Enables or disables progress indicator. By default disabled " \
|
|
93
|
+
"when 'CI' environment variable is set and enabled otherwise"
|
|
94
|
+
option :profile, type: :boolean, default: false,
|
|
95
|
+
desc: "Generates profiler reports for this program, requires ruby-prof"
|
|
96
|
+
option :debug_term_attributes, type: :boolean, default: false
|
|
97
|
+
option :debug_sources, type: :boolean, default: false
|
|
98
|
+
option :debug_relaton, type: :boolean, default: false
|
|
99
|
+
def db2yaml(dbfile)
|
|
100
|
+
handle_generic_options(options)
|
|
101
|
+
|
|
102
|
+
Iev::Exporter.new(
|
|
103
|
+
dbfile,
|
|
104
|
+
output_dir: options[:output],
|
|
105
|
+
only_concepts: options[:only_concepts],
|
|
106
|
+
only_languages: options[:only_languages],
|
|
107
|
+
).export
|
|
108
|
+
|
|
19
109
|
summary
|
|
20
110
|
end
|
|
21
111
|
|
|
22
112
|
desc "xlsx2db FILE", "Imports Excel to SQLite database."
|
|
113
|
+
option :output, desc: "Output file", aliases: :o,
|
|
114
|
+
default: File.join(Dir.pwd, "concepts.sqlite3")
|
|
115
|
+
option :progress, type: :boolean,
|
|
116
|
+
desc: "Enables or disables progress indicator. By default disabled " \
|
|
117
|
+
"when 'CI' environment variable is set and enabled otherwise"
|
|
118
|
+
option :profile, type: :boolean, default: false,
|
|
119
|
+
desc: "Generates profiler reports for this program, requires ruby-prof"
|
|
23
120
|
def xlsx2db(file)
|
|
24
121
|
handle_generic_options(options)
|
|
25
|
-
# Instantiating an in-memory db and dumping it later is faster than
|
|
26
|
-
# just working on file db.
|
|
27
122
|
db = Sequel.sqlite
|
|
28
123
|
DbWriter.new(db).import_spreadsheet(file)
|
|
29
124
|
save_db_to_file(db, options[:output])
|
|
30
125
|
summary
|
|
31
126
|
end
|
|
32
127
|
|
|
33
|
-
desc "
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
128
|
+
desc "fetch CODE", "Fetch an IEV concept and output YAML to stdout."
|
|
129
|
+
option :scrape, type: :boolean, default: false,
|
|
130
|
+
desc: "Scrape from Electropedia instead of using cached data"
|
|
131
|
+
def fetch(code)
|
|
132
|
+
raw = if options[:scrape]
|
|
133
|
+
Scraper.new.fetch_concept(code)
|
|
134
|
+
else
|
|
135
|
+
DataSource.fetch_concept(code)
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
unless raw
|
|
139
|
+
warn "IEV: concept #{code} not found."
|
|
140
|
+
exit 1
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
concept = build_concept_from_raw(code, raw)
|
|
144
|
+
print_concept_grouped_yaml(concept)
|
|
41
145
|
end
|
|
42
146
|
|
|
43
147
|
def self.exit_on_failure?
|
|
44
148
|
true
|
|
45
149
|
end
|
|
46
|
-
|
|
47
|
-
# Options must be declared at the bottom because Thor must have commands
|
|
48
|
-
# defined in advance.
|
|
49
|
-
|
|
50
|
-
def self.shared_option(name, methods:, **kwargs)
|
|
51
|
-
[*methods].each { |m| option name, for: m, **kwargs }
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
shared_option :only_concepts,
|
|
55
|
-
desc: "Only process concepts with IEVREF matching this argument, " \
|
|
56
|
-
"'%' and '_' wildcards are supported and have meaning as in SQL " \
|
|
57
|
-
"LIKE operator",
|
|
58
|
-
methods: %i[xlsx2yaml db2yaml]
|
|
59
|
-
|
|
60
|
-
shared_option :only_languages,
|
|
61
|
-
desc: "Only export these languages, skip concepts which aren't " \
|
|
62
|
-
"translated to any of them (comma-separated list, language " \
|
|
63
|
-
"codes must be as in spreadsheet)",
|
|
64
|
-
methods: %i[xlsx2yaml db2yaml]
|
|
65
|
-
|
|
66
|
-
shared_option :output,
|
|
67
|
-
desc: "Output directory",
|
|
68
|
-
aliases: :o,
|
|
69
|
-
default: Dir.pwd,
|
|
70
|
-
methods: %i[xlsx2yaml db2yaml]
|
|
71
|
-
|
|
72
|
-
shared_option :output,
|
|
73
|
-
desc: "Output file",
|
|
74
|
-
aliases: :o,
|
|
75
|
-
default: File.join(Dir.pwd, "concepts.sqlite3"),
|
|
76
|
-
methods: :xlsx2db
|
|
77
|
-
|
|
78
|
-
shared_option :progress,
|
|
79
|
-
type: :boolean,
|
|
80
|
-
desc: "Enables or disables progress indicator. By default disabled " \
|
|
81
|
-
"when 'CI' environment variable is set and enabled otherwise",
|
|
82
|
-
methods: %i[xlsx2yaml xlsx2db db2yaml]
|
|
83
|
-
|
|
84
|
-
shared_option :debug_term_attributes,
|
|
85
|
-
desc: "Enables debug messages about term attributes recognition",
|
|
86
|
-
type: :boolean,
|
|
87
|
-
default: false,
|
|
88
|
-
methods: %i[xlsx2yaml db2yaml]
|
|
89
|
-
|
|
90
|
-
shared_option :debug_sources,
|
|
91
|
-
desc: "Enables debug messages about authoritative sources recognition",
|
|
92
|
-
type: :boolean,
|
|
93
|
-
default: false,
|
|
94
|
-
methods: %i[xlsx2yaml db2yaml]
|
|
95
|
-
|
|
96
|
-
shared_option :debug_relaton,
|
|
97
|
-
desc: "Enables debug messages about Relaton integration",
|
|
98
|
-
type: :boolean,
|
|
99
|
-
default: false,
|
|
100
|
-
methods: %i[xlsx2yaml db2yaml]
|
|
101
|
-
|
|
102
|
-
shared_option :profile,
|
|
103
|
-
desc: "Generates profiler reports for this program, requires ruby-prof",
|
|
104
|
-
type: :boolean,
|
|
105
|
-
default: false,
|
|
106
|
-
methods: %i[xlsx2yaml xlsx2db db2yaml]
|
|
107
150
|
end
|
|
108
151
|
end
|
|
109
152
|
end
|
|
@@ -10,17 +10,6 @@ module Iev
|
|
|
10
10
|
|
|
11
11
|
protected
|
|
12
12
|
|
|
13
|
-
def save_collection_to_files(collection, output_dir)
|
|
14
|
-
Profiler.measure("writing-yamls") do
|
|
15
|
-
info "Writing concepts to files..."
|
|
16
|
-
path = File.expand_path("./concepts", output_dir)
|
|
17
|
-
FileUtils.mkdir_p(path)
|
|
18
|
-
collection.save_to_files(path)
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
# NOTE: Implementation examples here:
|
|
23
|
-
# https://www.rubydoc.info/github/luislavena/sqlite3-ruby/SQLite3/Backup
|
|
24
13
|
def save_db_to_file(src_db, dbfile)
|
|
25
14
|
info "Saving database to a file..."
|
|
26
15
|
src_db.synchronize do |src_conn|
|
|
@@ -35,13 +24,6 @@ module Iev
|
|
|
35
24
|
info "Done!"
|
|
36
25
|
end
|
|
37
26
|
|
|
38
|
-
def collection_file_path(file, output_dir)
|
|
39
|
-
output_dir.join(Pathname.new(file).basename.sub_ext(".yaml"))
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
# Handles various generic options, e.g. detailed debug switches.
|
|
43
|
-
# Assigns some global variables accordingly, so these settings are
|
|
44
|
-
# available throughout the program.
|
|
45
27
|
def handle_generic_options(options)
|
|
46
28
|
$IEV_PROFILE = options[:profile]
|
|
47
29
|
$IEV_PROGRESS = options.fetch(:progress, !ENV["CI"])
|
|
@@ -54,31 +36,68 @@ module Iev
|
|
|
54
36
|
end
|
|
55
37
|
end
|
|
56
38
|
|
|
57
|
-
def
|
|
58
|
-
|
|
39
|
+
def build_concept_from_raw(code, raw)
|
|
40
|
+
concept = Glossarist::ManagedConcept.of_yaml(
|
|
41
|
+
"id" => code,
|
|
42
|
+
"data" => { "id" => code },
|
|
43
|
+
)
|
|
59
44
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
45
|
+
localized = extract_localized(raw)
|
|
46
|
+
localized.each do |lang, entry|
|
|
47
|
+
l10n = build_localized_concept(code, lang, entry)
|
|
48
|
+
concept.add_l10n(l10n)
|
|
63
49
|
end
|
|
64
50
|
|
|
65
|
-
|
|
51
|
+
concept
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def extract_localized(raw)
|
|
55
|
+
# Scraper format: raw["data"]["localized_concepts"] => {lang => {term, definition}}
|
|
56
|
+
data = raw["data"]
|
|
57
|
+
if data && data["localized_concepts"]
|
|
58
|
+
return data["localized_concepts"]
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# DataSource format: raw itself, keys are lang codes
|
|
62
|
+
raw.each_with_object({}) do |(k, v), h|
|
|
63
|
+
h[k] = v if v.is_a?(Hash) && v["terms"]
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def build_localized_concept(code, lang, entry)
|
|
68
|
+
terms = if entry["terms"]
|
|
69
|
+
entry["terms"].map { |t| Glossarist::Designation::Expression.new(**t.transform_keys(&:to_sym)) }
|
|
70
|
+
else
|
|
71
|
+
[Glossarist::Designation::Expression.new(
|
|
72
|
+
designation: entry["term"],
|
|
73
|
+
normative_status: "preferred",
|
|
74
|
+
)]
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
cd = Glossarist::ConceptData.new
|
|
78
|
+
cd.id = code
|
|
79
|
+
cd.language_code = lang
|
|
80
|
+
cd.terms = terms
|
|
81
|
+
|
|
82
|
+
definition = entry["definition"]
|
|
83
|
+
if definition
|
|
84
|
+
content = definition.is_a?(String) ? definition : definition
|
|
85
|
+
cd.definition = [Glossarist::DetailedDefinition.new(content: content)]
|
|
86
|
+
end
|
|
66
87
|
|
|
67
|
-
|
|
88
|
+
l10n = Glossarist::LocalizedConcept.new
|
|
89
|
+
l10n.data = cd
|
|
90
|
+
l10n.id = code
|
|
91
|
+
l10n
|
|
68
92
|
end
|
|
69
93
|
|
|
70
|
-
def
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
next unless term
|
|
76
|
-
|
|
77
|
-
concept = concept_collection.fetch_or_initialize(term.id)
|
|
78
|
-
concept.add_l10n(term)
|
|
79
|
-
end
|
|
80
|
-
end
|
|
94
|
+
def print_concept_grouped_yaml(concept)
|
|
95
|
+
content = []
|
|
96
|
+
content << concept.to_yaml
|
|
97
|
+
concept.localized_concepts.each_key do |lang|
|
|
98
|
+
content << concept.localization(lang).to_yaml
|
|
81
99
|
end
|
|
100
|
+
puts content.join("\n")
|
|
82
101
|
end
|
|
83
102
|
end
|
|
84
103
|
end
|
data/lib/iev/config.rb
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "tmpdir"
|
|
4
|
+
|
|
5
|
+
module Iev
|
|
6
|
+
class Config
|
|
7
|
+
DEFAULT_REMOTE_BASE_URL = "https://raw.githubusercontent.com/glossarist/glossarist-data-iev/main/concepts"
|
|
8
|
+
|
|
9
|
+
attr_accessor :data_path, :cache_dir, :remote_base_url
|
|
10
|
+
|
|
11
|
+
def initialize
|
|
12
|
+
@data_path = ENV["IEV_DATA_PATH"]
|
|
13
|
+
@cache_dir = ENV["IEV_CACHE_DIR"] || File.join(Dir.tmpdir, "iev-cache")
|
|
14
|
+
@remote_base_url = DEFAULT_REMOTE_BASE_URL
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
class << self
|
|
19
|
+
def config
|
|
20
|
+
@config ||= Config.new
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def configure
|
|
24
|
+
yield(config) if block_given?
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def reset_config!
|
|
28
|
+
@config = nil
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|