iev 0.3.0 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +3 -20
- data/.github/workflows/release.yml +25 -0
- data/.gitignore +4 -0
- data/.rubocop.yml +0 -2
- data/README.adoc +4 -4
- data/exe/iev-glossarist +21 -0
- data/iev.gemspec +12 -3
- data/lib/iev/cli/command.rb +109 -0
- data/lib/iev/cli/command_helper.rb +83 -0
- data/lib/iev/cli/ui.rb +70 -0
- data/lib/iev/cli.rb +22 -0
- data/lib/iev/converter/mathml_to_asciimath.rb +197 -0
- data/lib/iev/converter.rb +9 -0
- data/lib/iev/data_conversions.rb +39 -0
- data/lib/iev/db.rb +3 -3
- data/lib/iev/db_cache.rb +2 -2
- data/lib/iev/db_writer.rb +81 -0
- data/lib/iev/iso_639_2.yaml +4075 -0
- data/lib/iev/iso_639_code.rb +47 -0
- data/lib/iev/profiler.rb +69 -0
- data/lib/iev/relaton_db.rb +63 -0
- data/lib/iev/source_parser.rb +350 -0
- data/lib/iev/supersession_parser.rb +70 -0
- data/lib/iev/term_attrs_parser.rb +143 -0
- data/lib/iev/term_builder.rb +313 -0
- data/lib/iev/utilities.rb +58 -0
- data/lib/iev/version.rb +2 -2
- data/lib/iev.rb +24 -2
- metadata +153 -10
data/lib/iev/data_conversions.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# (c) Copyright 2020 Ribose Inc.
|
4
|
+
#
|
5
|
+
|
6
|
+
module IEV
|
7
|
+
module DataConversions
|
8
|
+
refine String do
|
9
|
+
def decode_html!
|
10
|
+
replace(decode_html)
|
11
|
+
nil
|
12
|
+
end
|
13
|
+
|
14
|
+
def decode_html
|
15
|
+
HTMLEntities.new(:expanded).decode(self)
|
16
|
+
end
|
17
|
+
|
18
|
+
# Normalize various encoding anomalies like `\uFEFF` in strings
|
19
|
+
def sanitize!
|
20
|
+
unicode_normalize!
|
21
|
+
gsub!("\uFEFF", "")
|
22
|
+
gsub!("\u2011", "-")
|
23
|
+
gsub!("\u00a0", " ")
|
24
|
+
gsub!(/[\u2000-\u2006]/, " ")
|
25
|
+
strip!
|
26
|
+
nil
|
27
|
+
end
|
28
|
+
|
29
|
+
# @see sanitize!
|
30
|
+
def sanitize
|
31
|
+
dup.tap(&:sanitize!)
|
32
|
+
end
|
33
|
+
|
34
|
+
def to_three_char_code
|
35
|
+
IEV::Iso639Code.three_char_code(self).first
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/iev/db.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# require 'pstore'
|
2
2
|
require_relative "db_cache"
|
3
3
|
|
4
|
-
module Iev
|
4
|
+
module IEV
|
5
5
|
# Cache class.
|
6
6
|
class Db
|
7
7
|
# @param global_cache [String] filename of global DB
|
@@ -43,12 +43,12 @@ module Iev
|
|
43
43
|
|
44
44
|
# @return [Hash]
|
45
45
|
def new_bib_entry(code, lang)
|
46
|
-
Iev.get(code, lang)
|
46
|
+
IEV.get(code, lang)
|
47
47
|
end
|
48
48
|
|
49
49
|
# @param dir [String] DB dir
|
50
50
|
# @param global [TrueClass, FalseClass]
|
51
|
-
# @return [Iev::DbCache, nil]
|
51
|
+
# @return [IEV::DbCache, nil]
|
52
52
|
def open_cache_biblio(dir, global: true)
|
53
53
|
return nil if dir.nil?
|
54
54
|
|
data/lib/iev/db_cache.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require "fileutils"
|
2
2
|
|
3
|
-
module Iev
|
3
|
+
module IEV
|
4
4
|
class DbCache
|
5
5
|
# @return [String]
|
6
6
|
attr_reader :dir
|
@@ -73,7 +73,7 @@ module Iev
|
|
73
73
|
end
|
74
74
|
|
75
75
|
# Set version of the DB to the gem version.
|
76
|
-
# @return [Iev::DbCache]
|
76
|
+
# @return [IEV::DbCache]
|
77
77
|
def set_version
|
78
78
|
File.write "#{@dir}/version", VERSION, encoding: "utf-8"
|
79
79
|
self
|
data/lib/iev/db_writer.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# (c) Copyright 2020 Ribose Inc.
|
4
|
+
#
|
5
|
+
|
6
|
+
module IEV
|
7
|
+
class DbWriter
|
8
|
+
include CLI::UI
|
9
|
+
using DataConversions
|
10
|
+
|
11
|
+
attr_reader :db
|
12
|
+
|
13
|
+
def initialize(db)
|
14
|
+
@db = db
|
15
|
+
end
|
16
|
+
|
17
|
+
def import_spreadsheet(file)
|
18
|
+
Profiler.measure("xlsx-import") do
|
19
|
+
workbook = open_workbook(file)
|
20
|
+
row_enumerator = workbook.sheets.first.simple_rows.each
|
21
|
+
|
22
|
+
title_row = row_enumerator.next
|
23
|
+
symbolized_title_row = title_row.compact.transform_values(&:to_sym)
|
24
|
+
|
25
|
+
create_table(symbolized_title_row.values)
|
26
|
+
|
27
|
+
loop do
|
28
|
+
row = row_enumerator.next
|
29
|
+
next if row.empty?
|
30
|
+
data = prepare_data(row, symbolized_title_row)
|
31
|
+
display_progress(data)
|
32
|
+
insert_data(data)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def open_workbook(file)
|
40
|
+
info "Opening spreadsheet..."
|
41
|
+
Creek::Book.new(file)
|
42
|
+
end
|
43
|
+
|
44
|
+
# Creates a database table which is going to be filled with data extracted
|
45
|
+
# from the spreadsheet.
|
46
|
+
#
|
47
|
+
# Note that columns are defined as +VARCHAR(255)+, but they can store
|
48
|
+
# strings of any length without truncating, see:
|
49
|
+
# https://www.sqlite.org/faq.html#q9
|
50
|
+
def create_table(column_names)
|
51
|
+
db.create_table!(:concepts) do
|
52
|
+
column_names.each { |cn| column cn, String }
|
53
|
+
primary_key column_names[0..1], name: :iev_pk
|
54
|
+
index column_names[0]
|
55
|
+
index column_names[1]
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# Replaces A, B, C… keys with real column names and sanitizes cell
|
60
|
+
# content.
|
61
|
+
def prepare_data(row, title_row)
|
62
|
+
data = row.dup
|
63
|
+
data.transform_keys! { |k| title_row[k] }
|
64
|
+
data.transform_values! { |v| v&.sanitize }
|
65
|
+
data
|
66
|
+
end
|
67
|
+
|
68
|
+
def display_progress(data)
|
69
|
+
ievref = data[:IEVREF]
|
70
|
+
lang = data[:LANGUAGE].to_three_char_code
|
71
|
+
set_ui_tag "#{ievref} (#{lang})"
|
72
|
+
progress "Importing term #{ievref} (#{lang})..."
|
73
|
+
end
|
74
|
+
|
75
|
+
def insert_data(data)
|
76
|
+
db[:concepts].insert(data)
|
77
|
+
rescue Sequel::UniqueConstraintViolation
|
78
|
+
warn "Duplicated (TERMID, LANGUAGE) pair, skipping"
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|