iev 0.3.0 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) Copyright 2020 Ribose Inc.
4
+ #
5
+
6
module IEV
  # String refinements used when converting IEV data.
  #
  # Activate them in a file with +using IEV::DataConversions+ (or
  # +using DataConversions+ from inside the IEV namespace).
  module DataConversions
    refine String do
      # Replaces the receiver's content with its HTML-decoded form.
      #
      # @return [nil]
      # @see decode_html
      def decode_html!
        replace(decode_html)
        nil
      end

      # Decodes HTML entities using an +HTMLEntities+ decoder built with the
      # +:expanded+ flavor. The receiver is left untouched.
      #
      # @return [String] the decoded copy
      def decode_html
        HTMLEntities.new(:expanded).decode(self)
      end

      # Normalizes various encoding anomalies like `\uFEFF` in strings.
      #
      # Works in place, in this order: Unicode normalization, removal of
      # U+FEFF, replacement of look-alike characters with their plain ASCII
      # counterparts, and finally stripping of surrounding whitespace.
      #
      # @return [nil] always; the receiver itself is modified
      def sanitize!
        unicode_normalize!
        # The bang methods below return nil when nothing changed, so they are
        # deliberately called one per line rather than chained.
        delete!("\uFEFF")               # zero width no-break space / BOM
        tr!("\u2011", "-")              # non-breaking hyphen
        tr!("\u00A0\u2000-\u2006", " ") # no-break space, U+2000..U+2006 spaces
        strip!
        nil
      end

      # Non-destructive variant of {#sanitize!}.
      #
      # @return [String] a sanitized copy of the receiver
      # @see sanitize!
      def sanitize
        dup.tap(&:sanitize!)
      end

      # Looks the receiver up through +IEV::Iso639Code.three_char_code+ and
      # returns the first element of the result.
      #
      # @return the first entry returned by the lookup
      def to_three_char_code
        IEV::Iso639Code.three_char_code(self).first
      end
    end
  end
end
data/lib/iev/db.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  # require 'pstore'
2
2
  require_relative "db_cache"
3
3
 
4
- module Iev
4
+ module IEV
5
5
  # Cache class.
6
6
  class Db
7
7
  # @param global_cache [String] filename of global DB
@@ -43,12 +43,12 @@ module Iev
43
43
 
44
44
  # @return [Hash]
45
45
  def new_bib_entry(code, lang)
46
- Iev.get(code, lang)
46
+ IEV.get(code, lang)
47
47
  end
48
48
 
49
49
  # @param dir [String] DB dir
50
50
  # @param global [TrueClass, FalseClass]
51
- # @return [Iev::DbCache, nil]
51
+ # @return [IEV::DbCache, nil]
52
52
  def open_cache_biblio(dir, global: true)
53
53
  return nil if dir.nil?
54
54
 
data/lib/iev/db_cache.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  require "fileutils"
2
2
 
3
- module Iev
3
+ module IEV
4
4
  class DbCache
5
5
  # @return [String]
6
6
  attr_reader :dir
@@ -73,7 +73,7 @@ module Iev
73
73
  end
74
74
 
75
75
  # Set version of the DB to the gem version.
76
- # @return [Iev::DbCache]
76
+ # @return [IEV::DbCache]
77
77
  def set_version
78
78
  File.write "#{@dir}/version", VERSION, encoding: "utf-8"
79
79
  self
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) Copyright 2020 Ribose Inc.
4
+ #
5
+
6
module IEV
  # Imports the content of an IEV spreadsheet into the +concepts+ table of a
  # database.
  class DbWriter
    include CLI::UI
    using DataConversions

    attr_reader :db

    # @param db database handle used for table creation and inserts
    #   (presumably a Sequel database, given the rescued
    #   +Sequel::UniqueConstraintViolation+ — confirm against the caller)
    def initialize(db)
      @db = db
    end

    # Reads the first sheet of +file+, recreates the +concepts+ table from the
    # title row, and inserts every non-empty row that follows it.
    #
    # @param file spreadsheet handed to +Creek::Book.new+
    def import_spreadsheet(file)
      Profiler.measure("xlsx-import") do
        book = open_workbook(file)
        rows = book.sheets.first.simple_rows.each

        # The first row holds the column titles; empty title cells are dropped.
        headers = rows.next.compact.transform_values(&:to_sym)
        create_table(headers.values)

        # Kernel#loop finishes quietly once +rows.next+ raises StopIteration.
        loop do
          cells = rows.next
          next if cells.empty?

          record = prepare_data(cells, headers)
          display_progress(record)
          insert_data(record)
        end
      end
    end

    private

    # Announces the operation and opens +file+ as a Creek workbook.
    def open_workbook(file)
      info("Opening spreadsheet...")
      Creek::Book.new(file)
    end

    # Creates a database table which is going to be filled with data extracted
    # from the spreadsheet. The first two columns form the primary key and
    # each of them is indexed as well.
    #
    # Note that columns are defined as +VARCHAR(255)+, but they can store
    # strings of any length without truncating, see:
    # https://www.sqlite.org/faq.html#q9
    def create_table(column_names)
      db.create_table!(:concepts) do
        column_names.each { |name| column(name, String) }
        primary_key(column_names.first(2), name: :iev_pk)
        column_names.values_at(0, 1).each { |name| index(name) }
      end
    end

    # Replaces A, B, C… keys with real column names and sanitizes cell
    # content. Works on a copy, so +row+ itself is not modified; nil cells
    # stay nil.
    def prepare_data(row, title_row)
      row.dup.tap do |record|
        record.transform_keys! { |column_letter| title_row[column_letter] }
        record.transform_values! { |cell| cell&.sanitize }
      end
    end

    # Tags the UI with the current term reference and language code, then
    # prints a progress line for it.
    def display_progress(data)
      reference = data[:IEVREF]
      language_code = data[:LANGUAGE].to_three_char_code
      set_ui_tag("#{reference} (#{language_code})")
      progress("Importing term #{reference} (#{language_code})...")
    end

    # Inserts one prepared row; a row that violates a unique constraint is
    # reported with a warning and skipped instead of aborting the import.
    def insert_data(data)
      concepts = db[:concepts]
      concepts.insert(data)
    rescue Sequel::UniqueConstraintViolation
      warn("Duplicated (TERMID, LANGUAGE) pair, skipping")
    end
  end
end