unicodedata_rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +34 -0
- data/LICENSE.txt +21 -0
- data/README.md +36 -0
- data/Rakefile +8 -0
- data/lib/unicodedata_rb/UnicodeData.index +0 -0
- data/lib/unicodedata_rb/UnicodeData.txt +34924 -0
- data/lib/unicodedata_rb/codepoint.rb +24 -0
- data/lib/unicodedata_rb/constants.rb +6 -0
- data/lib/unicodedata_rb/generate_index.rb +65 -0
- data/lib/unicodedata_rb/version.rb +5 -0
- data/lib/unicodedata_rb.rb +47 -0
- data/unicodedata_rb.gemspec +33 -0
- metadata +89 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
# Format of UnicodeData.txt: https://www.unicode.org/L2/L1999/UnicodeData.html
|
2
|
+
|
3
|
+
module UnicodedataRb
  # Names of the 15 semicolon-separated fields of a UnicodeData.txt record,
  # in the order they appear on each line.
  CODEPOINT_FIELDS = %i[
    codepoint name category combining_class bidi_class
    decomposition digit_value non_decimal_digit_value
    numeric_value bidi_mirrored unicode1_name iso_comment
    simple_uppercase_map simple_lowercase_map simple_titlecase_map
  ].freeze

  # Fields that are converted to Rational when a Codepoint is built.
  NUMERIC_FIELDS = %i[digit_value non_decimal_digit_value numeric_value].freeze

  # One parsed record of UnicodeData.txt.
  #
  # All members are the raw strings from the record, except:
  # * +codepoint+ is parsed from hexadecimal into an Integer;
  # * every member listed in NUMERIC_FIELDS is converted with +to_r+.
  #
  # NOTE(review): +to_r+ turns an empty (or missing) numeric field into (0/1),
  # so "no numeric value" cannot be told apart from an explicit zero. Kept
  # as-is to preserve the existing return types.
  class Codepoint < Struct.new(*UnicodedataRb::CODEPOINT_FIELDS)
    # @param args [Array<String>] field values in CODEPOINT_FIELDS order
    def initialize(*args)
      super
      self.codepoint = codepoint.to_i(16)
      NUMERIC_FIELDS.each { |field| self[field] = self[field].to_r }
    end

    # Builds a Codepoint from one raw line of UnicodeData.txt.
    #
    # @param line [String] a record, with or without its trailing newline
    # @return [Codepoint]
    def self.from_line(line)
      # A negative limit keeps trailing empty fields. Without it, String#split
      # drops them, so the last members would be nil on some records and ""
      # on others depending on where the final non-empty field sits.
      new(*line.chomp.split(";", -1))
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Download data and build indices
|
2
|
+
# Inspired by https://github.com/runpaint/unicode-data
|
3
|
+
require "logger"
|
4
|
+
require "net/http"
|
5
|
+
require_relative "codepoint"
|
6
|
+
require_relative "constants"
|
7
|
+
|
8
|
+
|
9
|
+
module UnicodedataRb
  # Downloads UnicodeData.txt for the running Ruby's Unicode version and
  # writes a Marshal-dumped index that maps each codepoint and each name to
  # the byte offset of its line in that file.
  #
  # Instances cannot be created directly; use GenerateIndex.call.
  class GenerateIndex
    # Builds an instance with the given arguments and runs it.
    def self.call(...)
      new(...).call
    end

    attr_reader :logger

    # @param logger [Logger] receives progress messages (defaults to STDOUT)
    def initialize(logger: Logger.new(STDOUT))
      @logger = logger
    end
    private_class_method :new

    # Downloads the data file, scans it line by line and writes the index.
    def call
      download_file("#{unicodedata_url_prefix}UnicodeData.txt", UnicodedataRb::Constants::UNICODEDATA_TXT_PATH)

      codepoint_index = {}
      name_index = {}

      # Format of UnicodeData.txt: https://www.unicode.org/L2/L1999/UnicodeData.html
      File.open(UnicodedataRb::Constants::UNICODEDATA_TXT_PATH) do |data_file|
        data_file.each_line do |line|
          # IO#pos is a byte offset, so subtract the line's byte length
          # (not its character count) to find where the line started.
          line_start = data_file.pos - line.bytesize
          record = UnicodedataRb::Codepoint.from_line(line)
          codepoint_index[record.codepoint] = line_start
          # If several records share a name, the last one read wins.
          name_index[record.name] = line_start
        end
      end

      index = { codepoint: codepoint_index, name: name_index }
      File.open(UnicodedataRb::Constants::UNICODEDATA_INDEX_PATH, "wb") do |index_file|
        Marshal.dump(index, index_file)
      end
    end

    # Streams the body of +url+ into +save_path+ over HTTPS.
    #
    # @param url [String] resource to fetch
    # @param save_path [String] file to (over)write with the response body
    # @raise [Net::HTTPExceptions] when the response status is not 2xx
    def download_file(url, save_path)
      logger.info("Downloading #{url}")
      uri = URI(url)

      Net::HTTP.start(uri.host, :use_ssl => true) do |http|
        http.request(Net::HTTP::Get.new(uri)) do |response|
          # Fail before opening the target so an error page (404, redirect,
          # ...) never truncates or replaces an existing data file.
          response.value

          # File.open rather than Kernel#open: the argument is always a path.
          File.open(save_path, "w:UTF-8") do |io|
            response.read_body { |chunk| io.write(chunk) }
          end
        end
      end
    end

    # Base URL of the Unicode Character Database matching this Ruby's
    # UNICODE_VERSION, memoized per instance.
    def unicodedata_url_prefix
      @unicodedata_url_prefix ||= "https://unicode.org/Public/#{RbConfig::CONFIG["UNICODE_VERSION"]}/ucd/"
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require_relative "unicodedata_rb/codepoint"
|
2
|
+
require_relative "unicodedata_rb/generate_index"
|
3
|
+
require_relative "unicodedata_rb/constants"
|
4
|
+
require_relative "unicodedata_rb/version"
|
5
|
+
|
6
|
+
# Lookup API over the bundled UnicodeData.txt and its offset index.
module UnicodedataRb
  # @param num [Integer] codepoint value
  # @return [UnicodedataRb::Codepoint]
  # @raise [ArgumentError] when the index has no entry for +num+
  def self.codepoint(num)
    UnicodedataRb::Codepoint.from_line(_unicodedata_txt_line_from(codepoint: num))
  end

  # @param c [String] character whose +ord+ is looked up
  # @return [UnicodedataRb::Codepoint]
  # @raise [ArgumentError] when the index has no entry for the character
  def self.codepoint_from_char(c)
    UnicodedataRb::Codepoint.from_line(_unicodedata_txt_line_from(codepoint: c.ord))
  end

  # @param name [String] character name exactly as stored in the index
  # @return [UnicodedataRb::Codepoint]
  # @raise [ArgumentError] when the index has no entry for +name+
  def self.codepoint_from_name(name)
    UnicodedataRb::Codepoint.from_line(_unicodedata_txt_line_from(name:))
  end

  # Returns the raw UnicodeData.txt line (without newline) for exactly one of
  # +codepoint+ or +name+.
  #
  # @raise [ArgumentError] when neither or both keys are given, or when the
  #   key is not present in the index
  def self._unicodedata_txt_line_from(codepoint: nil, name: nil)
    # Both nil or both present: the lookup key is ambiguous either way.
    if codepoint.nil? == name.nil?
      raise ArgumentError, "provide exactly one of codepoint: or name:"
    end

    offset =
      if codepoint.nil?
        _unicodedata_index[:name][name]
      else
        _unicodedata_index[:codepoint][codepoint]
      end
    raise ArgumentError, "no UnicodeData.txt entry for #{(codepoint || name).inspect}" if offset.nil?

    # seek is absolute by default, so no rewind is needed first.
    data_file = _unicodedata_txt_file
    data_file.seek(offset)
    data_file.readline.chomp
  end

  # Lazily opened handle on UnicodeData.txt, shared by all lookups.
  # Memoized in a module-level instance variable rather than a class variable.
  def self._unicodedata_txt_file
    @_unicodedata_txt_file ||= File.open(UnicodedataRb::Constants::UNICODEDATA_TXT_PATH)
  end

  # Lazily loaded index: { codepoint: { Integer => offset }, name: { String => offset } }.
  # NOTE(review): Marshal.load is only safe because the index path is expected
  # to be the file shipped with / generated by this gem - confirm.
  def self._unicodedata_index
    @_unicodedata_index ||= Marshal.load(File.binread(UnicodedataRb::Constants::UNICODEDATA_INDEX_PATH))
  end

  # Re-downloads UnicodeData.txt and rebuilds the index.
  def self.generate_index
    UnicodedataRb::GenerateIndex.call
  end
end
|
47
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/unicodedata_rb/version"
|
4
|
+
|
5
|
+
# Gem specification for unicodedata_rb.
Gem::Specification.new do |spec|
  spec.name = "unicodedata_rb"
  spec.version = UnicodedataRb::VERSION
  spec.authors = ["bubiche"]
  spec.email = ["bubiche95@gmail.com"]

  spec.summary = "Ruby wrapper for unicode data."
  spec.homepage = "https://github.com/bubiche/unicodedata_rb"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.2.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  # Files in a GitHub repository live under /blob/<ref>/; HEAD follows the
  # default branch, so the link does not depend on the branch name.
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/HEAD/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (File.expand_path(f) == __FILE__) || f.start_with?(*%w[bin/ test/ spec/ features/ .git .circleci appveyor])
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "rspec", "~> 3.0"
end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: unicodedata_rb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- bubiche
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-06-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '13.0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '13.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.0'
|
41
|
+
description:
|
42
|
+
email:
|
43
|
+
- bubiche95@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- ".rspec"
|
49
|
+
- CHANGELOG.md
|
50
|
+
- Gemfile
|
51
|
+
- Gemfile.lock
|
52
|
+
- LICENSE.txt
|
53
|
+
- README.md
|
54
|
+
- Rakefile
|
55
|
+
- lib/unicodedata_rb.rb
|
56
|
+
- lib/unicodedata_rb/UnicodeData.index
|
57
|
+
- lib/unicodedata_rb/UnicodeData.txt
|
58
|
+
- lib/unicodedata_rb/codepoint.rb
|
59
|
+
- lib/unicodedata_rb/constants.rb
|
60
|
+
- lib/unicodedata_rb/generate_index.rb
|
61
|
+
- lib/unicodedata_rb/version.rb
|
62
|
+
- unicodedata_rb.gemspec
|
63
|
+
homepage: https://github.com/bubiche/unicodedata_rb
|
64
|
+
licenses:
|
65
|
+
- MIT
|
66
|
+
metadata:
|
67
|
+
homepage_uri: https://github.com/bubiche/unicodedata_rb
|
68
|
+
source_code_uri: https://github.com/bubiche/unicodedata_rb
|
69
|
+
changelog_uri: https://github.com/bubiche/unicodedata_rb/CHANGELOG.md
|
70
|
+
post_install_message:
|
71
|
+
rdoc_options: []
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">="
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: 3.2.0
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - ">="
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
requirements: []
|
85
|
+
rubygems_version: 3.4.10
|
86
|
+
signing_key:
|
87
|
+
specification_version: 4
|
88
|
+
summary: Ruby wrapper for unicode data.
|
89
|
+
test_files: []
|