unicodedata_rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ # Format of UnicodeData.txt: https://www.unicode.org/L2/L1999/UnicodeData.html
2
+
3
module UnicodedataRb
  # Names of the semicolon-separated fields of one UnicodeData.txt record, in
  # the order they appear on a line. They are also the members of the
  # Codepoint struct. Frozen so callers cannot alter the record layout.
  CODEPOINT_FIELDS = %i[
    codepoint name category combining_class bidi_class
    decomposition digit_value non_decimal_digit_value
    numeric_value bidi_mirrored unicode1_name iso_comment
    simple_uppercase_map simple_lowercase_map simple_titlecase_map
  ].freeze

  # Fields that are converted to Rational when a Codepoint is built.
  NUMERIC_FIELDS = %i[digit_value non_decimal_digit_value numeric_value].freeze

  # One parsed UnicodeData.txt record.
  #
  # Fields are given positionally as the raw strings taken from the file:
  # * +codepoint+ is parsed as hexadecimal and stored as an Integer.
  # * every field in NUMERIC_FIELDS is converted with +to_r+, so an empty or
  #   missing value becomes +0r+ and a value such as "1/2" becomes +(1/2)+.
  # * all remaining fields are stored exactly as passed in.
  class Codepoint < Struct.new(*UnicodedataRb::CODEPOINT_FIELDS)
    # @param args [Array<String>] raw field values in CODEPOINT_FIELDS order
    def initialize(*args)
      super
      self.codepoint = codepoint.to_i(16)
      NUMERIC_FIELDS.each { |field| self[field] = self[field].to_r }
    end

    # Builds a Codepoint from one line of UnicodeData.txt.
    #
    # String#split without a limit drops trailing empty fields, so string
    # fields left empty at the end of the line come back as +nil+ rather
    # than as "".
    #
    # @param line [String] one record, with or without its line terminator
    # @return [Codepoint]
    def self.from_line(line)
      new(*line.chomp.split(";"))
    end
  end
end
@@ -0,0 +1,6 @@
1
module UnicodedataRb
  # Locations of the data files that sit in the same directory as this
  # source file. The strings are frozen so that no caller can mutate a path
  # that the rest of the library reads.
  class Constants
    # Path of the UnicodeData.txt file.
    UNICODEDATA_TXT_PATH = File.join(File.dirname(__FILE__), "UnicodeData.txt").freeze
    # Path of the Marshal-serialized index file.
    UNICODEDATA_INDEX_PATH = File.join(File.dirname(__FILE__), "UnicodeData.index").freeze
  end
end
@@ -0,0 +1,65 @@
1
+ # Download data and build indices
2
+ # Inspired by https://github.com/runpaint/unicode-data
3
+ require "logger"
4
+ require "net/http"
5
+ require_relative "codepoint"
6
+ require_relative "constants"
7
+
8
+
9
module UnicodedataRb
  # Downloads UnicodeData.txt and writes a Marshal-serialized index that maps
  # each codepoint and each name to the byte offset of its line in that file.
  #
  # Instances cannot be created directly (+new+ is private); use
  # GenerateIndex.call.
  class GenerateIndex
    # Builds an instance from the given arguments and runs it.
    def self.call(...)
      new(...).call
    end

    attr_reader :logger

    # @param logger [Logger] receives progress messages
    def initialize(logger: Logger.new(STDOUT))
      @logger = logger
    end
    private_class_method :new

    # Downloads UnicodeData.txt to UNICODEDATA_TXT_PATH, indexes it, and
    # writes the index to UNICODEDATA_INDEX_PATH.
    def call
      download_file("#{unicodedata_url_prefix}UnicodeData.txt", UnicodedataRb::Constants::UNICODEDATA_TXT_PATH)

      # Format of UnicodeData.txt: https://www.unicode.org/L2/L1999/UnicodeData.html
      index = File.open(UnicodedataRb::Constants::UNICODEDATA_TXT_PATH) { |txt| build_index(txt) }
      File.open(UnicodedataRb::Constants::UNICODEDATA_INDEX_PATH, "wb") { |out| Marshal.dump(index, out) }
    end

    # Reads every line of +io+ and records the byte offset at which it starts.
    #
    # @param io [#each_line, #pos] stream positioned at the first record
    # @return [Hash] +{ codepoint: { Integer => offset }, name: { String => offset } }+;
    #   when several lines share a name, the last one read wins
    def build_index(io)
      codepoint_index = {}
      name_index = {}

      io.each_line do |line|
        # io.pos is a byte position, so the line length must be measured in
        # bytes as well; String#size counts characters and would give a wrong
        # offset for any line containing multi-byte characters.
        start_line_pos = io.pos - line.bytesize
        codepoint = UnicodedataRb::Codepoint.from_line(line)
        codepoint_index[codepoint.codepoint] = start_line_pos
        name_index[codepoint.name] = start_line_pos
      end

      { codepoint: codepoint_index, name: name_index }
    end

    # Streams the body of +url+ into +save_path+.
    #
    # The connection always uses TLS; only the host of +url+ is passed to
    # Net::HTTP.start, so the port is Net::HTTP's default for TLS.
    #
    # @param url [String]
    # @param save_path [String] file to (over)write
    # @raise [Net::HTTPExceptions] via Net::HTTPResponse#value when the
    #   response status is not 2xx
    def download_file(url, save_path)
      logger.info("Downloading #{url}")
      uri = URI(url)

      Net::HTTP.start(uri.host, use_ssl: true) do |http|
        http.request(Net::HTTP::Get.new(uri)) do |response|
          # Fail before touching the file so an error page never replaces
          # previously downloaded data.
          response.value

          File.open(save_path, "w:UTF-8") do |io|
            response.read_body { |chunk| io.write(chunk) }
          end
        end
      end
    end

    # Base URL of the Unicode Character Database directory for the Unicode
    # version reported by RbConfig for the running Ruby.
    def unicodedata_url_prefix
      @unicodedata_url_prefix ||= "https://unicode.org/Public/#{RbConfig::CONFIG["UNICODE_VERSION"]}/ucd/"
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Top-level namespace of the unicodedata_rb gem.
module UnicodedataRb
  # Release version of the gem as a "major.minor.patch" string.
  VERSION = "0.1.0"
end
@@ -0,0 +1,47 @@
1
+ require_relative "unicodedata_rb/codepoint"
2
+ require_relative "unicodedata_rb/generate_index"
3
+ require_relative "unicodedata_rb/constants"
4
+ require_relative "unicodedata_rb/version"
5
+
6
module UnicodedataRb
  # Looks up the UnicodeData.txt record of a codepoint.
  #
  # @param num [Integer] codepoint number
  # @return [UnicodedataRb::Codepoint]
  # @raise [ArgumentError] if the index has no entry for +num+
  def self.codepoint(num)
    UnicodedataRb::Codepoint.from_line(_unicodedata_txt_line_from(codepoint: num))
  end

  # Looks up the record of the codepoint returned by +c.ord+.
  #
  # @param c [String] a character
  # @return [UnicodedataRb::Codepoint]
  # @raise [ArgumentError] if the index has no entry for that codepoint
  def self.codepoint_from_char(c)
    UnicodedataRb::Codepoint.from_line(_unicodedata_txt_line_from(codepoint: c.ord))
  end

  # Looks up a record by the exact name stored in the index.
  #
  # @param name [String]
  # @return [UnicodedataRb::Codepoint]
  # @raise [ArgumentError] if the index has no entry for +name+
  def self.codepoint_from_name(name)
    UnicodedataRb::Codepoint.from_line(_unicodedata_txt_line_from(name: name))
  end

  # Returns the raw UnicodeData.txt line (without its line terminator) for
  # either a codepoint or a name.
  #
  # @param codepoint [Integer, nil]
  # @param name [String, nil]
  # @return [String]
  # @raise [ArgumentError] unless exactly one of +codepoint+ / +name+ is
  #   given, or when the index has no entry for the given key
  def self._unicodedata_txt_line_from(codepoint: nil, name: nil)
    unless codepoint.nil? ^ name.nil?
      raise ArgumentError, "provide exactly one of codepoint: or name:"
    end

    kind, key = codepoint.nil? ? [:name, name] : [:codepoint, codepoint]
    offset = _unicodedata_index[kind][key]
    raise ArgumentError, "no UnicodeData.txt entry for #{kind} #{key.inspect}" if offset.nil?

    # seek is absolute by default, so no rewind is needed beforehand.
    # NOTE: the handle is shared by every caller and is not synchronized.
    file = _unicodedata_txt_file
    file.seek(offset)
    file.readline.chomp
  end

  # Memoized read handle on UnicodeData.txt; opened on first use and kept
  # open for later lookups.
  def self._unicodedata_txt_file
    @_unicodedata_txt_file ||= File.open(UnicodedataRb::Constants::UNICODEDATA_TXT_PATH)
  end

  # Memoized index loaded from UNICODEDATA_INDEX_PATH.
  #
  # NOTE(review): Marshal.load must only ever be given trusted data; this
  # reads the index file located next to the library sources.
  def self._unicodedata_index
    @_unicodedata_index ||= Marshal.load(File.binread(UnicodedataRb::Constants::UNICODEDATA_INDEX_PATH))
  end

  # Downloads UnicodeData.txt and rebuilds the index file.
  def self.generate_index
    UnicodedataRb::GenerateIndex.call
  end
end
47
+
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/unicodedata_rb/version"
4
+
5
# Gem specification for unicodedata_rb.
Gem::Specification.new do |gem|
  gem.name = "unicodedata_rb"
  gem.version = UnicodedataRb::VERSION
  gem.authors = ["bubiche"]
  gem.email = ["bubiche95@gmail.com"]

  gem.summary = "Ruby wrapper for unicode data."
  gem.homepage = "https://github.com/bubiche/unicodedata_rb"
  gem.license = "MIT"
  gem.required_ruby_version = ">= 3.2.0"

  gem.metadata["homepage_uri"] = gem.homepage
  gem.metadata["source_code_uri"] = gem.homepage
  # NOTE(review): GitHub links to a file usually contain /blob/<branch>/ —
  # confirm that this URL resolves.
  gem.metadata["changelog_uri"] = "https://github.com/bubiche/unicodedata_rb/CHANGELOG.md"

  # Package every file tracked by git (listed NUL-separated by
  # `git ls-files -z`), except this gemspec itself and anything whose path
  # starts with one of the prefixes below.
  gem.files = Dir.chdir(__dir__) do
    tracked_paths = `git ls-files -z`.split("\x0")
    tracked_paths.reject do |path|
      next true if File.expand_path(path) == __FILE__

      path.start_with?(*%w[bin/ test/ spec/ features/ .git .circleci appveyor])
    end
  end
  gem.bindir = "exe"
  # Executables are the base names of the packaged files under exe/.
  gem.executables = gem.files.grep(%r{\Aexe/}).map { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  gem.add_development_dependency "rake", "~> 13.0"
  gem.add_development_dependency "rspec", "~> 3.0"
end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicodedata_rb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - bubiche
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-06-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '13.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '13.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.0'
41
+ description:
42
+ email:
43
+ - bubiche95@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".rspec"
49
+ - CHANGELOG.md
50
+ - Gemfile
51
+ - Gemfile.lock
52
+ - LICENSE.txt
53
+ - README.md
54
+ - Rakefile
55
+ - lib/unicodedata_rb.rb
56
+ - lib/unicodedata_rb/UnicodeData.index
57
+ - lib/unicodedata_rb/UnicodeData.txt
58
+ - lib/unicodedata_rb/codepoint.rb
59
+ - lib/unicodedata_rb/constants.rb
60
+ - lib/unicodedata_rb/generate_index.rb
61
+ - lib/unicodedata_rb/version.rb
62
+ - unicodedata_rb.gemspec
63
+ homepage: https://github.com/bubiche/unicodedata_rb
64
+ licenses:
65
+ - MIT
66
+ metadata:
67
+ homepage_uri: https://github.com/bubiche/unicodedata_rb
68
+ source_code_uri: https://github.com/bubiche/unicodedata_rb
69
+ changelog_uri: https://github.com/bubiche/unicodedata_rb/CHANGELOG.md
70
+ post_install_message:
71
+ rdoc_options: []
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: 3.2.0
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ requirements: []
85
+ rubygems_version: 3.4.10
86
+ signing_key:
87
+ specification_version: 4
88
+ summary: Ruby wrapper for unicode data.
89
+ test_files: []