unicoder 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
4
- data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
3
+ metadata.gz: 1b4aa3a99c2805fe8caa17f90210132a2e3a3e1df511e8a63a7cc83af0fd6e74
4
+ data.tar.gz: 1f2174a23878ac589e80fd544f374f56ac4881d6d8416c47b5e1eb0db6b5daa2
5
5
  SHA512:
6
- metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
7
- data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
6
+ metadata.gz: 4e44aa4e9d0328d27f337c1ee012ae60fc8dda50ac685db9c1c944420155e1741502f0a555fdcc821732ed2e7a3980aeb3bbe9642dded0ab14a2f9c33605bdb6
7
+ data.tar.gz: fe152b4e2966b64e5810866cf6b74c50866d6e034c9c59cb9009cbb7d3108770f4c9d3297b495e15b4b1b724c482200a93ee3fa5a31e91fc66c7cb0834c2866e
data/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.5.0
4
+
5
+ - New Emoji locations
6
+ - Update CLDR to v48
7
+ - Allow Ruby 4.0
8
+
9
+ ### 1.4.0
10
+
11
+ - Update Unicode and Emoji to 17.0
12
+ - Some files now have a new location in UCD
13
+ - Update CLDR to v46
14
+ - Update IVD to 2025-07-14
15
+
3
16
  ### 1.3.0
4
17
 
5
18
  - confusable: Add ignorables
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.3.0)
4
+ unicoder (1.5.0)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
@@ -4,15 +4,11 @@ module Unicoder
4
4
  include Builder
5
5
  include MultiDimensionalArrayBuilder
6
6
 
7
- IGNORE_CATEGORIES = %w[Cs Co Cn].freeze
8
- ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
7
+ ZERO_WIDTH_CATEGORIES = %w[Mn Me Zl Zp Cf].freeze
9
8
 
10
- ZERO_WIDTH_RANGES = [
9
+ ZERO_WIDTH_HANGUL = [
11
10
  *0x1160..0x11FF, # HANGUL JUNGSEONG
12
11
  *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
13
- *0x2060..0x206F, # Ignorables
14
- *0xFFF0..0xFFF8, # Ignorables
15
- *0xE0000..0xE0FFF, # Ignorables
16
12
  ].freeze
17
13
 
18
14
  WIDE_RANGES = [
@@ -34,19 +30,36 @@ module Unicoder
34
30
  0xD => 0, # \r CARRIAGE RETURN
35
31
  0xE => 0, # SHIFT OUT
36
32
  0xF => 0, # SHIFT IN
37
- 0x00AD => nil, # SOFT HYPHEN
33
+ # 0x85 => 0, # NEXT LINE
34
+ 0xAD => nil, # SOFT HYPHEN, nil = 1 (default)
38
35
  0x2E3A => 2, # TWO-EM DASH
39
36
  0x2E3B => 3, # THREE-EM DASH
40
37
  }.freeze
41
38
 
42
39
  def initialize_index
43
- @index = []
40
+ @index = {
41
+ WIDTH_ONE: [],
42
+ WIDTH_TWO: [],
43
+ }
44
+ @ignorable = []
44
45
  end
45
46
 
46
47
  def parse!
47
- parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
48
- next if IGNORE_CATEGORIES.include?(line["category"])
48
+ # Find Ignorables
49
+ parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
50
+ if line["codepoints"]['..']
51
+ single_or_multiple_codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
52
+ codepoint.to_i(16)
53
+ })
54
+ else
55
+ single_or_multiple_codepoints = line["codepoints"].to_i(16)
56
+ end
49
57
 
58
+ @ignorable += [*single_or_multiple_codepoints]
59
+ end
60
+
61
+ # Assign based on East Asian Width
62
+ parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
50
63
  if line["codepoints"]['..']
51
64
  codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
52
65
  codepoint.to_i(16)
@@ -56,33 +69,45 @@ module Unicoder
56
69
  end
57
70
 
58
71
  codepoints.each{ |codepoint|
59
- assign_codepoint codepoint, determine_width(codepoint, line["category"], line["width"])
72
+ assign :WIDTH_ONE, codepoint, determine_width(codepoint, line["category"], line["width"], 1)
73
+ assign :WIDTH_TWO, codepoint, determine_width(codepoint, line["category"], line["width"], 2)
60
74
  }
61
75
  end
62
76
 
63
- ZERO_WIDTH_RANGES.each{ |codepoint|
64
- assign_codepoint codepoint, 0
77
+ # Assign Ranges
78
+ ## Zero-width
79
+ (ZERO_WIDTH_HANGUL | @ignorable).each{ |codepoint|
80
+ assign :WIDTH_ONE, codepoint, 0
81
+ assign :WIDTH_TWO, codepoint, 0
65
82
  }
66
83
 
84
+ ## Full-width
67
85
  WIDE_RANGES.each{ |codepoint|
68
- assign_codepoint codepoint, 2
86
+ assign :WIDTH_ONE, codepoint, 2
87
+ assign :WIDTH_TWO, codepoint, 2
69
88
  }
70
89
 
90
+ ## Table
71
91
  SPECIAL_WIDTHS.each{ |codepoint, value|
72
- assign_codepoint codepoint, value
92
+ assign :WIDTH_ONE, codepoint, value
93
+ assign :WIDTH_TWO, codepoint, value
73
94
  }
74
95
 
75
- 4.times{ compress! }
96
+ # Compress Index
97
+ 4.times{ compress! @index[:WIDTH_ONE] }
98
+ 4.times{ compress! @index[:WIDTH_TWO] }
99
+ remove_trailing_nils! @index[:WIDTH_ONE]
100
+ remove_trailing_nils! @index[:WIDTH_TWO]
76
101
  end
77
102
 
78
- def determine_width(codepoint, category, east_asian_width)
103
+ def determine_width(codepoint, category, east_asian_width, ambiguous)
79
104
  if ( ZERO_WIDTH_CATEGORIES.include?(category) &&
80
105
  [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
81
106
  0
82
107
  elsif east_asian_width == "F" || east_asian_width == "W"
83
108
  2
84
109
  elsif east_asian_width == "A"
85
- :A
110
+ ambiguous == 1 ? nil : ambiguous
86
111
  else
87
112
  nil
88
113
  end
@@ -1,9 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.3.0"
4
+ VERSION = "1.5.0"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
+ 17.0.0
7
8
  16.0.0
8
9
  15.1.0
9
10
  15.0.0
@@ -22,6 +23,7 @@ module Unicoder
22
23
  CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first
23
24
 
24
25
  EMOJI_VERSIONS = %w[
26
+ 17.0
25
27
  16.0
26
28
  15.1
27
29
  15.0
@@ -38,6 +40,7 @@ module Unicoder
38
40
  ].freeze
39
41
 
40
42
  EMOJI_RELATED_UNICODE_VERSIONS = {
43
+ "17.0" => "17.0.0",
41
44
  "16.0" => "16.0.0",
42
45
  "15.1" => "15.1.0",
43
46
  "15.0" => "15.0.0",
@@ -55,11 +58,11 @@ module Unicoder
55
58
 
56
59
  CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first
57
60
 
58
- IVD_VERSION = "2022-09-13"
61
+ IVD_VERSION = "2025-07-14"
59
62
 
60
- CLDR_VERSION = "46"
63
+ CLDR_VERSION = "48"
61
64
 
62
- UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
65
+ UNICODE_DATA_ENDPOINT = "http://ftp.unicode.org/Public"
63
66
 
64
67
  LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze
65
68
 
@@ -67,7 +70,8 @@ module Unicoder
67
70
  east_asian_width: "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
68
71
  unicode_data: "/UNICODE_VERSION/ucd/UnicodeData.txt",
69
72
  name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
70
- confusables: "/security/UNICODE_VERSION/confusables.txt",
73
+ confusables: "/UNICODE_VERSION/security/confusables.txt",
74
+ confusables_before_17: "/security/UNICODE_VERSION/confusables.txt",
71
75
  blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
72
76
  core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
73
77
  scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
@@ -82,11 +86,14 @@ module Unicoder
82
86
  ivd_sequences: "https://www.unicode.org/ivd/data/#{IVD_VERSION}/IVD_Sequences.txt",
83
87
  # emoji_data: "/EMOJI_VERSION/ucd/emoji/",
84
88
  emoji_data: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-data.txt",
85
- emoji_sequences: "/emoji/EMOJI_VERSION/emoji-sequences.txt",
89
+ # emoji_sequences: "/emoji/EMOJI_VERSION/emoji-sequences.txt",
90
+ emoji_sequences: "/EMOJI_RELATED_VERSION/emoji/emoji-sequences.txt",
86
91
  # emoji_variation_sequences: "/emoji/EMOJI_VERSION/emoji-variation-sequences.txt",
87
92
  emoji_variation_sequences: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-variation-sequences.txt",
88
- emoji_zwj_sequences: "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
89
- emoji_test: "/emoji/EMOJI_VERSION/emoji-test.txt",
93
+ # emoji_zwj_sequences: "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
94
+ emoji_zwj_sequences: "/EMOJI_RELATED_VERSION/emoji/emoji-zwj-sequences.txt",
95
+ # emoji_test: "/emoji/EMOJI_VERSION/emoji-test.txt",
96
+ emoji_test: "/EMOJI_RELATED_VERSION/emoji/emoji-test.txt",
90
97
  # valid_subdivisions: "https://www.unicode.org/repos/cldr/tags/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
91
98
  valid_subdivisions: "https://raw.githubusercontent.com/unicode-org/cldr/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
92
99
  # ""
data/unicoder.gemspec CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
- gem.required_ruby_version = ">= 3.0", "< 4.0"
20
+ gem.required_ruby_version = ">= 3.0", "< 5.0"
21
21
  gem.add_dependency "rationalist", "~> 2.0"
22
22
  gem.add_dependency "rubyzip", "~> 1.2"
23
23
  gem.add_dependency "oga", "~> 2.9"
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2024-11-04 00:00:00.000000000 Z
10
+ date: 2026-01-10 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: rationalist
@@ -91,7 +90,6 @@ homepage: https://github.com/janlelis/unicoder
91
90
  licenses:
92
91
  - MIT
93
92
  metadata: {}
94
- post_install_message:
95
93
  rdoc_options: []
96
94
  require_paths:
97
95
  - lib
@@ -102,15 +100,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
102
100
  version: '3.0'
103
101
  - - "<"
104
102
  - !ruby/object:Gem::Version
105
- version: '4.0'
103
+ version: '5.0'
106
104
  required_rubygems_version: !ruby/object:Gem::Requirement
107
105
  requirements:
108
106
  - - ">="
109
107
  - !ruby/object:Gem::Version
110
108
  version: '0'
111
109
  requirements: []
112
- rubygems_version: 3.5.21
113
- signing_key:
110
+ rubygems_version: 3.6.2
114
111
  specification_version: 4
115
112
  summary: Creates specialized indexes for Unicode data lookup
116
113
  test_files: []