unicoder 1.3.0 → 1.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
4
- data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
3
+ metadata.gz: c698f0042604828d6acf19123dba21935d65387c030df78774885e0e0084c6ef
4
+ data.tar.gz: f8c1b180273b758079232066ecd4729adbfb41901464414ad56bab6df3c83ee5
5
5
  SHA512:
6
- metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
7
- data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
6
+ metadata.gz: 486f62f96ed10a3dac0703f62da4c85fb1b6a594ff922e7bae668bdf62878f3c2b5704bb89f491164d0e416d66e316eb8f7fbb23c5eb971f9231edef26bb5162
7
+ data.tar.gz: cc7bdff24d99a31021d0b1321447da769b0e25d5209b79e75d6d04c5d1590b858f9f9195c68dd7e35a5817f7dbd00dc7f5f3dae3eaaa268542e4608d7095b035
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.4.0
4
+
5
+ - Update Unicode and Emoji to 17.0
6
+ - Some files now have a new location in UCD
7
+ - Update CLDR to v47
8
+ - Update IVD to 2025-07-14
9
+
3
10
  ### 1.3.0
4
11
 
5
12
  - confusable: Add ignorables
@@ -4,15 +4,11 @@ module Unicoder
4
4
  include Builder
5
5
  include MultiDimensionalArrayBuilder
6
6
 
7
- IGNORE_CATEGORIES = %w[Cs Co Cn].freeze
8
- ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
7
+ ZERO_WIDTH_CATEGORIES = %w[Mn Me Zl Zp Cf].freeze
9
8
 
10
- ZERO_WIDTH_RANGES = [
9
+ ZERO_WIDTH_HANGUL = [
11
10
  *0x1160..0x11FF, # HANGUL JUNGSEONG
12
11
  *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
13
- *0x2060..0x206F, # Ignorables
14
- *0xFFF0..0xFFF8, # Ignorables
15
- *0xE0000..0xE0FFF, # Ignorables
16
12
  ].freeze
17
13
 
18
14
  WIDE_RANGES = [
@@ -34,19 +30,36 @@ module Unicoder
34
30
  0xD => 0, # \r CARRIAGE RETURN
35
31
  0xE => 0, # SHIFT OUT
36
32
  0xF => 0, # SHIFT IN
37
- 0x00AD => nil, # SOFT HYPHEN
33
+ # 0x85 => 0, # NEXT LINE
34
+ 0xAD => nil, # SOFT HYPHEN, nil = 1 (default)
38
35
  0x2E3A => 2, # TWO-EM DASH
39
36
  0x2E3B => 3, # THREE-EM DASH
40
37
  }.freeze
41
38
 
42
39
  def initialize_index
43
- @index = []
40
+ @index = {
41
+ WIDTH_ONE: [],
42
+ WIDTH_TWO: [],
43
+ }
44
+ @ignorable = []
44
45
  end
45
46
 
46
47
  def parse!
47
- parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
48
- next if IGNORE_CATEGORIES.include?(line["category"])
48
+ # Find Ignorables
49
+ parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
50
+ if line["codepoints"]['..']
51
+ single_or_multiple_codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
52
+ codepoint.to_i(16)
53
+ })
54
+ else
55
+ single_or_multiple_codepoints = line["codepoints"].to_i(16)
56
+ end
49
57
 
58
+ @ignorable += [*single_or_multiple_codepoints]
59
+ end
60
+
61
+ # Assign based on East Asian Width
62
+ parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
50
63
  if line["codepoints"]['..']
51
64
  codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
52
65
  codepoint.to_i(16)
@@ -56,33 +69,45 @@ module Unicoder
56
69
  end
57
70
 
58
71
  codepoints.each{ |codepoint|
59
- assign_codepoint codepoint, determine_width(codepoint, line["category"], line["width"])
72
+ assign :WIDTH_ONE, codepoint, determine_width(codepoint, line["category"], line["width"], 1)
73
+ assign :WIDTH_TWO, codepoint, determine_width(codepoint, line["category"], line["width"], 2)
60
74
  }
61
75
  end
62
76
 
63
- ZERO_WIDTH_RANGES.each{ |codepoint|
64
- assign_codepoint codepoint, 0
77
+ # Assign Ranges
78
+ ## Zero-width
79
+ (ZERO_WIDTH_HANGUL | @ignorable).each{ |codepoint|
80
+ assign :WIDTH_ONE, codepoint, 0
81
+ assign :WIDTH_TWO, codepoint, 0
65
82
  }
66
83
 
84
+ ## Full-width
67
85
  WIDE_RANGES.each{ |codepoint|
68
- assign_codepoint codepoint, 2
86
+ assign :WIDTH_ONE, codepoint, 2
87
+ assign :WIDTH_TWO, codepoint, 2
69
88
  }
70
89
 
90
+ ## Table
71
91
  SPECIAL_WIDTHS.each{ |codepoint, value|
72
- assign_codepoint codepoint, value
92
+ assign :WIDTH_ONE, codepoint, value
93
+ assign :WIDTH_TWO, codepoint, value
73
94
  }
74
95
 
75
- 4.times{ compress! }
96
+ # Compress Index
97
+ 4.times{ compress! @index[:WIDTH_ONE] }
98
+ 4.times{ compress! @index[:WIDTH_TWO] }
99
+ remove_trailing_nils! @index[:WIDTH_ONE]
100
+ remove_trailing_nils! @index[:WIDTH_TWO]
76
101
  end
77
102
 
78
- def determine_width(codepoint, category, east_asian_width)
103
+ def determine_width(codepoint, category, east_asian_width, ambiguous)
79
104
  if ( ZERO_WIDTH_CATEGORIES.include?(category) &&
80
105
  [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
81
106
  0
82
107
  elsif east_asian_width == "F" || east_asian_width == "W"
83
108
  2
84
109
  elsif east_asian_width == "A"
85
- :A
110
+ ambiguous == 1 ? nil : ambiguous
86
111
  else
87
112
  nil
88
113
  end
@@ -1,9 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.3.0"
4
+ VERSION = "1.4.0"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
+ 17.0.0
7
8
  16.0.0
8
9
  15.1.0
9
10
  15.0.0
@@ -22,6 +23,7 @@ module Unicoder
22
23
  CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first
23
24
 
24
25
  EMOJI_VERSIONS = %w[
26
+ 17.0
25
27
  16.0
26
28
  15.1
27
29
  15.0
@@ -38,6 +40,7 @@ module Unicoder
38
40
  ].freeze
39
41
 
40
42
  EMOJI_RELATED_UNICODE_VERSIONS = {
43
+ "17.0" => "17.0.0",
41
44
  "16.0" => "16.0.0",
42
45
  "15.1" => "15.1.0",
43
46
  "15.0" => "15.0.0",
@@ -55,11 +58,11 @@ module Unicoder
55
58
 
56
59
  CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first
57
60
 
58
- IVD_VERSION = "2022-09-13"
61
+ IVD_VERSION = "2025-07-14"
59
62
 
60
- CLDR_VERSION = "46"
63
+ CLDR_VERSION = "47"
61
64
 
62
- UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
65
+ UNICODE_DATA_ENDPOINT = "http://ftp.unicode.org/Public"
63
66
 
64
67
  LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze
65
68
 
@@ -67,7 +70,8 @@ module Unicoder
67
70
  east_asian_width: "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
68
71
  unicode_data: "/UNICODE_VERSION/ucd/UnicodeData.txt",
69
72
  name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
70
- confusables: "/security/UNICODE_VERSION/confusables.txt",
73
+ confusables: "/UNICODE_VERSION/security/confusables.txt",
74
+ confusables_before_17: "/security/UNICODE_VERSION/confusables.txt",
71
75
  blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
72
76
  core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
73
77
  scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-11-04 00:00:00.000000000 Z
11
+ date: 2025-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist