unicoder 1.1.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
4
- data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
3
+ metadata.gz: c698f0042604828d6acf19123dba21935d65387c030df78774885e0e0084c6ef
4
+ data.tar.gz: f8c1b180273b758079232066ecd4729adbfb41901464414ad56bab6df3c83ee5
5
5
  SHA512:
6
- metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
7
- data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
6
+ metadata.gz: 486f62f96ed10a3dac0703f62da4c85fb1b6a594ff922e7bae668bdf62878f3c2b5704bb89f491164d0e416d66e316eb8f7fbb23c5eb971f9231edef26bb5162
7
+ data.tar.gz: cc7bdff24d99a31021d0b1321447da769b0e25d5209b79e75d6d04c5d1590b858f9f9195c68dd7e35a5817f7dbd00dc7f5f3dae3eaaa268542e4608d7095b035
data/CHANGELOG.md CHANGED
@@ -1,5 +1,29 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.4.0
4
+
5
+ - Update Unicode and Emoji to 17.0
6
+ - Some files now have a new location in UCD
7
+ - Update CLDR to v47
8
+ - Update IVD to 2025-07-14
9
+
10
+ ### 1.3.0
11
+
12
+ - confusable: Add ignorables
13
+ - confusable: Nest index and make ESM/charkeys version, fix ";"
14
+
15
+ ### 1.2.1
16
+
17
+ - name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
18
+
19
+ ### 1.2.0
20
+
21
+ - Change format for sequence_name's sub-index for unqualified Emoji sequences
22
+
23
+ ### 1.1.2
24
+
25
+ - Update CLDR to v46
26
+
3
27
  ### 1.1.1
4
28
 
5
29
  - Fix bug related to unsafe characters
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.1.1)
4
+ unicoder (1.3.0)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -39,7 +39,9 @@ Index Name | Module
39
39
  --------------|----
40
40
  name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
41
  numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
- scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
42
+ scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
43
+ blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
44
+ categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
43
45
 
44
46
  ## MIT License
45
47
 
@@ -73,8 +73,18 @@ module Unicoder
73
73
  file = File.read(LOCAL_DATA_DIRECTORY + filename)
74
74
 
75
75
  if parse_mode == :line
76
+ active = !parse_options[:begin]
77
+
76
78
  file.each_line{ |line|
77
- yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
79
+ if !active && parse_options[:begin] && line.match?(parse_options[:begin])
80
+ active = true
81
+ elsif active && parse_options[:end] && line.match?(parse_options[:end])
82
+ active = false
83
+ end
84
+
85
+ if active
86
+ yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
87
+ end
78
88
  }
79
89
  elsif parse_mode == :xml
80
90
  require "oga"
@@ -9,6 +9,12 @@ module Unicoder
9
9
  @index = {
10
10
  CATEGORIES: [],
11
11
  CATEGORY_NAMES: {},
12
+ OFFSETS: [
13
+ 0x10000,
14
+ 0x1000,
15
+ 0x100,
16
+ 0x10
17
+ ],
12
18
  }
13
19
  @range_start = nil
14
20
  end
@@ -3,17 +3,38 @@ module Unicoder
3
3
  class Confusable
4
4
  include Builder
5
5
 
6
+ def initialize_index
7
+ @index = {
8
+ CONFUSABLE: {},
9
+ IGNORABLE: [],
10
+ }
11
+ end
12
+
6
13
  def parse!
7
- parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+)\s+;.*$/ do |line|
14
+ parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
8
15
  source = line["from"].to_i(16)
9
16
  if line["to"].include?(" ")
10
17
  replace_with = line["to"].split(" ").map{ |codepoint|
18
+ cp = codepoint.to_i(16)
19
+ option =~ /charvalues/ ? [cp].pack("U") : cp
20
+ }
21
+ else
22
+ cp = line["to"].to_i(16)
23
+ replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
24
+ end
25
+ assign :CONFUSABLE, source, replace_with
26
+ end
27
+
28
+ parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
29
+ if line["codepoints"]['..']
30
+ single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
11
31
  codepoint.to_i(16)
12
32
  }
13
33
  else
14
- replace_with = line["to"].to_i(16)
34
+ single_or_multiple_codepoints = line["codepoints"].to_i(16)
15
35
  end
16
- @index[source] = replace_with
36
+
37
+ @index[:IGNORABLE] << single_or_multiple_codepoints
17
38
  end
18
39
  end
19
40
  end
@@ -4,15 +4,11 @@ module Unicoder
4
4
  include Builder
5
5
  include MultiDimensionalArrayBuilder
6
6
 
7
- IGNORE_CATEGORIES = %w[Cs Co Cn].freeze
8
- ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
7
+ ZERO_WIDTH_CATEGORIES = %w[Mn Me Zl Zp Cf].freeze
9
8
 
10
- ZERO_WIDTH_RANGES = [
9
+ ZERO_WIDTH_HANGUL = [
11
10
  *0x1160..0x11FF, # HANGUL JUNGSEONG
12
11
  *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
13
- *0x2060..0x206F, # Ignorables
14
- *0xFFF0..0xFFF8, # Ignorables
15
- *0xE0000..0xE0FFF, # Ignorables
16
12
  ].freeze
17
13
 
18
14
  WIDE_RANGES = [
@@ -34,19 +30,36 @@ module Unicoder
34
30
  0xD => 0, # \r CARRIAGE RETURN
35
31
  0xE => 0, # SHIFT OUT
36
32
  0xF => 0, # SHIFT IN
37
- 0x00AD => nil, # SOFT HYPHEN
33
+ # 0x85 => 0, # NEXT LINE
34
+ 0xAD => nil, # SOFT HYPHEN, nil = 1 (default)
38
35
  0x2E3A => 2, # TWO-EM DASH
39
36
  0x2E3B => 3, # THREE-EM DASH
40
37
  }.freeze
41
38
 
42
39
  def initialize_index
43
- @index = []
40
+ @index = {
41
+ WIDTH_ONE: [],
42
+ WIDTH_TWO: [],
43
+ }
44
+ @ignorable = []
44
45
  end
45
46
 
46
47
  def parse!
47
- parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
48
- next if IGNORE_CATEGORIES.include?(line["category"])
48
+ # Find Ignorables
49
+ parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
50
+ if line["codepoints"]['..']
51
+ single_or_multiple_codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
52
+ codepoint.to_i(16)
53
+ })
54
+ else
55
+ single_or_multiple_codepoints = line["codepoints"].to_i(16)
56
+ end
49
57
 
58
+ @ignorable += [*single_or_multiple_codepoints]
59
+ end
60
+
61
+ # Assign based on East Asian Width
62
+ parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
50
63
  if line["codepoints"]['..']
51
64
  codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
52
65
  codepoint.to_i(16)
@@ -56,33 +69,45 @@ module Unicoder
56
69
  end
57
70
 
58
71
  codepoints.each{ |codepoint|
59
- assign_codepoint codepoint, determine_width(codepoint, line["category"], line["width"])
72
+ assign :WIDTH_ONE, codepoint, determine_width(codepoint, line["category"], line["width"], 1)
73
+ assign :WIDTH_TWO, codepoint, determine_width(codepoint, line["category"], line["width"], 2)
60
74
  }
61
75
  end
62
76
 
63
- ZERO_WIDTH_RANGES.each{ |codepoint|
64
- assign_codepoint codepoint, 0
77
+ # Assign Ranges
78
+ ## Zero-width
79
+ (ZERO_WIDTH_HANGUL | @ignorable).each{ |codepoint|
80
+ assign :WIDTH_ONE, codepoint, 0
81
+ assign :WIDTH_TWO, codepoint, 0
65
82
  }
66
83
 
84
+ ## Full-width
67
85
  WIDE_RANGES.each{ |codepoint|
68
- assign_codepoint codepoint, 2
86
+ assign :WIDTH_ONE, codepoint, 2
87
+ assign :WIDTH_TWO, codepoint, 2
69
88
  }
70
89
 
90
+ ## Table
71
91
  SPECIAL_WIDTHS.each{ |codepoint, value|
72
- assign_codepoint codepoint, value
92
+ assign :WIDTH_ONE, codepoint, value
93
+ assign :WIDTH_TWO, codepoint, value
73
94
  }
74
95
 
75
- 4.times{ compress! }
96
+ # Compres Index
97
+ 4.times{ compress! @index[:WIDTH_ONE] }
98
+ 4.times{ compress! @index[:WIDTH_TWO] }
99
+ remove_trailing_nils! @index[:WIDTH_ONE]
100
+ remove_trailing_nils! @index[:WIDTH_TWO]
76
101
  end
77
102
 
78
- def determine_width(codepoint, category, east_asian_width)
103
+ def determine_width(codepoint, category, east_asian_width, ambiguous)
79
104
  if ( ZERO_WIDTH_CATEGORIES.include?(category) &&
80
105
  [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
81
106
  0
82
107
  elsif east_asian_width == "F" || east_asian_width == "W"
83
108
  2
84
109
  elsif east_asian_width == "A"
85
- :A
110
+ ambiguous == 1 ? nil : ambiguous
86
111
  else
87
112
  nil
88
113
  end
@@ -27,7 +27,7 @@ module Unicoder
27
27
  "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
28
28
  "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
29
29
  "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
30
- "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
30
+ "CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
31
31
  },
32
32
  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
33
33
  JAMO: {
@@ -11,7 +11,7 @@ module Unicoder
11
11
  def initialize_index
12
12
  @index = {
13
13
  SEQUENCES: {},
14
- SEQUENCES_NOT_QUALIFIED: {},
14
+ EMOJI_NOT_QUALIFIED: {},
15
15
  }
16
16
  @words = []
17
17
  end
@@ -74,8 +74,12 @@ module Unicoder
74
74
  name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
75
75
  codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
76
76
  assign_codepoint codepoints, name
77
+
78
+
79
+ # Build all combinations of VS16 present and missing and add to second index
77
80
  if codepoints.include?(0xFE0F)
78
- # Build all combinations of VS16 present and missing
81
+ sequence = codepoints.pack("U*")
82
+
79
83
  codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
80
84
  if cur.include? 0xFE0F
81
85
  acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
@@ -85,13 +89,13 @@ module Unicoder
85
89
  }.
86
90
  select {|sub_codepoints| sub_codepoints != codepoints }.
87
91
  each { |sub_codepoints|
88
- assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
92
+ sub_sequence = sub_codepoints.pack("U*")
93
+ @index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
89
94
  }
90
95
  end
91
96
  end
92
97
 
93
98
  replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
94
- replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
95
99
  end
96
100
  end
97
101
  end
@@ -1,9 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.1.1"
4
+ VERSION = "1.4.0"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
+ 17.0.0
7
8
  16.0.0
8
9
  15.1.0
9
10
  15.0.0
@@ -22,6 +23,7 @@ module Unicoder
22
23
  CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first
23
24
 
24
25
  EMOJI_VERSIONS = %w[
26
+ 17.0
25
27
  16.0
26
28
  15.1
27
29
  15.0
@@ -38,6 +40,7 @@ module Unicoder
38
40
  ].freeze
39
41
 
40
42
  EMOJI_RELATED_UNICODE_VERSIONS = {
43
+ "17.0" => "17.0.0",
41
44
  "16.0" => "16.0.0",
42
45
  "15.1" => "15.1.0",
43
46
  "15.0" => "15.0.0",
@@ -55,11 +58,11 @@ module Unicoder
55
58
 
56
59
  CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first
57
60
 
58
- IVD_VERSION = "2022-09-13"
61
+ IVD_VERSION = "2025-07-14"
59
62
 
60
- CLDR_VERSION = "45"
63
+ CLDR_VERSION = "47"
61
64
 
62
- UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
65
+ UNICODE_DATA_ENDPOINT = "http://ftp.unicode.org/Public"
63
66
 
64
67
  LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze
65
68
 
@@ -67,8 +70,10 @@ module Unicoder
67
70
  east_asian_width: "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
68
71
  unicode_data: "/UNICODE_VERSION/ucd/UnicodeData.txt",
69
72
  name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
70
- confusables: "/security/UNICODE_VERSION/confusables.txt",
73
+ confusables: "/UNICODE_VERSION/security/confusables.txt",
74
+ confusables_before_17: "/security/UNICODE_VERSION/confusables.txt",
71
75
  blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
76
+ core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
72
77
  scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
73
78
  script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
74
79
  property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-14 00:00:00.000000000 Z
11
+ date: 2025-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist
@@ -61,7 +61,6 @@ extensions: []
61
61
  extra_rdoc_files: []
62
62
  files:
63
63
  - ".gitignore"
64
- - ".travis.yml"
65
64
  - CHANGELOG.md
66
65
  - CODE_OF_CONDUCT.md
67
66
  - Gemfile
data/.travis.yml DELETED
@@ -1,20 +0,0 @@
1
- sudo: false
2
- language: ruby
3
-
4
- rvm:
5
- - 2.7
6
- - 2.6
7
- - 2.5
8
- - 2.4
9
- - 2.3
10
- - ruby-head
11
- - jruby-9.2.9.0
12
- - truffleruby
13
-
14
- matrix:
15
- allow_failures:
16
- - rvm: 2.3
17
- - rvm: ruby-head
18
- - rvm: jruby-2.9.2.0
19
- - rvm: truffleruby
20
- # fast_finish: true