unicoder 1.1.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
4
- data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
3
+ metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
4
+ data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
5
5
  SHA512:
6
- metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
7
- data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
6
+ metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
7
+ data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
data/CHANGELOG.md CHANGED
@@ -1,5 +1,22 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.3.0
4
+
5
+ - confusable: Add ignorables
6
+ - confusable: Nest index and make ESM/charkeys version, fix ";"
7
+
8
+ ### 1.2.1
9
+
10
+ - name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
11
+
12
+ ### 1.2.0
13
+
14
+ - Change format for sequence_name's sub-index for unqualified Emoji sequences
15
+
16
+ ### 1.1.2
17
+
18
+ - Update CLDR to v46
19
+
3
20
  ### 1.1.1
4
21
 
5
22
  - Fix bug related to unsafe characters
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.1.1)
4
+ unicoder (1.3.0)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -39,7 +39,9 @@ Index Name | Module
39
39
  --------------|----
40
40
  name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
41
  numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
- scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
42
+ scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
43
+ blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
44
+ categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
43
45
 
44
46
  ## MIT License
45
47
 
@@ -73,8 +73,18 @@ module Unicoder
73
73
  file = File.read(LOCAL_DATA_DIRECTORY + filename)
74
74
 
75
75
  if parse_mode == :line
76
+ active = !parse_options[:begin]
77
+
76
78
  file.each_line{ |line|
77
- yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
79
+ if !active && parse_options[:begin] && line.match?(parse_options[:begin])
80
+ active = true
81
+ elsif active && parse_options[:end] && line.match?(parse_options[:end])
82
+ active = false
83
+ end
84
+
85
+ if active
86
+ yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
87
+ end
78
88
  }
79
89
  elsif parse_mode == :xml
80
90
  require "oga"
@@ -9,6 +9,12 @@ module Unicoder
9
9
  @index = {
10
10
  CATEGORIES: [],
11
11
  CATEGORY_NAMES: {},
12
+ OFFSETS: [
13
+ 0x10000,
14
+ 0x1000,
15
+ 0x100,
16
+ 0x10
17
+ ],
12
18
  }
13
19
  @range_start = nil
14
20
  end
@@ -3,17 +3,38 @@ module Unicoder
3
3
  class Confusable
4
4
  include Builder
5
5
 
6
+ def initialize_index
7
+ @index = {
8
+ CONFUSABLE: {},
9
+ IGNORABLE: [],
10
+ }
11
+ end
12
+
6
13
  def parse!
7
- parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+)\s+;.*$/ do |line|
14
+ parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
8
15
  source = line["from"].to_i(16)
9
16
  if line["to"].include?(" ")
10
17
  replace_with = line["to"].split(" ").map{ |codepoint|
18
+ cp = codepoint.to_i(16)
19
+ option =~ /charvalues/ ? [cp].pack("U") : cp
20
+ }
21
+ else
22
+ cp = line["to"].to_i(16)
23
+ replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
24
+ end
25
+ assign :CONFUSABLE, source, replace_with
26
+ end
27
+
28
+ parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
29
+ if line["codepoints"]['..']
30
+ single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
11
31
  codepoint.to_i(16)
12
32
  }
13
33
  else
14
- replace_with = line["to"].to_i(16)
34
+ single_or_multiple_codepoints = line["codepoints"].to_i(16)
15
35
  end
16
- @index[source] = replace_with
36
+
37
+ @index[:IGNORABLE] << single_or_multiple_codepoints
17
38
  end
18
39
  end
19
40
  end
@@ -27,7 +27,7 @@ module Unicoder
27
27
  "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
28
28
  "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
29
29
  "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
30
- "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
30
+ "CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
31
31
  },
32
32
  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
33
33
  JAMO: {
@@ -11,7 +11,7 @@ module Unicoder
11
11
  def initialize_index
12
12
  @index = {
13
13
  SEQUENCES: {},
14
- SEQUENCES_NOT_QUALIFIED: {},
14
+ EMOJI_NOT_QUALIFIED: {},
15
15
  }
16
16
  @words = []
17
17
  end
@@ -74,8 +74,12 @@ module Unicoder
74
74
  name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
75
75
  codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
76
76
  assign_codepoint codepoints, name
77
+
78
+
79
+ # Build all combinations of VS16 present and missing and add to second index
77
80
  if codepoints.include?(0xFE0F)
78
- # Build all combinations of VS16 present and missing
81
+ sequence = codepoints.pack("U*")
82
+
79
83
  codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
80
84
  if cur.include? 0xFE0F
81
85
  acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
@@ -85,13 +89,13 @@ module Unicoder
85
89
  }.
86
90
  select {|sub_codepoints| sub_codepoints != codepoints }.
87
91
  each { |sub_codepoints|
88
- assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
92
+ sub_sequence = sub_codepoints.pack("U*")
93
+ @index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
89
94
  }
90
95
  end
91
96
  end
92
97
 
93
98
  replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
94
- replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
95
99
  end
96
100
  end
97
101
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.1.1"
4
+ VERSION = "1.3.0"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -57,7 +57,7 @@ module Unicoder
57
57
 
58
58
  IVD_VERSION = "2022-09-13"
59
59
 
60
- CLDR_VERSION = "45"
60
+ CLDR_VERSION = "46"
61
61
 
62
62
  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
63
63
 
@@ -69,6 +69,7 @@ module Unicoder
69
69
  name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
70
70
  confusables: "/security/UNICODE_VERSION/confusables.txt",
71
71
  blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
72
+ core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
72
73
  scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
73
74
  script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
74
75
  property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-14 00:00:00.000000000 Z
11
+ date: 2024-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist
@@ -61,7 +61,6 @@ extensions: []
61
61
  extra_rdoc_files: []
62
62
  files:
63
63
  - ".gitignore"
64
- - ".travis.yml"
65
64
  - CHANGELOG.md
66
65
  - CODE_OF_CONDUCT.md
67
66
  - Gemfile
data/.travis.yml DELETED
@@ -1,20 +0,0 @@
1
- sudo: false
2
- language: ruby
3
-
4
- rvm:
5
- - 2.7
6
- - 2.6
7
- - 2.5
8
- - 2.4
9
- - 2.3
10
- - ruby-head
11
- - jruby-9.2.9.0
12
- - truffleruby
13
-
14
- matrix:
15
- allow_failures:
16
- - rvm: 2.3
17
- - rvm: ruby-head
18
- - rvm: jruby-2.9.2.0
19
- - rvm: truffleruby
20
- # fast_finish: true