unicoder 1.1.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/Gemfile.lock +1 -1
- data/README.md +3 -1
- data/lib/unicoder/builder.rb +11 -1
- data/lib/unicoder/builders/categories.rb +6 -0
- data/lib/unicoder/builders/confusable.rb +24 -3
- data/lib/unicoder/builders/name.rb +1 -1
- data/lib/unicoder/builders/sequence_name.rb +8 -4
- data/lib/unicoder/constants.rb +3 -2
- metadata +2 -3
- data/.travis.yml +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
|
4
|
+
data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
|
7
|
+
data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,22 @@
|
|
1
1
|
## CHANGELOG
|
2
2
|
|
3
|
+
### 1.3.0
|
4
|
+
|
5
|
+
- confusable: Add ignorables
|
6
|
+
- confusable: Nest index and make ESM/charkeys version, fix ";"
|
7
|
+
|
8
|
+
### 1.2.1
|
9
|
+
|
10
|
+
- name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
|
11
|
+
|
12
|
+
### 1.2.0
|
13
|
+
|
14
|
+
- Change format for sequence_name's sub-index for unqualified Emoji sequences
|
15
|
+
|
16
|
+
### 1.1.2
|
17
|
+
|
18
|
+
- Update CLDR to v46
|
19
|
+
|
3
20
|
### 1.1.1
|
4
21
|
|
5
22
|
- Fix bug related to unsafe characters
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -39,7 +39,9 @@ Index Name | Module
|
|
39
39
|
--------------|----
|
40
40
|
name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
|
41
41
|
numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
|
42
|
-
scripts | [unicode-
|
42
|
+
scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
|
43
|
+
blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
|
44
|
+
categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
|
43
45
|
|
44
46
|
## MIT License
|
45
47
|
|
data/lib/unicoder/builder.rb
CHANGED
@@ -73,8 +73,18 @@ module Unicoder
|
|
73
73
|
file = File.read(LOCAL_DATA_DIRECTORY + filename)
|
74
74
|
|
75
75
|
if parse_mode == :line
|
76
|
+
active = !parse_options[:begin]
|
77
|
+
|
76
78
|
file.each_line{ |line|
|
77
|
-
|
79
|
+
if !active && parse_options[:begin] && line.match?(parse_options[:begin])
|
80
|
+
active = true
|
81
|
+
elsif active && parse_options[:end] && line.match?(parse_options[:end])
|
82
|
+
active = false
|
83
|
+
end
|
84
|
+
|
85
|
+
if active
|
86
|
+
yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
|
87
|
+
end
|
78
88
|
}
|
79
89
|
elsif parse_mode == :xml
|
80
90
|
require "oga"
|
@@ -3,17 +3,38 @@ module Unicoder
|
|
3
3
|
class Confusable
|
4
4
|
include Builder
|
5
5
|
|
6
|
+
def initialize_index
|
7
|
+
@index = {
|
8
|
+
CONFUSABLE: {},
|
9
|
+
IGNORABLE: [],
|
10
|
+
}
|
11
|
+
end
|
12
|
+
|
6
13
|
def parse!
|
7
|
-
parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to
|
14
|
+
parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
|
8
15
|
source = line["from"].to_i(16)
|
9
16
|
if line["to"].include?(" ")
|
10
17
|
replace_with = line["to"].split(" ").map{ |codepoint|
|
18
|
+
cp = codepoint.to_i(16)
|
19
|
+
option =~ /charvalues/ ? [cp].pack("U") : cp
|
20
|
+
}
|
21
|
+
else
|
22
|
+
cp = line["to"].to_i(16)
|
23
|
+
replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
|
24
|
+
end
|
25
|
+
assign :CONFUSABLE, source, replace_with
|
26
|
+
end
|
27
|
+
|
28
|
+
parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
|
29
|
+
if line["codepoints"]['..']
|
30
|
+
single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
|
11
31
|
codepoint.to_i(16)
|
12
32
|
}
|
13
33
|
else
|
14
|
-
|
34
|
+
single_or_multiple_codepoints = line["codepoints"].to_i(16)
|
15
35
|
end
|
16
|
-
|
36
|
+
|
37
|
+
@index[:IGNORABLE] << single_or_multiple_codepoints
|
17
38
|
end
|
18
39
|
end
|
19
40
|
end
|
@@ -27,7 +27,7 @@ module Unicoder
|
|
27
27
|
"EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
|
28
28
|
"KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
|
29
29
|
"NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
|
30
|
-
"CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
|
30
|
+
"CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
|
31
31
|
},
|
32
32
|
# see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
|
33
33
|
JAMO: {
|
@@ -11,7 +11,7 @@ module Unicoder
|
|
11
11
|
def initialize_index
|
12
12
|
@index = {
|
13
13
|
SEQUENCES: {},
|
14
|
-
|
14
|
+
EMOJI_NOT_QUALIFIED: {},
|
15
15
|
}
|
16
16
|
@words = []
|
17
17
|
end
|
@@ -74,8 +74,12 @@ module Unicoder
|
|
74
74
|
name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
|
75
75
|
codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
|
76
76
|
assign_codepoint codepoints, name
|
77
|
+
|
78
|
+
|
79
|
+
# Build all combinations of VS16 present and missing and add to second index
|
77
80
|
if codepoints.include?(0xFE0F)
|
78
|
-
|
81
|
+
sequence = codepoints.pack("U*")
|
82
|
+
|
79
83
|
codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
|
80
84
|
if cur.include? 0xFE0F
|
81
85
|
acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
|
@@ -85,13 +89,13 @@ module Unicoder
|
|
85
89
|
}.
|
86
90
|
select {|sub_codepoints| sub_codepoints != codepoints }.
|
87
91
|
each { |sub_codepoints|
|
88
|
-
|
92
|
+
sub_sequence = sub_codepoints.pack("U*")
|
93
|
+
@index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
|
89
94
|
}
|
90
95
|
end
|
91
96
|
end
|
92
97
|
|
93
98
|
replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
94
|
-
replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
95
99
|
end
|
96
100
|
end
|
97
101
|
end
|
data/lib/unicoder/constants.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Unicoder
|
4
|
-
VERSION = "1.
|
4
|
+
VERSION = "1.3.0"
|
5
5
|
|
6
6
|
UNICODE_VERSIONS = %w[
|
7
7
|
16.0.0
|
@@ -57,7 +57,7 @@ module Unicoder
|
|
57
57
|
|
58
58
|
IVD_VERSION = "2022-09-13"
|
59
59
|
|
60
|
-
CLDR_VERSION = "
|
60
|
+
CLDR_VERSION = "46"
|
61
61
|
|
62
62
|
UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
|
63
63
|
|
@@ -69,6 +69,7 @@ module Unicoder
|
|
69
69
|
name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
|
70
70
|
confusables: "/security/UNICODE_VERSION/confusables.txt",
|
71
71
|
blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
|
72
|
+
core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
|
72
73
|
scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
|
73
74
|
script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
|
74
75
|
property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicoder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rationalist
|
@@ -61,7 +61,6 @@ extensions: []
|
|
61
61
|
extra_rdoc_files: []
|
62
62
|
files:
|
63
63
|
- ".gitignore"
|
64
|
-
- ".travis.yml"
|
65
64
|
- CHANGELOG.md
|
66
65
|
- CODE_OF_CONDUCT.md
|
67
66
|
- Gemfile
|
data/.travis.yml
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
sudo: false
|
2
|
-
language: ruby
|
3
|
-
|
4
|
-
rvm:
|
5
|
-
- 2.7
|
6
|
-
- 2.6
|
7
|
-
- 2.5
|
8
|
-
- 2.4
|
9
|
-
- 2.3
|
10
|
-
- ruby-head
|
11
|
-
- jruby-9.2.9.0
|
12
|
-
- truffleruby
|
13
|
-
|
14
|
-
matrix:
|
15
|
-
allow_failures:
|
16
|
-
- rvm: 2.3
|
17
|
-
- rvm: ruby-head
|
18
|
-
- rvm: jruby-2.9.2.0
|
19
|
-
- rvm: truffleruby
|
20
|
-
# fast_finish: true
|