unicoder 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Gemfile.lock +1 -1
- data/README.md +3 -0
- data/lib/unicoder/builder.rb +11 -1
- data/lib/unicoder/builders/blocks.rb +4 -2
- data/lib/unicoder/builders/categories.rb +6 -0
- data/lib/unicoder/builders/confusable.rb +24 -3
- data/lib/unicoder/builders/name.rb +2 -2
- data/lib/unicoder/builders/scripts.rb +19 -2
- data/lib/unicoder/builders/sequence_name.rb +8 -4
- data/lib/unicoder/constants.rb +3 -2
- data/lib/unicoder/replace_common_words.rb +3 -2
- metadata +2 -3
- data/.travis.yml +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
|
4
|
+
data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
|
7
|
+
data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,28 @@
|
|
1
1
|
## CHANGELOG
|
2
2
|
|
3
|
+
### 1.3.0
|
4
|
+
|
5
|
+
- confusable: Add ignorables
|
6
|
+
- confusable: Nest index and make ESM/charkeys version, fix ";"
|
7
|
+
|
8
|
+
### 1.2.1
|
9
|
+
|
10
|
+
- name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
|
11
|
+
|
12
|
+
### 1.2.0
|
13
|
+
|
14
|
+
- Change format for sequence_name's sub-index for unqualified Emoji sequences
|
15
|
+
|
16
|
+
### 1.1.2
|
17
|
+
|
18
|
+
- Update CLDR to v46
|
19
|
+
|
20
|
+
### 1.1.1
|
21
|
+
|
22
|
+
- Fix bug related to unsafe characters
|
23
|
+
- Fix squared CJK
|
24
|
+
- Small adjustments for scripts and blocks index builders
|
25
|
+
|
3
26
|
### 1.1.0
|
4
27
|
|
5
28
|
- Improve name index size: Support ranges
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -39,6 +39,9 @@ Index Name | Module
|
|
39
39
|
--------------|----
|
40
40
|
name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
|
41
41
|
numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
|
42
|
+
scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
|
43
|
+
blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
|
44
|
+
categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
|
42
45
|
|
43
46
|
## MIT License
|
44
47
|
|
data/lib/unicoder/builder.rb
CHANGED
@@ -73,8 +73,18 @@ module Unicoder
|
|
73
73
|
file = File.read(LOCAL_DATA_DIRECTORY + filename)
|
74
74
|
|
75
75
|
if parse_mode == :line
|
76
|
+
active = !parse_options[:begin]
|
77
|
+
|
76
78
|
file.each_line{ |line|
|
77
|
-
|
79
|
+
if !active && parse_options[:begin] && line.match?(parse_options[:begin])
|
80
|
+
active = true
|
81
|
+
elsif active && parse_options[:end] && line.match?(parse_options[:end])
|
82
|
+
active = false
|
83
|
+
end
|
84
|
+
|
85
|
+
if active
|
86
|
+
yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
|
87
|
+
end
|
78
88
|
}
|
79
89
|
elsif parse_mode == :xml
|
80
90
|
require "oga"
|
@@ -4,12 +4,14 @@ module Unicoder
|
|
4
4
|
include Builder
|
5
5
|
|
6
6
|
def initialize_index
|
7
|
-
@index =
|
7
|
+
@index = {
|
8
|
+
BLOCKS: []
|
9
|
+
}
|
8
10
|
end
|
9
11
|
|
10
12
|
def parse!
|
11
13
|
parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
|
12
|
-
@index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
14
|
+
@index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
@@ -3,17 +3,38 @@ module Unicoder
|
|
3
3
|
class Confusable
|
4
4
|
include Builder
|
5
5
|
|
6
|
+
def initialize_index
|
7
|
+
@index = {
|
8
|
+
CONFUSABLE: {},
|
9
|
+
IGNORABLE: [],
|
10
|
+
}
|
11
|
+
end
|
12
|
+
|
6
13
|
def parse!
|
7
|
-
parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to
|
14
|
+
parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
|
8
15
|
source = line["from"].to_i(16)
|
9
16
|
if line["to"].include?(" ")
|
10
17
|
replace_with = line["to"].split(" ").map{ |codepoint|
|
18
|
+
cp = codepoint.to_i(16)
|
19
|
+
option =~ /charvalues/ ? [cp].pack("U") : cp
|
20
|
+
}
|
21
|
+
else
|
22
|
+
cp = line["to"].to_i(16)
|
23
|
+
replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
|
24
|
+
end
|
25
|
+
assign :CONFUSABLE, source, replace_with
|
26
|
+
end
|
27
|
+
|
28
|
+
parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
|
29
|
+
if line["codepoints"]['..']
|
30
|
+
single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
|
11
31
|
codepoint.to_i(16)
|
12
32
|
}
|
13
33
|
else
|
14
|
-
|
34
|
+
single_or_multiple_codepoints = line["codepoints"].to_i(16)
|
15
35
|
end
|
16
|
-
|
36
|
+
|
37
|
+
@index[:IGNORABLE] << single_or_multiple_codepoints
|
17
38
|
end
|
18
39
|
end
|
19
40
|
end
|
@@ -27,7 +27,7 @@ module Unicoder
|
|
27
27
|
"EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
|
28
28
|
"KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
|
29
29
|
"NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
|
30
|
-
"CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
|
30
|
+
"CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
|
31
31
|
},
|
32
32
|
# see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
|
33
33
|
JAMO: {
|
@@ -68,7 +68,7 @@ module Unicoder
|
|
68
68
|
elsif line["name"] != "<control>"
|
69
69
|
raise ArgumentError, "inconsistent range found in data, don't know what to do"
|
70
70
|
end
|
71
|
-
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
|
71
|
+
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
|
72
72
|
# ignore
|
73
73
|
else
|
74
74
|
assign :NAMES, line["codepoint"].to_i(16), line["name"]
|
@@ -10,6 +10,12 @@ module Unicoder
|
|
10
10
|
SCRIPT_EXTENSIONS: {},
|
11
11
|
SCRIPT_ALIASES: {},
|
12
12
|
SCRIPT_NAMES: [],
|
13
|
+
OFFSETS: [
|
14
|
+
0x10000,
|
15
|
+
0x1000,
|
16
|
+
0x100,
|
17
|
+
0x10
|
18
|
+
],
|
13
19
|
}
|
14
20
|
@reverse_script_names = {}
|
15
21
|
@reverse_script_extension_names = {}
|
@@ -21,6 +27,17 @@ module Unicoder
|
|
21
27
|
}
|
22
28
|
end
|
23
29
|
|
30
|
+
# TODO refactor how multiple indexes are organized
|
31
|
+
def assign_classic(sub_index_name, codepoint, value)
|
32
|
+
idx = @index[sub_index_name]
|
33
|
+
|
34
|
+
if option =~ /charkeys/
|
35
|
+
idx[[codepoint].pack("U*")] = value
|
36
|
+
else
|
37
|
+
idx[codepoint] = value
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
24
41
|
def parse!
|
25
42
|
parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
|
26
43
|
@index[:SCRIPT_NAMES] << line["long"]
|
@@ -47,10 +64,10 @@ module Unicoder
|
|
47
64
|
parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
|
48
65
|
if line["to"]
|
49
66
|
(line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
|
50
|
-
|
67
|
+
assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
|
51
68
|
}
|
52
69
|
else
|
53
|
-
|
70
|
+
assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
|
54
71
|
end
|
55
72
|
end
|
56
73
|
end
|
@@ -11,7 +11,7 @@ module Unicoder
|
|
11
11
|
def initialize_index
|
12
12
|
@index = {
|
13
13
|
SEQUENCES: {},
|
14
|
-
|
14
|
+
EMOJI_NOT_QUALIFIED: {},
|
15
15
|
}
|
16
16
|
@words = []
|
17
17
|
end
|
@@ -74,8 +74,12 @@ module Unicoder
|
|
74
74
|
name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
|
75
75
|
codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
|
76
76
|
assign_codepoint codepoints, name
|
77
|
+
|
78
|
+
|
79
|
+
# Build all combinations of VS16 present and missing and add to second index
|
77
80
|
if codepoints.include?(0xFE0F)
|
78
|
-
|
81
|
+
sequence = codepoints.pack("U*")
|
82
|
+
|
79
83
|
codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
|
80
84
|
if cur.include? 0xFE0F
|
81
85
|
acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
|
@@ -85,13 +89,13 @@ module Unicoder
|
|
85
89
|
}.
|
86
90
|
select {|sub_codepoints| sub_codepoints != codepoints }.
|
87
91
|
each { |sub_codepoints|
|
88
|
-
|
92
|
+
sub_sequence = sub_codepoints.pack("U*")
|
93
|
+
@index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
|
89
94
|
}
|
90
95
|
end
|
91
96
|
end
|
92
97
|
|
93
98
|
replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
94
|
-
replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
95
99
|
end
|
96
100
|
end
|
97
101
|
end
|
data/lib/unicoder/constants.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Unicoder
|
4
|
-
VERSION = "1.1.0"
|
4
|
+
VERSION = "1.3.0"
|
5
5
|
|
6
6
|
UNICODE_VERSIONS = %w[
|
7
7
|
16.0.0
|
@@ -57,7 +57,7 @@ module Unicoder
|
|
57
57
|
|
58
58
|
IVD_VERSION = "2022-09-13"
|
59
59
|
|
60
|
-
CLDR_VERSION = "
|
60
|
+
CLDR_VERSION = "46"
|
61
61
|
|
62
62
|
UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
|
63
63
|
|
@@ -69,6 +69,7 @@ module Unicoder
|
|
69
69
|
name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
|
70
70
|
confusables: "/security/UNICODE_VERSION/confusables.txt",
|
71
71
|
blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
|
72
|
+
core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
|
72
73
|
scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
|
73
74
|
script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
|
74
75
|
property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
|
@@ -2,8 +2,9 @@ require "json"
|
|
2
2
|
|
3
3
|
module Unicoder
|
4
4
|
module ReplaceCommonWords
|
5
|
-
def replace_common_words!(which_index, words, count = 500,
|
6
|
-
|
5
|
+
def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
|
6
|
+
base = @words.join.chars.max.ord + 1
|
7
|
+
puts "Starting to replace the #{count} most common words (replace base: #{base})"
|
7
8
|
@index[:REPLACE_BASE] = base
|
8
9
|
@index[:COMMON_WORDS] = words.
|
9
10
|
select{_1.size >= min_word_length}.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicoder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rationalist
|
@@ -61,7 +61,6 @@ extensions: []
|
|
61
61
|
extra_rdoc_files: []
|
62
62
|
files:
|
63
63
|
- ".gitignore"
|
64
|
-
- ".travis.yml"
|
65
64
|
- CHANGELOG.md
|
66
65
|
- CODE_OF_CONDUCT.md
|
67
66
|
- Gemfile
|
data/.travis.yml
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
sudo: false
|
2
|
-
language: ruby
|
3
|
-
|
4
|
-
rvm:
|
5
|
-
- 2.7
|
6
|
-
- 2.6
|
7
|
-
- 2.5
|
8
|
-
- 2.4
|
9
|
-
- 2.3
|
10
|
-
- ruby-head
|
11
|
-
- jruby-9.2.9.0
|
12
|
-
- truffleruby
|
13
|
-
|
14
|
-
matrix:
|
15
|
-
allow_failures:
|
16
|
-
- rvm: 2.3
|
17
|
-
- rvm: ruby-head
|
18
|
-
- rvm: jruby-2.9.2.0
|
19
|
-
- rvm: truffleruby
|
20
|
-
# fast_finish: true
|