unicoder 1.1.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Gemfile.lock +1 -1
- data/README.md +3 -0
- data/lib/unicoder/builder.rb +11 -1
- data/lib/unicoder/builders/blocks.rb +4 -2
- data/lib/unicoder/builders/categories.rb +6 -0
- data/lib/unicoder/builders/confusable.rb +24 -3
- data/lib/unicoder/builders/name.rb +2 -2
- data/lib/unicoder/builders/scripts.rb +19 -2
- data/lib/unicoder/builders/sequence_name.rb +8 -4
- data/lib/unicoder/constants.rb +3 -2
- data/lib/unicoder/replace_common_words.rb +3 -2
- metadata +2 -3
- data/.travis.yml +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
|
4
|
+
data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
|
7
|
+
data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,28 @@
|
|
1
1
|
## CHANGELOG
|
2
2
|
|
3
|
+
### 1.3.0
|
4
|
+
|
5
|
+
- confusable: Add ignorables
|
6
|
+
- confusable: Nest index and make ESM/charkeys version, fix ";"
|
7
|
+
|
8
|
+
### 1.2.1
|
9
|
+
|
10
|
+
- name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
|
11
|
+
|
12
|
+
### 1.2.0
|
13
|
+
|
14
|
+
- Change format for sequence_name's sub-index for unqualified Emoji sequences
|
15
|
+
|
16
|
+
### 1.1.2
|
17
|
+
|
18
|
+
- Update CLDR to v46
|
19
|
+
|
20
|
+
### 1.1.1
|
21
|
+
|
22
|
+
- Fix bug related to unsafe characters
|
23
|
+
- Fix squared CJK
|
24
|
+
- Small adjustments for scripts and blocks index builders
|
25
|
+
|
3
26
|
### 1.1.0
|
4
27
|
|
5
28
|
- Improve name index size: Support ranges
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -39,6 +39,9 @@ Index Name | Module
|
|
39
39
|
--------------|----
|
40
40
|
name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
|
41
41
|
numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
|
42
|
+
scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
|
43
|
+
blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
|
44
|
+
categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
|
42
45
|
|
43
46
|
## MIT License
|
44
47
|
|
data/lib/unicoder/builder.rb
CHANGED
@@ -73,8 +73,18 @@ module Unicoder
|
|
73
73
|
file = File.read(LOCAL_DATA_DIRECTORY + filename)
|
74
74
|
|
75
75
|
if parse_mode == :line
|
76
|
+
active = !parse_options[:begin]
|
77
|
+
|
76
78
|
file.each_line{ |line|
|
77
|
-
|
79
|
+
if !active && parse_options[:begin] && line.match?(parse_options[:begin])
|
80
|
+
active = true
|
81
|
+
elsif active && parse_options[:end] && line.match?(parse_options[:end])
|
82
|
+
active = false
|
83
|
+
end
|
84
|
+
|
85
|
+
if active
|
86
|
+
yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
|
87
|
+
end
|
78
88
|
}
|
79
89
|
elsif parse_mode == :xml
|
80
90
|
require "oga"
|
@@ -4,12 +4,14 @@ module Unicoder
|
|
4
4
|
include Builder
|
5
5
|
|
6
6
|
def initialize_index
|
7
|
-
@index = []
|
7
|
+
@index = {
|
8
|
+
BLOCKS: []
|
9
|
+
}
|
8
10
|
end
|
9
11
|
|
10
12
|
def parse!
|
11
13
|
parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
|
12
|
-
@index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
14
|
+
@index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
@@ -3,17 +3,38 @@ module Unicoder
|
|
3
3
|
class Confusable
|
4
4
|
include Builder
|
5
5
|
|
6
|
+
def initialize_index
|
7
|
+
@index = {
|
8
|
+
CONFUSABLE: {},
|
9
|
+
IGNORABLE: [],
|
10
|
+
}
|
11
|
+
end
|
12
|
+
|
6
13
|
def parse!
|
7
|
-
parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to
|
14
|
+
parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
|
8
15
|
source = line["from"].to_i(16)
|
9
16
|
if line["to"].include?(" ")
|
10
17
|
replace_with = line["to"].split(" ").map{ |codepoint|
|
18
|
+
cp = codepoint.to_i(16)
|
19
|
+
option =~ /charvalues/ ? [cp].pack("U") : cp
|
20
|
+
}
|
21
|
+
else
|
22
|
+
cp = line["to"].to_i(16)
|
23
|
+
replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
|
24
|
+
end
|
25
|
+
assign :CONFUSABLE, source, replace_with
|
26
|
+
end
|
27
|
+
|
28
|
+
parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
|
29
|
+
if line["codepoints"]['..']
|
30
|
+
single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
|
11
31
|
codepoint.to_i(16)
|
12
32
|
}
|
13
33
|
else
|
14
|
-
|
34
|
+
single_or_multiple_codepoints = line["codepoints"].to_i(16)
|
15
35
|
end
|
16
|
-
|
36
|
+
|
37
|
+
@index[:IGNORABLE] << single_or_multiple_codepoints
|
17
38
|
end
|
18
39
|
end
|
19
40
|
end
|
@@ -27,7 +27,7 @@ module Unicoder
|
|
27
27
|
"EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
|
28
28
|
"KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
|
29
29
|
"NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
|
30
|
-
"CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
|
30
|
+
"CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
|
31
31
|
},
|
32
32
|
# see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
|
33
33
|
JAMO: {
|
@@ -68,7 +68,7 @@ module Unicoder
|
|
68
68
|
elsif line["name"] != "<control>"
|
69
69
|
raise ArgumentError, "inconsistent range found in data, don't know what to do"
|
70
70
|
end
|
71
|
-
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
|
71
|
+
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
|
72
72
|
# ignore
|
73
73
|
else
|
74
74
|
assign :NAMES, line["codepoint"].to_i(16), line["name"]
|
@@ -10,6 +10,12 @@ module Unicoder
|
|
10
10
|
SCRIPT_EXTENSIONS: {},
|
11
11
|
SCRIPT_ALIASES: {},
|
12
12
|
SCRIPT_NAMES: [],
|
13
|
+
OFFSETS: [
|
14
|
+
0x10000,
|
15
|
+
0x1000,
|
16
|
+
0x100,
|
17
|
+
0x10
|
18
|
+
],
|
13
19
|
}
|
14
20
|
@reverse_script_names = {}
|
15
21
|
@reverse_script_extension_names = {}
|
@@ -21,6 +27,17 @@ module Unicoder
|
|
21
27
|
}
|
22
28
|
end
|
23
29
|
|
30
|
+
# TODO refactor how multiple indexes are organized
|
31
|
+
def assign_classic(sub_index_name, codepoint, value)
|
32
|
+
idx = @index[sub_index_name]
|
33
|
+
|
34
|
+
if option =~ /charkeys/
|
35
|
+
idx[[codepoint].pack("U*")] = value
|
36
|
+
else
|
37
|
+
idx[codepoint] = value
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
24
41
|
def parse!
|
25
42
|
parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
|
26
43
|
@index[:SCRIPT_NAMES] << line["long"]
|
@@ -47,10 +64,10 @@ module Unicoder
|
|
47
64
|
parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
|
48
65
|
if line["to"]
|
49
66
|
(line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
|
50
|
-
|
67
|
+
assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
|
51
68
|
}
|
52
69
|
else
|
53
|
-
|
70
|
+
assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
|
54
71
|
end
|
55
72
|
end
|
56
73
|
end
|
@@ -11,7 +11,7 @@ module Unicoder
|
|
11
11
|
def initialize_index
|
12
12
|
@index = {
|
13
13
|
SEQUENCES: {},
|
14
|
-
|
14
|
+
EMOJI_NOT_QUALIFIED: {},
|
15
15
|
}
|
16
16
|
@words = []
|
17
17
|
end
|
@@ -74,8 +74,12 @@ module Unicoder
|
|
74
74
|
name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
|
75
75
|
codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
|
76
76
|
assign_codepoint codepoints, name
|
77
|
+
|
78
|
+
|
79
|
+
# Build all combinations of VS16 present and missing and add to second index
|
77
80
|
if codepoints.include?(0xFE0F)
|
78
|
-
|
81
|
+
sequence = codepoints.pack("U*")
|
82
|
+
|
79
83
|
codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
|
80
84
|
if cur.include? 0xFE0F
|
81
85
|
acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
|
@@ -85,13 +89,13 @@ module Unicoder
|
|
85
89
|
}.
|
86
90
|
select {|sub_codepoints| sub_codepoints != codepoints }.
|
87
91
|
each { |sub_codepoints|
|
88
|
-
|
92
|
+
sub_sequence = sub_codepoints.pack("U*")
|
93
|
+
@index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
|
89
94
|
}
|
90
95
|
end
|
91
96
|
end
|
92
97
|
|
93
98
|
replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
94
|
-
replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
95
99
|
end
|
96
100
|
end
|
97
101
|
end
|
data/lib/unicoder/constants.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Unicoder
|
4
|
-
VERSION = "1.1.0"
|
4
|
+
VERSION = "1.3.0"
|
5
5
|
|
6
6
|
UNICODE_VERSIONS = %w[
|
7
7
|
16.0.0
|
@@ -57,7 +57,7 @@ module Unicoder
|
|
57
57
|
|
58
58
|
IVD_VERSION = "2022-09-13"
|
59
59
|
|
60
|
-
CLDR_VERSION = "
|
60
|
+
CLDR_VERSION = "46"
|
61
61
|
|
62
62
|
UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
|
63
63
|
|
@@ -69,6 +69,7 @@ module Unicoder
|
|
69
69
|
name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
|
70
70
|
confusables: "/security/UNICODE_VERSION/confusables.txt",
|
71
71
|
blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
|
72
|
+
core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
|
72
73
|
scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
|
73
74
|
script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
|
74
75
|
property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
|
@@ -2,8 +2,9 @@ require "json"
|
|
2
2
|
|
3
3
|
module Unicoder
|
4
4
|
module ReplaceCommonWords
|
5
|
-
def replace_common_words!(which_index, words, count = 500,
|
6
|
-
|
5
|
+
def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
|
6
|
+
base = @words.join.chars.max.ord + 1
|
7
|
+
puts "Starting to replace the #{count} most common words (replace base: #{base})"
|
7
8
|
@index[:REPLACE_BASE] = base
|
8
9
|
@index[:COMMON_WORDS] = words.
|
9
10
|
select{_1.size >= min_word_length}.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicoder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rationalist
|
@@ -61,7 +61,6 @@ extensions: []
|
|
61
61
|
extra_rdoc_files: []
|
62
62
|
files:
|
63
63
|
- ".gitignore"
|
64
|
-
- ".travis.yml"
|
65
64
|
- CHANGELOG.md
|
66
65
|
- CODE_OF_CONDUCT.md
|
67
66
|
- Gemfile
|
data/.travis.yml
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
sudo: false
|
2
|
-
language: ruby
|
3
|
-
|
4
|
-
rvm:
|
5
|
-
- 2.7
|
6
|
-
- 2.6
|
7
|
-
- 2.5
|
8
|
-
- 2.4
|
9
|
-
- 2.3
|
10
|
-
- ruby-head
|
11
|
-
- jruby-9.2.9.0
|
12
|
-
- truffleruby
|
13
|
-
|
14
|
-
matrix:
|
15
|
-
allow_failures:
|
16
|
-
- rvm: 2.3
|
17
|
-
- rvm: ruby-head
|
18
|
-
- rvm: jruby-2.9.2.0
|
19
|
-
- rvm: truffleruby
|
20
|
-
# fast_finish: true
|