unicoder 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Gemfile.lock +1 -1
- data/README.md +1 -0
- data/lib/unicoder/builders/blocks.rb +4 -2
- data/lib/unicoder/builders/name.rb +1 -1
- data/lib/unicoder/builders/scripts.rb +19 -2
- data/lib/unicoder/constants.rb +1 -1
- data/lib/unicoder/replace_common_words.rb +3 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
|
4
|
+
data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
|
7
|
+
data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -39,6 +39,7 @@ Index Name | Module
|
|
39
39
|
--------------|----
|
40
40
|
name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
|
41
41
|
numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
|
42
|
+
scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
|
42
43
|
|
43
44
|
## MIT License
|
44
45
|
|
data/lib/unicoder/builders/blocks.rb
CHANGED
@@ -4,12 +4,14 @@ module Unicoder
|
|
4
4
|
include Builder
|
5
5
|
|
6
6
|
def initialize_index
|
7
|
-
@index = []
|
7
|
+
@index = {
|
8
|
+
BLOCKS: []
|
9
|
+
}
|
8
10
|
end
|
9
11
|
|
10
12
|
def parse!
|
11
13
|
parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
|
12
|
-
@index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
14
|
+
@index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
data/lib/unicoder/builders/name.rb
CHANGED
@@ -68,7 +68,7 @@ module Unicoder
|
|
68
68
|
elsif line["name"] != "<control>"
|
69
69
|
raise ArgumentError, "inconsistent range found in data, don't know what to do"
|
70
70
|
end
|
71
|
-
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
|
71
|
+
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
|
72
72
|
# ignore
|
73
73
|
else
|
74
74
|
assign :NAMES, line["codepoint"].to_i(16), line["name"]
|
data/lib/unicoder/builders/scripts.rb
CHANGED
@@ -10,6 +10,12 @@ module Unicoder
|
|
10
10
|
SCRIPT_EXTENSIONS: {},
|
11
11
|
SCRIPT_ALIASES: {},
|
12
12
|
SCRIPT_NAMES: [],
|
13
|
+
OFFSETS: [
|
14
|
+
0x10000,
|
15
|
+
0x1000,
|
16
|
+
0x100,
|
17
|
+
0x10
|
18
|
+
],
|
13
19
|
}
|
14
20
|
@reverse_script_names = {}
|
15
21
|
@reverse_script_extension_names = {}
|
@@ -21,6 +27,17 @@ module Unicoder
|
|
21
27
|
}
|
22
28
|
end
|
23
29
|
|
30
|
+
# TODO refactor how multiple indexes are organized
|
31
|
+
def assign_classic(sub_index_name, codepoint, value)
|
32
|
+
idx = @index[sub_index_name]
|
33
|
+
|
34
|
+
if option =~ /charkeys/
|
35
|
+
idx[[codepoint].pack("U*")] = value
|
36
|
+
else
|
37
|
+
idx[codepoint] = value
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
24
41
|
def parse!
|
25
42
|
parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
|
26
43
|
@index[:SCRIPT_NAMES] << line["long"]
|
@@ -47,10 +64,10 @@ module Unicoder
|
|
47
64
|
parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
|
48
65
|
if line["to"]
|
49
66
|
(line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
|
50
|
-
|
67
|
+
assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
|
51
68
|
}
|
52
69
|
else
|
53
|
-
|
70
|
+
assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
|
54
71
|
end
|
55
72
|
end
|
56
73
|
end
|
data/lib/unicoder/constants.rb
CHANGED
data/lib/unicoder/replace_common_words.rb
CHANGED
@@ -2,8 +2,9 @@ require "json"
|
|
2
2
|
|
3
3
|
module Unicoder
|
4
4
|
module ReplaceCommonWords
|
5
|
-
def replace_common_words!(which_index, words, count = 500,
|
6
|
-
|
5
|
+
def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
|
6
|
+
base = @words.join.chars.max.ord + 1
|
7
|
+
puts "Starting to replace the #{count} most common words (replace base: #{base})"
|
7
8
|
@index[:REPLACE_BASE] = base
|
8
9
|
@index[:COMMON_WORDS] = words.
|
9
10
|
select{_1.size >= min_word_length}.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicoder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rationalist
|