unicoder 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
4
- data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
3
+ metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
4
+ data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
5
5
  SHA512:
6
- metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
7
- data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
6
+ metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
7
+ data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.1.1
4
+
5
+ - Fix bug related to unsafe characters
6
+ - Fix squared CJK
7
+ - Small adjustments for scripts and blocks index builders
8
+
3
9
  ### 1.1.0
4
10
 
5
11
  - Improve name index size: Support ranges
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.1.0)
4
+ unicoder (1.1.1)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -39,6 +39,7 @@ Index Name | Module
39
39
  --------------|----
40
40
  name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
41
  numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
+ scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
42
43
 
43
44
  ## MIT License
44
45
 
@@ -4,12 +4,14 @@ module Unicoder
4
4
  include Builder
5
5
 
6
6
  def initialize_index
7
- @index = []
7
+ @index = {
8
+ BLOCKS: []
9
+ }
8
10
  end
9
11
 
10
12
  def parse!
11
13
  parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
12
- @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
14
+ @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
13
15
  end
14
16
  end
15
17
  end
@@ -68,7 +68,7 @@ module Unicoder
68
68
  elsif line["name"] != "<control>"
69
69
  raise ArgumentError, "inconsistent range found in data, don't know what to do"
70
70
  end
71
- elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
71
+ elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
72
72
  # ignore
73
73
  else
74
74
  assign :NAMES, line["codepoint"].to_i(16), line["name"]
@@ -10,6 +10,12 @@ module Unicoder
10
10
  SCRIPT_EXTENSIONS: {},
11
11
  SCRIPT_ALIASES: {},
12
12
  SCRIPT_NAMES: [],
13
+ OFFSETS: [
14
+ 0x10000,
15
+ 0x1000,
16
+ 0x100,
17
+ 0x10
18
+ ],
13
19
  }
14
20
  @reverse_script_names = {}
15
21
  @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
21
27
  }
22
28
  end
23
29
 
30
+ # TODO refactor how multiple indexes are organized
31
+ def assign_classic(sub_index_name, codepoint, value)
32
+ idx = @index[sub_index_name]
33
+
34
+ if option =~ /charkeys/
35
+ idx[[codepoint].pack("U*")] = value
36
+ else
37
+ idx[codepoint] = value
38
+ end
39
+ end
40
+
24
41
  def parse!
25
42
  parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
26
43
  @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
47
64
  parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
48
65
  if line["to"]
49
66
  (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
50
- @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
67
+ assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
51
68
  }
52
69
  else
53
- @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
70
+ assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
54
71
  end
55
72
  end
56
73
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.1.0"
4
+ VERSION = "1.1.1"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -2,8 +2,9 @@ require "json"
2
2
 
3
3
  module Unicoder
4
4
  module ReplaceCommonWords
5
- def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
6
- puts "Starting to replace the #{count} most common words"
5
+ def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
6
+ base = @words.join.chars.max.ord + 1
7
+ puts "Starting to replace the #{count} most common words (replace base: #{base})"
7
8
  @index[:REPLACE_BASE] = base
8
9
  @index[:COMMON_WORDS] = words.
9
10
  select{_1.size >= min_word_length}.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-09 00:00:00.000000000 Z
11
+ date: 2024-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist