unicoder 1.1.0 → 1.1.1

Sign up for free protection for your applications and access to all features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
4
- data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
3
+ metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
4
+ data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
5
5
  SHA512:
6
- metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
7
- data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
6
+ metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
7
+ data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.1.1
4
+
5
+ - Fix bug related to unsafe characters
6
+ - Fix squared CJK
7
+ - Small adjustments for scripts and blocks index builders
8
+
3
9
  ### 1.1.0
4
10
 
5
11
  - Improve name index size: Support ranges
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.1.0)
4
+ unicoder (1.1.1)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -39,6 +39,7 @@ Index Name | Module
39
39
  --------------|----
40
40
  name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
41
  numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
+ scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
42
43
 
43
44
  ## MIT License
44
45
 
@@ -4,12 +4,14 @@ module Unicoder
4
4
  include Builder
5
5
 
6
6
  def initialize_index
7
- @index = []
7
+ @index = {
8
+ BLOCKS: []
9
+ }
8
10
  end
9
11
 
10
12
  def parse!
11
13
  parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
12
- @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
14
+ @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
13
15
  end
14
16
  end
15
17
  end
@@ -68,7 +68,7 @@ module Unicoder
68
68
  elsif line["name"] != "<control>"
69
69
  raise ArgumentError, "inconsistent range found in data, don't know what to do"
70
70
  end
71
- elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
71
+ elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
72
72
  # ignore
73
73
  else
74
74
  assign :NAMES, line["codepoint"].to_i(16), line["name"]
@@ -10,6 +10,12 @@ module Unicoder
10
10
  SCRIPT_EXTENSIONS: {},
11
11
  SCRIPT_ALIASES: {},
12
12
  SCRIPT_NAMES: [],
13
+ OFFSETS: [
14
+ 0x10000,
15
+ 0x1000,
16
+ 0x100,
17
+ 0x10
18
+ ],
13
19
  }
14
20
  @reverse_script_names = {}
15
21
  @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
21
27
  }
22
28
  end
23
29
 
30
+ # TODO refactor how multiple indexes are organized
31
+ def assign_classic(sub_index_name, codepoint, value)
32
+ idx = @index[sub_index_name]
33
+
34
+ if option =~ /charkeys/
35
+ idx[[codepoint].pack("U*")] = value
36
+ else
37
+ idx[codepoint] = value
38
+ end
39
+ end
40
+
24
41
  def parse!
25
42
  parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
26
43
  @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
47
64
  parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
48
65
  if line["to"]
49
66
  (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
50
- @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
67
+ assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
51
68
  }
52
69
  else
53
- @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
70
+ assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
54
71
  end
55
72
  end
56
73
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.1.0"
4
+ VERSION = "1.1.1"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -2,8 +2,9 @@ require "json"
2
2
 
3
3
  module Unicoder
4
4
  module ReplaceCommonWords
5
- def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
6
- puts "Starting to replace the #{count} most common words"
5
+ def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
6
+ base = @words.join.chars.max.ord + 1
7
+ puts "Starting to replace the #{count} most common words (replace base: #{base})"
7
8
  @index[:REPLACE_BASE] = base
8
9
  @index[:COMMON_WORDS] = words.
9
10
  select{_1.size >= min_word_length}.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-09 00:00:00.000000000 Z
11
+ date: 2024-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist