unicoder 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
4
- data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
3
+ metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
4
+ data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
5
5
  SHA512:
6
- metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
7
- data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
6
+ metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
7
+ data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
data/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.3.0
4
+
5
+ - confusable: Add ignorables
6
+ - confusable: Nest index and make ESM/charkeys version, fix ";"
7
+
8
+ ### 1.2.1
9
+
10
+ - name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
11
+
12
+ ### 1.2.0
13
+
14
+ - Change format for sequence_name's sub-index for unqualified Emoji sequences
15
+
16
+ ### 1.1.2
17
+
18
+ - Update CLDR to v46
19
+
20
+ ### 1.1.1
21
+
22
+ - Fix bug related to unsafe characters
23
+ - Fix squared CJK
24
+ - Small adjustments for scripts and blocks index builders
25
+
3
26
  ### 1.1.0
4
27
 
5
28
  - Improve name index size: Support ranges
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.1.0)
4
+ unicoder (1.3.0)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -39,6 +39,9 @@ Index Name | Module
39
39
  --------------|----
40
40
  name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
41
  numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
+ scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
43
+ blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
44
+ categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
42
45
 
43
46
  ## MIT License
44
47
 
@@ -73,8 +73,18 @@ module Unicoder
73
73
  file = File.read(LOCAL_DATA_DIRECTORY + filename)
74
74
 
75
75
  if parse_mode == :line
76
+ active = !parse_options[:begin]
77
+
76
78
  file.each_line{ |line|
77
- yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
79
+ if !active && parse_options[:begin] && line.match?(parse_options[:begin])
80
+ active = true
81
+ elsif active && parse_options[:end] && line.match?(parse_options[:end])
82
+ active = false
83
+ end
84
+
85
+ if active
86
+ yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
87
+ end
78
88
  }
79
89
  elsif parse_mode == :xml
80
90
  require "oga"
@@ -4,12 +4,14 @@ module Unicoder
4
4
  include Builder
5
5
 
6
6
  def initialize_index
7
- @index = []
7
+ @index = {
8
+ BLOCKS: []
9
+ }
8
10
  end
9
11
 
10
12
  def parse!
11
13
  parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
12
- @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
14
+ @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
13
15
  end
14
16
  end
15
17
  end
@@ -9,6 +9,12 @@ module Unicoder
9
9
  @index = {
10
10
  CATEGORIES: [],
11
11
  CATEGORY_NAMES: {},
12
+ OFFSETS: [
13
+ 0x10000,
14
+ 0x1000,
15
+ 0x100,
16
+ 0x10
17
+ ],
12
18
  }
13
19
  @range_start = nil
14
20
  end
@@ -3,17 +3,38 @@ module Unicoder
3
3
  class Confusable
4
4
  include Builder
5
5
 
6
+ def initialize_index
7
+ @index = {
8
+ CONFUSABLE: {},
9
+ IGNORABLE: [],
10
+ }
11
+ end
12
+
6
13
  def parse!
7
- parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+)\s+;.*$/ do |line|
14
+ parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
8
15
  source = line["from"].to_i(16)
9
16
  if line["to"].include?(" ")
10
17
  replace_with = line["to"].split(" ").map{ |codepoint|
18
+ cp = codepoint.to_i(16)
19
+ option =~ /charvalues/ ? [cp].pack("U") : cp
20
+ }
21
+ else
22
+ cp = line["to"].to_i(16)
23
+ replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
24
+ end
25
+ assign :CONFUSABLE, source, replace_with
26
+ end
27
+
28
+ parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
29
+ if line["codepoints"]['..']
30
+ single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
11
31
  codepoint.to_i(16)
12
32
  }
13
33
  else
14
- replace_with = line["to"].to_i(16)
34
+ single_or_multiple_codepoints = line["codepoints"].to_i(16)
15
35
  end
16
- @index[source] = replace_with
36
+
37
+ @index[:IGNORABLE] << single_or_multiple_codepoints
17
38
  end
18
39
  end
19
40
  end
@@ -27,7 +27,7 @@ module Unicoder
27
27
  "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
28
28
  "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
29
29
  "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
30
- "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
30
+ "CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
31
31
  },
32
32
  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
33
33
  JAMO: {
@@ -68,7 +68,7 @@ module Unicoder
68
68
  elsif line["name"] != "<control>"
69
69
  raise ArgumentError, "inconsistent range found in data, don't know what to do"
70
70
  end
71
- elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
71
+ elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
72
72
  # ignore
73
73
  else
74
74
  assign :NAMES, line["codepoint"].to_i(16), line["name"]
@@ -10,6 +10,12 @@ module Unicoder
10
10
  SCRIPT_EXTENSIONS: {},
11
11
  SCRIPT_ALIASES: {},
12
12
  SCRIPT_NAMES: [],
13
+ OFFSETS: [
14
+ 0x10000,
15
+ 0x1000,
16
+ 0x100,
17
+ 0x10
18
+ ],
13
19
  }
14
20
  @reverse_script_names = {}
15
21
  @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
21
27
  }
22
28
  end
23
29
 
30
+ # TODO refactor how multiple indexes are organized
31
+ def assign_classic(sub_index_name, codepoint, value)
32
+ idx = @index[sub_index_name]
33
+
34
+ if option =~ /charkeys/
35
+ idx[[codepoint].pack("U*")] = value
36
+ else
37
+ idx[codepoint] = value
38
+ end
39
+ end
40
+
24
41
  def parse!
25
42
  parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
26
43
  @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
47
64
  parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
48
65
  if line["to"]
49
66
  (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
50
- @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
67
+ assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
51
68
  }
52
69
  else
53
- @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
70
+ assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
54
71
  end
55
72
  end
56
73
  end
@@ -11,7 +11,7 @@ module Unicoder
11
11
  def initialize_index
12
12
  @index = {
13
13
  SEQUENCES: {},
14
- SEQUENCES_NOT_QUALIFIED: {},
14
+ EMOJI_NOT_QUALIFIED: {},
15
15
  }
16
16
  @words = []
17
17
  end
@@ -74,8 +74,12 @@ module Unicoder
74
74
  name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
75
75
  codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
76
76
  assign_codepoint codepoints, name
77
+
78
+
79
+ # Build all combinations of VS16 present and missing and add to second index
77
80
  if codepoints.include?(0xFE0F)
78
- # Build all combinations of VS16 present and missing
81
+ sequence = codepoints.pack("U*")
82
+
79
83
  codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
80
84
  if cur.include? 0xFE0F
81
85
  acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
@@ -85,13 +89,13 @@ module Unicoder
85
89
  }.
86
90
  select {|sub_codepoints| sub_codepoints != codepoints }.
87
91
  each { |sub_codepoints|
88
- assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
92
+ sub_sequence = sub_codepoints.pack("U*")
93
+ @index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
89
94
  }
90
95
  end
91
96
  end
92
97
 
93
98
  replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
94
- replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
95
99
  end
96
100
  end
97
101
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.1.0"
4
+ VERSION = "1.3.0"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -57,7 +57,7 @@ module Unicoder
57
57
 
58
58
  IVD_VERSION = "2022-09-13"
59
59
 
60
- CLDR_VERSION = "45"
60
+ CLDR_VERSION = "46"
61
61
 
62
62
  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
63
63
 
@@ -69,6 +69,7 @@ module Unicoder
69
69
  name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
70
70
  confusables: "/security/UNICODE_VERSION/confusables.txt",
71
71
  blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
72
+ core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
72
73
  scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
73
74
  script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
74
75
  property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
@@ -2,8 +2,9 @@ require "json"
2
2
 
3
3
  module Unicoder
4
4
  module ReplaceCommonWords
5
- def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
6
- puts "Starting to replace the #{count} most common words"
5
+ def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
6
+ base = @words.join.chars.max.ord + 1
7
+ puts "Starting to replace the #{count} most common words (replace base: #{base})"
7
8
  @index[:REPLACE_BASE] = base
8
9
  @index[:COMMON_WORDS] = words.
9
10
  select{_1.size >= min_word_length}.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-09 00:00:00.000000000 Z
11
+ date: 2024-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist
@@ -61,7 +61,6 @@ extensions: []
61
61
  extra_rdoc_files: []
62
62
  files:
63
63
  - ".gitignore"
64
- - ".travis.yml"
65
64
  - CHANGELOG.md
66
65
  - CODE_OF_CONDUCT.md
67
66
  - Gemfile
data/.travis.yml DELETED
@@ -1,20 +0,0 @@
1
- sudo: false
2
- language: ruby
3
-
4
- rvm:
5
- - 2.7
6
- - 2.6
7
- - 2.5
8
- - 2.4
9
- - 2.3
10
- - ruby-head
11
- - jruby-9.2.9.0
12
- - truffleruby
13
-
14
- matrix:
15
- allow_failures:
16
- - rvm: 2.3
17
- - rvm: ruby-head
18
- - rvm: jruby-2.9.2.0
19
- - rvm: truffleruby
20
- # fast_finish: true