unicoder 1.1.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
4
- data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
3
+ metadata.gz: 9064718b14baf32790bb6f9a705aab4d07231771f1c663546dea90413b0b68d9
4
+ data.tar.gz: 40f6145a58220620accb91bda68b06c2e2ceeea12b3cee4dc6a1c021e8fc9194
5
5
  SHA512:
6
- metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
7
- data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
6
+ metadata.gz: f69ddbfcef5269be204fed89d00018312a623330c28382531922fb5fe0093c9dfee7aa7e7bc5bff069ba1e57341fbb62e915d34bb27f37cec10283b7e8bbc678
7
+ data.tar.gz: f4499a5ce299023e3752ed69df04e148d9937ac355ae7e415525f46c402761992540de6ae5fc8d1b22ab4a2dc154220d722ad940d4878fca3c9e3f6a306d00a1
data/CHANGELOG.md CHANGED
@@ -1,5 +1,28 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.3.0
4
+
5
+ - confusable: Add ignorables
6
+ - confusable: Nest index and make ESM/charkeys version, fix ";"
7
+
8
+ ### 1.2.1
9
+
10
+ - name: Fix some CJK Compatibility Ideographs not declared in CP_RANGES
11
+
12
+ ### 1.2.0
13
+
14
+ - Change format for sequence_name's sub-index for unqualified Emoji sequences
15
+
16
+ ### 1.1.2
17
+
18
+ - Update CLDR to v46
19
+
20
+ ### 1.1.1
21
+
22
+ - Fix bug related to unsafe characters
23
+ - Fix squared CJK
24
+ - Small adjustments for scripts and blocks index builders
25
+
3
26
  ### 1.1.0
4
27
 
5
28
  - Improve name index size: Support ranges
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.1.0)
4
+ unicoder (1.3.0)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -39,6 +39,9 @@ Index Name | Module
39
39
  --------------|----
40
40
  name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
41
  numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
+ scripts | [unicode-script.js](https://github.com/janlelis/unicode-script.js)
43
+ blocks | [unicode-block.js](https://github.com/janlelis/unicode-block.js)
44
+ categories | [unicode-category.js](https://github.com/janlelis/unicode-category.js)
42
45
 
43
46
  ## MIT License
44
47
 
@@ -73,8 +73,18 @@ module Unicoder
73
73
  file = File.read(LOCAL_DATA_DIRECTORY + filename)
74
74
 
75
75
  if parse_mode == :line
76
+ active = !parse_options[:begin]
77
+
76
78
  file.each_line{ |line|
77
- yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
79
+ if !active && parse_options[:begin] && line.match?(parse_options[:begin])
80
+ active = true
81
+ elsif active && parse_options[:end] && line.match?(parse_options[:end])
82
+ active = false
83
+ end
84
+
85
+ if active
86
+ yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
87
+ end
78
88
  }
79
89
  elsif parse_mode == :xml
80
90
  require "oga"
@@ -4,12 +4,14 @@ module Unicoder
4
4
  include Builder
5
5
 
6
6
  def initialize_index
7
- @index = []
7
+ @index = {
8
+ BLOCKS: []
9
+ }
8
10
  end
9
11
 
10
12
  def parse!
11
13
  parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
12
- @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
14
+ @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
13
15
  end
14
16
  end
15
17
  end
@@ -9,6 +9,12 @@ module Unicoder
9
9
  @index = {
10
10
  CATEGORIES: [],
11
11
  CATEGORY_NAMES: {},
12
+ OFFSETS: [
13
+ 0x10000,
14
+ 0x1000,
15
+ 0x100,
16
+ 0x10
17
+ ],
12
18
  }
13
19
  @range_start = nil
14
20
  end
@@ -3,17 +3,38 @@ module Unicoder
3
3
  class Confusable
4
4
  include Builder
5
5
 
6
+ def initialize_index
7
+ @index = {
8
+ CONFUSABLE: {},
9
+ IGNORABLE: [],
10
+ }
11
+ end
12
+
6
13
  def parse!
7
- parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+)\s+;.*$/ do |line|
14
+ parse_file :confusables, :line, regex: /^(?<from>\S+)\s+;\s+(?<to>.+?)\s+;.*$/ do |line|
8
15
  source = line["from"].to_i(16)
9
16
  if line["to"].include?(" ")
10
17
  replace_with = line["to"].split(" ").map{ |codepoint|
18
+ cp = codepoint.to_i(16)
19
+ option =~ /charvalues/ ? [cp].pack("U") : cp
20
+ }
21
+ else
22
+ cp = line["to"].to_i(16)
23
+ replace_with = option =~ /charvalues/ ? [cp].pack("U") : cp
24
+ end
25
+ assign :CONFUSABLE, source, replace_with
26
+ end
27
+
28
+ parse_file :core_properties, :line, begin: /^# Derived Property: Default_Ignorable_Code_Point$/, end: /^# ================================================$/, regex: /^(?<codepoints>\S+)\s+; Default_Ignorable_Code_Point.*$/ do |line|
29
+ if line["codepoints"]['..']
30
+ single_or_multiple_codepoints = line["codepoints"].split('..').map{ |codepoint|
11
31
  codepoint.to_i(16)
12
32
  }
13
33
  else
14
- replace_with = line["to"].to_i(16)
34
+ single_or_multiple_codepoints = line["codepoints"].to_i(16)
15
35
  end
16
- @index[source] = replace_with
36
+
37
+ @index[:IGNORABLE] << single_or_multiple_codepoints
17
38
  end
18
39
  end
19
40
  end
@@ -27,7 +27,7 @@ module Unicoder
27
27
  "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
28
28
  "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
29
29
  "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
30
- "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
30
+ "CJK COMPATIBILITY IDEOGRAPH-" => [[0xF900, 0xFAFF], [0x2F800, 0x2FA1D]],
31
31
  },
32
32
  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
33
33
  JAMO: {
@@ -68,7 +68,7 @@ module Unicoder
68
68
  elsif line["name"] != "<control>"
69
69
  raise ArgumentError, "inconsistent range found in data, don't know what to do"
70
70
  end
71
- elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
71
+ elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
72
72
  # ignore
73
73
  else
74
74
  assign :NAMES, line["codepoint"].to_i(16), line["name"]
@@ -10,6 +10,12 @@ module Unicoder
10
10
  SCRIPT_EXTENSIONS: {},
11
11
  SCRIPT_ALIASES: {},
12
12
  SCRIPT_NAMES: [],
13
+ OFFSETS: [
14
+ 0x10000,
15
+ 0x1000,
16
+ 0x100,
17
+ 0x10
18
+ ],
13
19
  }
14
20
  @reverse_script_names = {}
15
21
  @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
21
27
  }
22
28
  end
23
29
 
30
+ # TODO refactor how multiple indexes are organized
31
+ def assign_classic(sub_index_name, codepoint, value)
32
+ idx = @index[sub_index_name]
33
+
34
+ if option =~ /charkeys/
35
+ idx[[codepoint].pack("U*")] = value
36
+ else
37
+ idx[codepoint] = value
38
+ end
39
+ end
40
+
24
41
  def parse!
25
42
  parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
26
43
  @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
47
64
  parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
48
65
  if line["to"]
49
66
  (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
50
- @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
67
+ assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
51
68
  }
52
69
  else
53
- @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
70
+ assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
54
71
  end
55
72
  end
56
73
  end
@@ -11,7 +11,7 @@ module Unicoder
11
11
  def initialize_index
12
12
  @index = {
13
13
  SEQUENCES: {},
14
- SEQUENCES_NOT_QUALIFIED: {},
14
+ EMOJI_NOT_QUALIFIED: {},
15
15
  }
16
16
  @words = []
17
17
  end
@@ -74,8 +74,12 @@ module Unicoder
74
74
  name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
75
75
  codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
76
76
  assign_codepoint codepoints, name
77
+
78
+
79
+ # Build all combinations of VS16 present and missing and add to second index
77
80
  if codepoints.include?(0xFE0F)
78
- # Build all combinations of VS16 present and missing
81
+ sequence = codepoints.pack("U*")
82
+
79
83
  codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
80
84
  if cur.include? 0xFE0F
81
85
  acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
@@ -85,13 +89,13 @@ module Unicoder
85
89
  }.
86
90
  select {|sub_codepoints| sub_codepoints != codepoints }.
87
91
  each { |sub_codepoints|
88
- assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
92
+ sub_sequence = sub_codepoints.pack("U*")
93
+ @index[:EMOJI_NOT_QUALIFIED][sub_sequence] = sequence
89
94
  }
90
95
  end
91
96
  end
92
97
 
93
98
  replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
94
- replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
95
99
  end
96
100
  end
97
101
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.1.0"
4
+ VERSION = "1.3.0"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -57,7 +57,7 @@ module Unicoder
57
57
 
58
58
  IVD_VERSION = "2022-09-13"
59
59
 
60
- CLDR_VERSION = "45"
60
+ CLDR_VERSION = "46"
61
61
 
62
62
  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
63
63
 
@@ -69,6 +69,7 @@ module Unicoder
69
69
  name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
70
70
  confusables: "/security/UNICODE_VERSION/confusables.txt",
71
71
  blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
72
+ core_properties: "/UNICODE_VERSION/ucd/DerivedCoreProperties.txt",
72
73
  scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
73
74
  script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
74
75
  property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
@@ -2,8 +2,9 @@ require "json"
2
2
 
3
3
  module Unicoder
4
4
  module ReplaceCommonWords
5
- def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
6
- puts "Starting to replace the #{count} most common words"
5
+ def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
6
+ base = @words.join.chars.max.ord + 1
7
+ puts "Starting to replace the #{count} most common words (replace base: #{base})"
7
8
  @index[:REPLACE_BASE] = base
8
9
  @index[:COMMON_WORDS] = words.
9
10
  select{_1.size >= min_word_length}.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-09 00:00:00.000000000 Z
11
+ date: 2024-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist
@@ -61,7 +61,6 @@ extensions: []
61
61
  extra_rdoc_files: []
62
62
  files:
63
63
  - ".gitignore"
64
- - ".travis.yml"
65
64
  - CHANGELOG.md
66
65
  - CODE_OF_CONDUCT.md
67
66
  - Gemfile
data/.travis.yml DELETED
@@ -1,20 +0,0 @@
1
- sudo: false
2
- language: ruby
3
-
4
- rvm:
5
- - 2.7
6
- - 2.6
7
- - 2.5
8
- - 2.4
9
- - 2.3
10
- - ruby-head
11
- - jruby-9.2.9.0
12
- - truffleruby
13
-
14
- matrix:
15
- allow_failures:
16
- - rvm: 2.3
17
- - rvm: ruby-head
18
- - rvm: jruby-2.9.2.0
19
- - rvm: truffleruby
20
- # fast_finish: true