unicode_script_detector 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4795cdb246ac34ccb0ee5183ff0e704d25e4e67410acee321f36f4446dd28356
4
- data.tar.gz: 0c7b9c4c835718f2fc7509225204e38c0a7148949c5745f1448b299c27e0e88d
3
+ metadata.gz: 58ac1a16218a1a336bb9c2fa6c75c5519a69a6c846c1007ffd236d991f3ef223
4
+ data.tar.gz: 11c82345a0b3990737b1c1a85c97d6c7552644a10e0c1d9265a4a46e4a0a9ce3
5
5
  SHA512:
6
- metadata.gz: 5da7422c57295f4ac3dee3ac9ccfaa99b5586418de956a88876035541da023e9fa4afe609a4aa79d4c3a1a5f9b1ffe64370984657844c06fc6a575578beb5ee2
7
- data.tar.gz: aa9fecf48386b6eb5a0074cbbec8819af80153c6111e042debd9e9c312145bc11a936b3003a406370a28e8a27a48b2bd0409c0cf043b26fe465f4c58ee9669e2
6
+ metadata.gz: baad85a939325ce58dc96c132e103b81bf1c9cd0f4a3120675bb6db7ec81bad305759b40da55e8d868128baa3d5bbdf3e28d044ae6f735d0ebce5ed4f89bb190
7
+ data.tar.gz: cfc04d633c2bdbbb0810c505d1cb6d671045b4b0c3651584be63231cb9b722a413c65c59487dbf5953374646af7bdc953ec7e66c4b71c7c1ad069004c8d9dd46
@@ -1,55 +1,65 @@
1
1
  module UnicodeScriptDetector
2
2
  class Detector
3
- attr_reader :characters, :scripts
3
+ attr_reader :scripts
4
4
 
5
5
  def initialize(string)
6
6
  @string = string
7
- @characters = []
8
- @scripts = []
7
+ @char_scripts = []
8
+ @script_names = []
9
9
 
10
10
  detect_scripts
11
11
  end
12
12
 
13
13
  def scripts
14
- @scripts.uniq
15
- end
16
-
17
- def detect_scripts
18
- @string.chars.each_with_index do |char, index|
19
- detected = false
20
- Scripts::LIST.each_with_index do |script_data, index|
21
- if char.match?(script_data[:regex])
22
- @characters << Character.new(char, script_data[:script], script_data[:name])
23
- @scripts << script_data[:script]
24
- detected = true
25
- break
26
- end
27
- end
28
- @characters << Character.new(char, :Other, "Other") unless detected
29
- @scripts << :Other unless detected
30
- end
14
+ @char_scripts.uniq
31
15
  end
32
16
 
33
17
  def contains?(scripts)
34
- return @scripts.include?(scripts) if scripts.is_a?(Symbol)
35
-
36
- scripts.all? { |script| @scripts.include?(script) }
18
+ return @char_scripts.include?(scripts) if scripts.is_a?(Symbol)
19
+ scripts.all? { |script| @char_scripts.include?(script) }
37
20
  end
38
21
 
39
22
  def contains_only?(scripts)
40
- return @scripts.uniq == [scripts] if scripts.is_a?(Symbol)
41
-
42
- @scripts.uniq.sort == scripts.uniq.sort
23
+ return @char_scripts.uniq == [scripts] if scripts.is_a?(Symbol)
24
+ @char_scripts.uniq.sort == scripts.uniq.sort
43
25
  end
44
26
 
45
27
  def script_groups
46
- @characters
47
- .chunk { |char| char.script }
48
- .map { |script, chars| ScriptGroup.new(script, chars) }
28
+ @string.chars
29
+ .zip(@char_scripts, @script_names)
30
+ .chunk { |_, script, _| script }
31
+ .map do |script, char_data|
32
+ chars = char_data.map(&:first)
33
+ name = char_data.first[2]
34
+ ScriptGroup.new(script, chars, name)
35
+ end
49
36
  end
50
37
 
51
38
  def grouped_scripts_hash
52
39
  script_groups.map { |group| [group.script, group.text] }.to_h
53
40
  end
41
+
42
+ def characters
43
+ @characters ||= @string.chars.zip(@char_scripts, @script_names).map do |char, script, name|
44
+ Character.new(char, script, name)
45
+ end
46
+ end
47
+
48
+ private
49
+ def detect_scripts
50
+ @string.chars.each do |char|
51
+ script_info = find_script_for_char(char)
52
+ @char_scripts << script_info[:script]
53
+ @script_names << script_info[:name]
54
+ end
55
+ end
56
+
57
+ def find_script_for_char(char)
58
+ Scripts::LIST.each do |script_data|
59
+ return script_data if char.match?(script_data[:regex])
60
+ end
61
+ { script: :Other, name: "Other" }
62
+ end
63
+
54
64
  end
55
65
  end
@@ -1,19 +1,20 @@
1
1
  module UnicodeScriptDetector
2
2
  class ScriptGroup
3
- attr_reader :script, :characters, :text
3
+ attr_reader :script, :text, :name
4
4
 
5
- def initialize(script, characters)
5
+ def initialize(script, chars, name)
6
6
  @script = script
7
- @characters = characters
8
- @text = characters.map(&:char).join
7
+ @chars = chars
8
+ @text = chars.join
9
+ @name = name
9
10
  end
10
11
 
11
12
  def length
12
- @characters.length
13
+ @chars.length
13
14
  end
14
15
 
15
- def name
16
- @characters.first&.name
16
+ def characters
17
+ @characters ||= @chars.map { |char| Character.new(char, @script, @name) }
17
18
  end
18
19
  end
19
20
  end
@@ -11,11 +11,6 @@ module UnicodeScriptDetector
11
11
  name: "Digit",
12
12
  regex: /\d/
13
13
  },
14
- {
15
- script: :Punctuation,
16
- name: "Punctuation",
17
- regex: /[[:punct:]]/
18
- },
19
14
  {
20
15
  script: :Adlam,
21
16
  name: "Adlam",
@@ -842,7 +837,12 @@ module UnicodeScriptDetector
842
837
  {
843
838
  script: :Emoji,
844
839
  name: "Emoji",
845
- regex: /\p{Emoji}/,
840
+ regex: /\p{Emoji_Presentation}/,
841
+ },
842
+ {
843
+ script: :Punctuation,
844
+ name: "Punctuation",
845
+ regex: /[[:punct:]]/
846
846
  },
847
847
  {
848
848
  script: :Common,
@@ -1,3 +1,3 @@
1
1
  module UnicodeScriptDetector
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode_script_detector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Arendsen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-12-31 00:00:00.000000000 Z
11
+ date: 2026-01-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: zeitwerk