unicode_script_detector 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 58ac1a16218a1a336bb9c2fa6c75c5519a69a6c846c1007ffd236d991f3ef223
|
|
4
|
+
data.tar.gz: 11c82345a0b3990737b1c1a85c97d6c7552644a10e0c1d9265a4a46e4a0a9ce3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: baad85a939325ce58dc96c132e103b81bf1c9cd0f4a3120675bb6db7ec81bad305759b40da55e8d868128baa3d5bbdf3e28d044ae6f735d0ebce5ed4f89bb190
|
|
7
|
+
data.tar.gz: cfc04d633c2bdbbb0810c505d1cb6d671045b4b0c3651584be63231cb9b722a413c65c59487dbf5953374646af7bdc953ec7e66c4b71c7c1ad069004c8d9dd46
|
|
@@ -1,55 +1,65 @@
|
|
|
1
1
|
module UnicodeScriptDetector
|
|
2
2
|
class Detector
|
|
3
|
-
attr_reader :
|
|
3
|
+
attr_reader :scripts
|
|
4
4
|
|
|
5
5
|
def initialize(string)
|
|
6
6
|
@string = string
|
|
7
|
-
@
|
|
8
|
-
@
|
|
7
|
+
@char_scripts = []
|
|
8
|
+
@script_names = []
|
|
9
9
|
|
|
10
10
|
detect_scripts
|
|
11
11
|
end
|
|
12
12
|
|
|
13
13
|
def scripts
|
|
14
|
-
@
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
def detect_scripts
|
|
18
|
-
@string.chars.each_with_index do |char, index|
|
|
19
|
-
detected = false
|
|
20
|
-
Scripts::LIST.each_with_index do |script_data, index|
|
|
21
|
-
if char.match?(script_data[:regex])
|
|
22
|
-
@characters << Character.new(char, script_data[:script], script_data[:name])
|
|
23
|
-
@scripts << script_data[:script]
|
|
24
|
-
detected = true
|
|
25
|
-
break
|
|
26
|
-
end
|
|
27
|
-
end
|
|
28
|
-
@characters << Character.new(char, :Other, "Other") unless detected
|
|
29
|
-
@scripts << :Other unless detected
|
|
30
|
-
end
|
|
14
|
+
@char_scripts.uniq
|
|
31
15
|
end
|
|
32
16
|
|
|
33
17
|
def contains?(scripts)
|
|
34
|
-
return @
|
|
35
|
-
|
|
36
|
-
scripts.all? { |script| @scripts.include?(script) }
|
|
18
|
+
return @char_scripts.include?(scripts) if scripts.is_a?(Symbol)
|
|
19
|
+
scripts.all? { |script| @char_scripts.include?(script) }
|
|
37
20
|
end
|
|
38
21
|
|
|
39
22
|
def contains_only?(scripts)
|
|
40
|
-
return @
|
|
41
|
-
|
|
42
|
-
@scripts.uniq.sort == scripts.uniq.sort
|
|
23
|
+
return @char_scripts.uniq == [scripts] if scripts.is_a?(Symbol)
|
|
24
|
+
@char_scripts.uniq.sort == scripts.uniq.sort
|
|
43
25
|
end
|
|
44
26
|
|
|
45
27
|
def script_groups
|
|
46
|
-
@
|
|
47
|
-
.
|
|
48
|
-
.
|
|
28
|
+
@string.chars
|
|
29
|
+
.zip(@char_scripts, @script_names)
|
|
30
|
+
.chunk { |_, script, _| script }
|
|
31
|
+
.map do |script, char_data|
|
|
32
|
+
chars = char_data.map(&:first)
|
|
33
|
+
name = char_data.first[2]
|
|
34
|
+
ScriptGroup.new(script, chars, name)
|
|
35
|
+
end
|
|
49
36
|
end
|
|
50
37
|
|
|
51
38
|
def grouped_scripts_hash
|
|
52
39
|
script_groups.map { |group| [group.script, group.text] }.to_h
|
|
53
40
|
end
|
|
41
|
+
|
|
42
|
+
def characters
|
|
43
|
+
@characters ||= @string.chars.zip(@char_scripts, @script_names).map do |char, script, name|
|
|
44
|
+
Character.new(char, script, name)
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
def detect_scripts
|
|
50
|
+
@string.chars.each do |char|
|
|
51
|
+
script_info = find_script_for_char(char)
|
|
52
|
+
@char_scripts << script_info[:script]
|
|
53
|
+
@script_names << script_info[:name]
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def find_script_for_char(char)
|
|
58
|
+
Scripts::LIST.each do |script_data|
|
|
59
|
+
return script_data if char.match?(script_data[:regex])
|
|
60
|
+
end
|
|
61
|
+
{ script: :Other, name: "Other" }
|
|
62
|
+
end
|
|
63
|
+
|
|
54
64
|
end
|
|
55
65
|
end
|
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
module UnicodeScriptDetector
|
|
2
2
|
class ScriptGroup
|
|
3
|
-
attr_reader :script, :
|
|
3
|
+
attr_reader :script, :text, :name
|
|
4
4
|
|
|
5
|
-
def initialize(script,
|
|
5
|
+
def initialize(script, chars, name)
|
|
6
6
|
@script = script
|
|
7
|
-
@
|
|
8
|
-
@text =
|
|
7
|
+
@chars = chars
|
|
8
|
+
@text = chars.join
|
|
9
|
+
@name = name
|
|
9
10
|
end
|
|
10
11
|
|
|
11
12
|
def length
|
|
12
|
-
@
|
|
13
|
+
@chars.length
|
|
13
14
|
end
|
|
14
15
|
|
|
15
|
-
def
|
|
16
|
-
@characters.
|
|
16
|
+
def characters
|
|
17
|
+
@characters ||= @chars.map { |char| Character.new(char, @script, @name) }
|
|
17
18
|
end
|
|
18
19
|
end
|
|
19
20
|
end
|
|
@@ -11,11 +11,6 @@ module UnicodeScriptDetector
|
|
|
11
11
|
name: "Digit",
|
|
12
12
|
regex: /\d/
|
|
13
13
|
},
|
|
14
|
-
{
|
|
15
|
-
script: :Punctuation,
|
|
16
|
-
name: "Punctuation",
|
|
17
|
-
regex: /[[:punct:]]/
|
|
18
|
-
},
|
|
19
14
|
{
|
|
20
15
|
script: :Adlam,
|
|
21
16
|
name: "Adlam",
|
|
@@ -842,7 +837,12 @@ module UnicodeScriptDetector
|
|
|
842
837
|
{
|
|
843
838
|
script: :Emoji,
|
|
844
839
|
name: "Emoji",
|
|
845
|
-
regex: /\p{
|
|
840
|
+
regex: /\p{Emoji_Presentation}/,
|
|
841
|
+
},
|
|
842
|
+
{
|
|
843
|
+
script: :Punctuation,
|
|
844
|
+
name: "Punctuation",
|
|
845
|
+
regex: /[[:punct:]]/
|
|
846
846
|
},
|
|
847
847
|
{
|
|
848
848
|
script: :Common,
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: unicode_script_detector
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.5
|
|
4
|
+
version: 0.0.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Arendsen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-01-02 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: zeitwerk
|