unicode_script_detector 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +30 -8
- data/lib/unicode_script_detector/character.rb +13 -1
- data/lib/unicode_script_detector/detector.rb +45 -25
- data/lib/unicode_script_detector/script_group.rb +20 -0
- data/lib/unicode_script_detector/scripts.rb +6 -1
- data/lib/unicode_script_detector/version.rb +1 -1
- data/lib/unicode_script_detector.rb +5 -1
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 58ac1a16218a1a336bb9c2fa6c75c5519a69a6c846c1007ffd236d991f3ef223
|
|
4
|
+
data.tar.gz: 11c82345a0b3990737b1c1a85c97d6c7552644a10e0c1d9265a4a46e4a0a9ce3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: baad85a939325ce58dc96c132e103b81bf1c9cd0f4a3120675bb6db7ec81bad305759b40da55e8d868128baa3d5bbdf3e28d044ae6f735d0ebce5ed4f89bb190
|
|
7
|
+
data.tar.gz: cfc04d633c2bdbbb0810c505d1cb6d671045b4b0c3651584be63231cb9b722a413c65c59487dbf5953374646af7bdc953ec7e66c4b71c7c1ad069004c8d9dd46
|
data/README.md
CHANGED
|
@@ -20,23 +20,45 @@ $ gem install unicode_script_detector
|
|
|
20
20
|
UnicodeScriptDetector.detect_characters "Hel6б"
|
|
21
21
|
|
|
22
22
|
#Output:
|
|
23
|
-
[
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
23
|
+
[
|
|
24
|
+
#<UnicodeScriptDetector::Character:0x00007768fefdead8 @char="H", @name="Latin", @script=:Latin>,
|
|
25
|
+
#<UnicodeScriptDetector::Character:0x00007768fefdea10 @char="e", @name="Latin", @script=:Latin>,
|
|
26
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde970 @char="l", @name="Latin", @script=:Latin>,
|
|
27
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde8d0 @char="6", @name="Digit", @script=:Digit>,
|
|
28
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde830 @char="б", @name="Cyrillic", @script=:Cyrillic>
|
|
29
|
+
]
|
|
28
30
|
```
|
|
29
31
|
|
|
30
32
|
## Detect if a string contains certain scripts
|
|
31
33
|
```ruby
|
|
32
34
|
# This will return true because it contains Latin and Cyrillic
|
|
33
|
-
UnicodeScriptDetector.contains? "
|
|
35
|
+
UnicodeScriptDetector.contains? "Helб🔥", [:Latin, :Cyrillic]
|
|
34
36
|
```
|
|
35
37
|
|
|
36
38
|
## Detect if a string contains only certain scripts
|
|
37
39
|
```ruby
|
|
38
40
|
# This will return false because it contains an Emoji as well
|
|
39
|
-
UnicodeScriptDetector.contains_only? "
|
|
41
|
+
UnicodeScriptDetector.contains_only? "Helб🔥", [:Latin, :Cyrillic]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Detect all the characters of a string, grouped by the script
|
|
45
|
+
```ruby
|
|
46
|
+
UnicodeScriptDetector.script_groups("Hel6б how are you?").each do |group|
|
|
47
|
+
puts "#{group.name}: #{group.text} (#{group.length} characters)"
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
#Output:
|
|
51
|
+
|
|
52
|
+
Latin: Hel (3 characters)
|
|
53
|
+
Digit: 6 (1 characters)
|
|
54
|
+
Cyrillic: б (1 characters)
|
|
55
|
+
Whitespace: (1 characters)
|
|
56
|
+
Latin: how (3 characters)
|
|
57
|
+
Whitespace: (1 characters)
|
|
58
|
+
Latin: are (3 characters)
|
|
59
|
+
Whitespace: (1 characters)
|
|
60
|
+
Latin: you (3 characters)
|
|
61
|
+
Punctuation: ? (1 characters)
|
|
40
62
|
```
|
|
41
63
|
|
|
42
64
|
## Development
|
|
@@ -47,4 +69,4 @@ Run the tests with `bin/test`.
|
|
|
47
69
|
You're welcome to contribute to this project. See https://github.com/davidarendsen/unicode_script_detector.
|
|
48
70
|
|
|
49
71
|
## License
|
|
50
|
-
This software is released under the [MIT license](LICENSE).
|
|
72
|
+
This software is released under the [MIT license](LICENSE).
|
|
@@ -15,5 +15,17 @@ module UnicodeScriptDetector
|
|
|
15
15
|
def hiragana?
|
|
16
16
|
@script === :Hiragana
|
|
17
17
|
end
|
|
18
|
+
|
|
19
|
+
def punctuation?
|
|
20
|
+
@script === :Punctuation
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def emoji?
|
|
24
|
+
@script === :Emoji
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def digit?
|
|
28
|
+
@script === :Digit
|
|
29
|
+
end
|
|
18
30
|
end
|
|
19
|
-
end
|
|
31
|
+
end
|
|
@@ -1,45 +1,65 @@
|
|
|
1
1
|
module UnicodeScriptDetector
|
|
2
2
|
class Detector
|
|
3
|
-
attr_reader :
|
|
3
|
+
attr_reader :scripts
|
|
4
4
|
|
|
5
5
|
def initialize(string)
|
|
6
6
|
@string = string
|
|
7
|
-
@
|
|
8
|
-
@
|
|
7
|
+
@char_scripts = []
|
|
8
|
+
@script_names = []
|
|
9
9
|
|
|
10
10
|
detect_scripts
|
|
11
11
|
end
|
|
12
12
|
|
|
13
13
|
def scripts
|
|
14
|
-
@
|
|
14
|
+
@char_scripts.uniq
|
|
15
15
|
end
|
|
16
16
|
|
|
17
|
-
def
|
|
18
|
-
@
|
|
19
|
-
|
|
20
|
-
Scripts::LIST.each_with_index do |script_data, index|
|
|
21
|
-
if char.match?(script_data[:regex])
|
|
22
|
-
@characters << Character.new(char, script_data[:script], script_data[:name])
|
|
23
|
-
@scripts << script_data[:script]
|
|
24
|
-
detected = true
|
|
25
|
-
break
|
|
26
|
-
end
|
|
27
|
-
end
|
|
28
|
-
@characters << Character.new(char, :Other, "Other") unless detected
|
|
29
|
-
@scripts << :Other unless detected
|
|
30
|
-
end
|
|
17
|
+
def contains?(scripts)
|
|
18
|
+
return @char_scripts.include?(scripts) if scripts.is_a?(Symbol)
|
|
19
|
+
scripts.all? { |script| @char_scripts.include?(script) }
|
|
31
20
|
end
|
|
32
21
|
|
|
33
|
-
def
|
|
34
|
-
return @
|
|
22
|
+
def contains_only?(scripts)
|
|
23
|
+
return @char_scripts.uniq == [scripts] if scripts.is_a?(Symbol)
|
|
24
|
+
@char_scripts.uniq.sort == scripts.uniq.sort
|
|
25
|
+
end
|
|
35
26
|
|
|
36
|
-
|
|
27
|
+
def script_groups
|
|
28
|
+
@string.chars
|
|
29
|
+
.zip(@char_scripts, @script_names)
|
|
30
|
+
.chunk { |_, script, _| script }
|
|
31
|
+
.map do |script, char_data|
|
|
32
|
+
chars = char_data.map(&:first)
|
|
33
|
+
name = char_data.first[2]
|
|
34
|
+
ScriptGroup.new(script, chars, name)
|
|
35
|
+
end
|
|
37
36
|
end
|
|
38
37
|
|
|
39
|
-
def
|
|
40
|
-
|
|
38
|
+
def grouped_scripts_hash
|
|
39
|
+
script_groups.map { |group| [group.script, group.text] }.to_h
|
|
40
|
+
end
|
|
41
41
|
|
|
42
|
-
|
|
42
|
+
def characters
|
|
43
|
+
@characters ||= @string.chars.zip(@char_scripts, @script_names).map do |char, script, name|
|
|
44
|
+
Character.new(char, script, name)
|
|
45
|
+
end
|
|
43
46
|
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
def detect_scripts
|
|
50
|
+
@string.chars.each do |char|
|
|
51
|
+
script_info = find_script_for_char(char)
|
|
52
|
+
@char_scripts << script_info[:script]
|
|
53
|
+
@script_names << script_info[:name]
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def find_script_for_char(char)
|
|
58
|
+
Scripts::LIST.each do |script_data|
|
|
59
|
+
return script_data if char.match?(script_data[:regex])
|
|
60
|
+
end
|
|
61
|
+
{ script: :Other, name: "Other" }
|
|
62
|
+
end
|
|
63
|
+
|
|
44
64
|
end
|
|
45
|
-
end
|
|
65
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module UnicodeScriptDetector
|
|
2
|
+
class ScriptGroup
|
|
3
|
+
attr_reader :script, :text, :name
|
|
4
|
+
|
|
5
|
+
def initialize(script, chars, name)
|
|
6
|
+
@script = script
|
|
7
|
+
@chars = chars
|
|
8
|
+
@text = chars.join
|
|
9
|
+
@name = name
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def length
|
|
13
|
+
@chars.length
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def characters
|
|
17
|
+
@characters ||= @chars.map { |char| Character.new(char, @script, @name) }
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -837,7 +837,12 @@ module UnicodeScriptDetector
|
|
|
837
837
|
{
|
|
838
838
|
script: :Emoji,
|
|
839
839
|
name: "Emoji",
|
|
840
|
-
regex: /\p{
|
|
840
|
+
regex: /\p{Emoji_Presentation}/,
|
|
841
|
+
},
|
|
842
|
+
{
|
|
843
|
+
script: :Punctuation,
|
|
844
|
+
name: "Punctuation",
|
|
845
|
+
regex: /[[:punct:]]/
|
|
841
846
|
},
|
|
842
847
|
{
|
|
843
848
|
script: :Common,
|
|
@@ -9,6 +9,10 @@ module UnicodeScriptDetector
|
|
|
9
9
|
UnicodeScriptDetector::Detector.new(string).characters
|
|
10
10
|
end
|
|
11
11
|
|
|
12
|
+
def script_groups(string)
|
|
13
|
+
UnicodeScriptDetector::Detector.new(string).script_groups
|
|
14
|
+
end
|
|
15
|
+
|
|
12
16
|
def contains?(string, scripts)
|
|
13
17
|
UnicodeScriptDetector::Detector.new(string).contains?(scripts)
|
|
14
18
|
end
|
|
@@ -17,4 +21,4 @@ module UnicodeScriptDetector
|
|
|
17
21
|
UnicodeScriptDetector::Detector.new(string).contains_only?(scripts)
|
|
18
22
|
end
|
|
19
23
|
end
|
|
20
|
-
end
|
|
24
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: unicode_script_detector
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Arendsen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-01-02 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: zeitwerk
|
|
@@ -55,6 +55,7 @@ files:
|
|
|
55
55
|
- lib/unicode_script_detector.rb
|
|
56
56
|
- lib/unicode_script_detector/character.rb
|
|
57
57
|
- lib/unicode_script_detector/detector.rb
|
|
58
|
+
- lib/unicode_script_detector/script_group.rb
|
|
58
59
|
- lib/unicode_script_detector/scripts.rb
|
|
59
60
|
- lib/unicode_script_detector/version.rb
|
|
60
61
|
homepage: https://github.com/davidarendsen/unicode_script_detector
|
|
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
79
80
|
- !ruby/object:Gem::Version
|
|
80
81
|
version: '0'
|
|
81
82
|
requirements: []
|
|
82
|
-
rubygems_version: 3.5.
|
|
83
|
+
rubygems_version: 3.5.11
|
|
83
84
|
signing_key:
|
|
84
85
|
specification_version: 4
|
|
85
86
|
summary: Unicode Script Detector
|