unicode_script_detector 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +30 -8
- data/lib/unicode_script_detector/character.rb +13 -1
- data/lib/unicode_script_detector/detector.rb +11 -1
- data/lib/unicode_script_detector/script_group.rb +19 -0
- data/lib/unicode_script_detector/scripts.rb +5 -0
- data/lib/unicode_script_detector/version.rb +1 -1
- data/lib/unicode_script_detector.rb +5 -1
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4795cdb246ac34ccb0ee5183ff0e704d25e4e67410acee321f36f4446dd28356
|
|
4
|
+
data.tar.gz: 0c7b9c4c835718f2fc7509225204e38c0a7148949c5745f1448b299c27e0e88d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5da7422c57295f4ac3dee3ac9ccfaa99b5586418de956a88876035541da023e9fa4afe609a4aa79d4c3a1a5f9b1ffe64370984657844c06fc6a575578beb5ee2
|
|
7
|
+
data.tar.gz: aa9fecf48386b6eb5a0074cbbec8819af80153c6111e042debd9e9c312145bc11a936b3003a406370a28e8a27a48b2bd0409c0cf043b26fe465f4c58ee9669e2
|
data/README.md
CHANGED
|
@@ -20,23 +20,45 @@ $ gem install unicode_script_detector
|
|
|
20
20
|
UnicodeScriptDetector.detect_characters "Hel6б"
|
|
21
21
|
|
|
22
22
|
#Output:
|
|
23
|
-
[
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
23
|
+
[
|
|
24
|
+
#<UnicodeScriptDetector::Character:0x00007768fefdead8 @char="H", @name="Latin", @script=:Latin>,
|
|
25
|
+
#<UnicodeScriptDetector::Character:0x00007768fefdea10 @char="e", @name="Latin", @script=:Latin>,
|
|
26
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde970 @char="l", @name="Latin", @script=:Latin>,
|
|
27
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde8d0 @char="6", @name="Digit", @script=:Digit>,
|
|
28
|
+
#<UnicodeScriptDetector::Character:0x00007768fefde830 @char="б", @name="Cyrillic", @script=:Cyrillic>
|
|
29
|
+
]
|
|
28
30
|
```
|
|
29
31
|
|
|
30
32
|
## Detect if a string contains certain scripts
|
|
31
33
|
```ruby
|
|
32
34
|
# This will return true because it contains Latin and Cyrillic
|
|
33
|
-
UnicodeScriptDetector.contains? "
|
|
35
|
+
UnicodeScriptDetector.contains? "Helб🔥", [:Latin, :Cyrillic]
|
|
34
36
|
```
|
|
35
37
|
|
|
36
38
|
## Detect if a string contains only certain scripts
|
|
37
39
|
```ruby
|
|
38
40
|
# This will return false because it contains an Emoji as well
|
|
39
|
-
UnicodeScriptDetector.contains_only? "
|
|
41
|
+
UnicodeScriptDetector.contains_only? "Helб🔥", [:Latin, :Cyrillic]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Detect all the characters of a string, grouped by the script
|
|
45
|
+
```ruby
|
|
46
|
+
UnicodeScriptDetector.script_groups("Hel6б how are you?").each do |group|
|
|
47
|
+
puts "#{group.name}: #{group.text} (#{group.length} characters)"
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
#Output:
|
|
51
|
+
|
|
52
|
+
Latin: Hel (3 characters)
|
|
53
|
+
Digit: 6 (1 characters)
|
|
54
|
+
Cyrillic: б (1 characters)
|
|
55
|
+
Whitespace: (1 characters)
|
|
56
|
+
Latin: how (3 characters)
|
|
57
|
+
Whitespace: (1 characters)
|
|
58
|
+
Latin: are (3 characters)
|
|
59
|
+
Whitespace: (1 characters)
|
|
60
|
+
Latin: you (3 characters)
|
|
61
|
+
Punctuation: ? (1 characters)
|
|
40
62
|
```
|
|
41
63
|
|
|
42
64
|
## Development
|
|
@@ -47,4 +69,4 @@ Run the tests with `bin/test`.
|
|
|
47
69
|
You're welcome to contribute to this project. See https://github.com/davidarendsen/unicode_script_detector.
|
|
48
70
|
|
|
49
71
|
## License
|
|
50
|
-
This software is released under the [MIT license](LICENSE).
|
|
72
|
+
This software is released under the [MIT license](LICENSE).
|
|
@@ -15,5 +15,17 @@ module UnicodeScriptDetector
|
|
|
15
15
|
def hiragana?
|
|
16
16
|
@script === :Hiragana
|
|
17
17
|
end
|
|
18
|
+
|
|
19
|
+
def punctuation?
|
|
20
|
+
@script === :Punctuation
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def emoji?
|
|
24
|
+
@script === :Emoji
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def digit?
|
|
28
|
+
@script === :Digit
|
|
29
|
+
end
|
|
18
30
|
end
|
|
19
|
-
end
|
|
31
|
+
end
|
|
@@ -41,5 +41,15 @@ module UnicodeScriptDetector
|
|
|
41
41
|
|
|
42
42
|
@scripts.uniq.sort == scripts.uniq.sort
|
|
43
43
|
end
|
|
44
|
+
|
|
45
|
+
def script_groups
|
|
46
|
+
@characters
|
|
47
|
+
.chunk { |char| char.script }
|
|
48
|
+
.map { |script, chars| ScriptGroup.new(script, chars) }
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def grouped_scripts_hash
|
|
52
|
+
script_groups.map { |group| [group.script, group.text] }.to_h
|
|
53
|
+
end
|
|
44
54
|
end
|
|
45
|
-
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module UnicodeScriptDetector
|
|
2
|
+
class ScriptGroup
|
|
3
|
+
attr_reader :script, :characters, :text
|
|
4
|
+
|
|
5
|
+
def initialize(script, characters)
|
|
6
|
+
@script = script
|
|
7
|
+
@characters = characters
|
|
8
|
+
@text = characters.map(&:char).join
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def length
|
|
12
|
+
@characters.length
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def name
|
|
16
|
+
@characters.first&.name
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -9,6 +9,10 @@ module UnicodeScriptDetector
|
|
|
9
9
|
UnicodeScriptDetector::Detector.new(string).characters
|
|
10
10
|
end
|
|
11
11
|
|
|
12
|
+
def script_groups(string)
|
|
13
|
+
UnicodeScriptDetector::Detector.new(string).script_groups
|
|
14
|
+
end
|
|
15
|
+
|
|
12
16
|
def contains?(string, scripts)
|
|
13
17
|
UnicodeScriptDetector::Detector.new(string).contains?(scripts)
|
|
14
18
|
end
|
|
@@ -17,4 +21,4 @@ module UnicodeScriptDetector
|
|
|
17
21
|
UnicodeScriptDetector::Detector.new(string).contains_only?(scripts)
|
|
18
22
|
end
|
|
19
23
|
end
|
|
20
|
-
end
|
|
24
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: unicode_script_detector
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.4
|
|
4
|
+
version: 0.0.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David Arendsen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2025-12-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: zeitwerk
|
|
@@ -55,6 +55,7 @@ files:
|
|
|
55
55
|
- lib/unicode_script_detector.rb
|
|
56
56
|
- lib/unicode_script_detector/character.rb
|
|
57
57
|
- lib/unicode_script_detector/detector.rb
|
|
58
|
+
- lib/unicode_script_detector/script_group.rb
|
|
58
59
|
- lib/unicode_script_detector/scripts.rb
|
|
59
60
|
- lib/unicode_script_detector/version.rb
|
|
60
61
|
homepage: https://github.com/davidarendsen/unicode_script_detector
|
|
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
79
80
|
- !ruby/object:Gem::Version
|
|
80
81
|
version: '0'
|
|
81
82
|
requirements: []
|
|
82
|
-
rubygems_version: 3.5.
|
|
83
|
+
rubygems_version: 3.5.11
|
|
83
84
|
signing_key:
|
|
84
85
|
specification_version: 4
|
|
85
86
|
summary: Unicode Script Detector
|