unicode_script_detector 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d80a777771e5076bfb67e1b63ea3a0900f9ca53b16b0e90224608b60c06c78bc
4
- data.tar.gz: 266e4ca3ad0728cacbd4a4f37819f271b22205a893f5750c14c862bda8d5cdc2
3
+ metadata.gz: 4795cdb246ac34ccb0ee5183ff0e704d25e4e67410acee321f36f4446dd28356
4
+ data.tar.gz: 0c7b9c4c835718f2fc7509225204e38c0a7148949c5745f1448b299c27e0e88d
5
5
  SHA512:
6
- metadata.gz: a0d6741c40ae5adfde691ec742b1bc5440e2b45f265d448cda9b6c40978843404b05419e82ebad258ef40e752bcaffb26c985e625cb52025de7ae37b7aee505a
7
- data.tar.gz: 2c81a22c0cb25ec024eb875f4da5a19891bff344acf7daaaa8a4155caf89e0ba77021d5b475b3b3f8e986eaf1b6be8bf8e20747a88afdbd0bfdd083d344c2680
6
+ metadata.gz: 5da7422c57295f4ac3dee3ac9ccfaa99b5586418de956a88876035541da023e9fa4afe609a4aa79d4c3a1a5f9b1ffe64370984657844c06fc6a575578beb5ee2
7
+ data.tar.gz: aa9fecf48386b6eb5a0074cbbec8819af80153c6111e042debd9e9c312145bc11a936b3003a406370a28e8a27a48b2bd0409c0cf043b26fe465f4c58ee9669e2
data/README.md CHANGED
@@ -20,23 +20,45 @@ $ gem install unicode_script_detector
20
20
  UnicodeScriptDetector.detect_characters "Hel6б"
21
21
 
22
22
  #Output:
23
- [#<UnicodeScriptDetector::Character:0x00007768fefdead8 @char="H", @name="Latin", @script=:Latin>,
24
- #<UnicodeScriptDetector::Character:0x00007768fefdea10 @char="e", @name="Latin", @script=:Latin>,
25
- #<UnicodeScriptDetector::Character:0x00007768fefde970 @char="l", @name="Latin", @script=:Latin>,
26
- #<UnicodeScriptDetector::Character:0x00007768fefde8d0 @char="6", @name="Digit", @script=:Digit>,
27
- #<UnicodeScriptDetector::Character:0x00007768fefde830 @char="б", @name="Cyrillic", @script=:Cyrillic>]
23
+ [
24
+ #<UnicodeScriptDetector::Character:0x00007768fefdead8 @char="H", @name="Latin", @script=:Latin>,
25
+ #<UnicodeScriptDetector::Character:0x00007768fefdea10 @char="e", @name="Latin", @script=:Latin>,
26
+ #<UnicodeScriptDetector::Character:0x00007768fefde970 @char="l", @name="Latin", @script=:Latin>,
27
+ #<UnicodeScriptDetector::Character:0x00007768fefde8d0 @char="6", @name="Digit", @script=:Digit>,
28
+ #<UnicodeScriptDetector::Character:0x00007768fefde830 @char="б", @name="Cyrillic", @script=:Cyrillic>
29
+ ]
28
30
  ```
29
31
 
30
32
  ## Detect if a string contains certain scripts
31
33
  ```ruby
32
34
  # This will return true because it contains Latin and Cyrillic
33
- UnicodeScriptDetector.contains? "Hellб🔥", [:Latin, :Cyrillic]
35
+ UnicodeScriptDetector.contains? "Helб🔥", [:Latin, :Cyrillic]
34
36
  ```
35
37
 
36
38
  ## Detect if a string contains only certain scripts
37
39
  ```ruby
38
40
  # This will return false because it contains an Emoji as well
39
- UnicodeScriptDetector.contains_only? "Hellб🔥", [:Latin, :Cyrillic]
41
+ UnicodeScriptDetector.contains_only? "Helб🔥", [:Latin, :Cyrillic]
42
+ ```
43
+
44
+ ## Detect all the characters of a string, grouped by the script
45
+ ```ruby
46
+ UnicodeScriptDetector.script_groups("Hel6б how are you?").each do |group|
47
+ puts "#{group.name}: #{group.text} (#{group.length} characters)"
48
+ end
49
+
50
+ #Output:
51
+
52
+ Latin: Hel (3 characters)
53
+ Digit: 6 (1 characters)
54
+ Cyrillic: б (1 characters)
55
+ Whitespace: (1 characters)
56
+ Latin: how (3 characters)
57
+ Whitespace: (1 characters)
58
+ Latin: are (3 characters)
59
+ Whitespace: (1 characters)
60
+ Latin: you (3 characters)
61
+ Punctuation: ? (1 characters)
40
62
  ```
41
63
 
42
64
  ## Development
@@ -47,4 +69,4 @@ Run the tests with `bin/test`.
47
69
  You're welcome to contribute to this project. See https://github.com/davidarendsen/unicode_script_detector.
48
70
 
49
71
  ## License
50
- This software is released under the [MIT license](LICENSE).
72
+ This software is released under the [MIT license](LICENSE).
@@ -15,5 +15,17 @@ module UnicodeScriptDetector
15
15
  def hiragana?
16
16
  @script === :Hiragana
17
17
  end
18
+
19
+ def punctuation?
20
+ @script === :Punctuation
21
+ end
22
+
23
+ def emoji?
24
+ @script === :Emoji
25
+ end
26
+
27
+ def digit?
28
+ @script === :Digit
29
+ end
18
30
  end
19
- end
31
+ end
@@ -41,5 +41,15 @@ module UnicodeScriptDetector
41
41
 
42
42
  @scripts.uniq.sort == scripts.uniq.sort
43
43
  end
44
+
45
+ def script_groups
46
+ @characters
47
+ .chunk { |char| char.script }
48
+ .map { |script, chars| ScriptGroup.new(script, chars) }
49
+ end
50
+
51
+ def grouped_scripts_hash
52
+ script_groups.map { |group| [group.script, group.text] }.to_h
53
+ end
44
54
  end
45
- end
55
+ end
@@ -0,0 +1,19 @@
1
+ module UnicodeScriptDetector
2
+ class ScriptGroup
3
+ attr_reader :script, :characters, :text
4
+
5
+ def initialize(script, characters)
6
+ @script = script
7
+ @characters = characters
8
+ @text = characters.map(&:char).join
9
+ end
10
+
11
+ def length
12
+ @characters.length
13
+ end
14
+
15
+ def name
16
+ @characters.first&.name
17
+ end
18
+ end
19
+ end
@@ -11,6 +11,11 @@ module UnicodeScriptDetector
11
11
  name: "Digit",
12
12
  regex: /\d/
13
13
  },
14
+ {
15
+ script: :Punctuation,
16
+ name: "Punctuation",
17
+ regex: /[[:punct:]]/
18
+ },
14
19
  {
15
20
  script: :Adlam,
16
21
  name: "Adlam",
@@ -1,3 +1,3 @@
1
1
  module UnicodeScriptDetector
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -9,6 +9,10 @@ module UnicodeScriptDetector
9
9
  UnicodeScriptDetector::Detector.new(string).characters
10
10
  end
11
11
 
12
+ def script_groups(string)
13
+ UnicodeScriptDetector::Detector.new(string).script_groups
14
+ end
15
+
12
16
  def contains?(string, scripts)
13
17
  UnicodeScriptDetector::Detector.new(string).contains?(scripts)
14
18
  end
@@ -17,4 +21,4 @@ module UnicodeScriptDetector
17
21
  UnicodeScriptDetector::Detector.new(string).contains_only?(scripts)
18
22
  end
19
23
  end
20
- end
24
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode_script_detector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Arendsen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-12-04 00:00:00.000000000 Z
11
+ date: 2025-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: zeitwerk
@@ -55,6 +55,7 @@ files:
55
55
  - lib/unicode_script_detector.rb
56
56
  - lib/unicode_script_detector/character.rb
57
57
  - lib/unicode_script_detector/detector.rb
58
+ - lib/unicode_script_detector/script_group.rb
58
59
  - lib/unicode_script_detector/scripts.rb
59
60
  - lib/unicode_script_detector/version.rb
60
61
  homepage: https://github.com/davidarendsen/unicode_script_detector
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
80
  - !ruby/object:Gem::Version
80
81
  version: '0'
81
82
  requirements: []
82
- rubygems_version: 3.5.23
83
+ rubygems_version: 3.5.11
83
84
  signing_key:
84
85
  specification_version: 4
85
86
  summary: Unicode Script Detector