unicode_script_detector 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d80a777771e5076bfb67e1b63ea3a0900f9ca53b16b0e90224608b60c06c78bc
4
- data.tar.gz: 266e4ca3ad0728cacbd4a4f37819f271b22205a893f5750c14c862bda8d5cdc2
3
+ metadata.gz: 58ac1a16218a1a336bb9c2fa6c75c5519a69a6c846c1007ffd236d991f3ef223
4
+ data.tar.gz: 11c82345a0b3990737b1c1a85c97d6c7552644a10e0c1d9265a4a46e4a0a9ce3
5
5
  SHA512:
6
- metadata.gz: a0d6741c40ae5adfde691ec742b1bc5440e2b45f265d448cda9b6c40978843404b05419e82ebad258ef40e752bcaffb26c985e625cb52025de7ae37b7aee505a
7
- data.tar.gz: 2c81a22c0cb25ec024eb875f4da5a19891bff344acf7daaaa8a4155caf89e0ba77021d5b475b3b3f8e986eaf1b6be8bf8e20747a88afdbd0bfdd083d344c2680
6
+ metadata.gz: baad85a939325ce58dc96c132e103b81bf1c9cd0f4a3120675bb6db7ec81bad305759b40da55e8d868128baa3d5bbdf3e28d044ae6f735d0ebce5ed4f89bb190
7
+ data.tar.gz: cfc04d633c2bdbbb0810c505d1cb6d671045b4b0c3651584be63231cb9b722a413c65c59487dbf5953374646af7bdc953ec7e66c4b71c7c1ad069004c8d9dd46
data/README.md CHANGED
@@ -20,23 +20,45 @@ $ gem install unicode_script_detector
20
20
  UnicodeScriptDetector.detect_characters "Hel6б"
21
21
 
22
22
  #Output:
23
- [#<UnicodeScriptDetector::Character:0x00007768fefdead8 @char="H", @name="Latin", @script=:Latin>,
24
- #<UnicodeScriptDetector::Character:0x00007768fefdea10 @char="e", @name="Latin", @script=:Latin>,
25
- #<UnicodeScriptDetector::Character:0x00007768fefde970 @char="l", @name="Latin", @script=:Latin>,
26
- #<UnicodeScriptDetector::Character:0x00007768fefde8d0 @char="6", @name="Digit", @script=:Digit>,
27
- #<UnicodeScriptDetector::Character:0x00007768fefde830 @char="б", @name="Cyrillic", @script=:Cyrillic>]
23
+ [
24
+ #<UnicodeScriptDetector::Character:0x00007768fefdead8 @char="H", @name="Latin", @script=:Latin>,
25
+ #<UnicodeScriptDetector::Character:0x00007768fefdea10 @char="e", @name="Latin", @script=:Latin>,
26
+ #<UnicodeScriptDetector::Character:0x00007768fefde970 @char="l", @name="Latin", @script=:Latin>,
27
+ #<UnicodeScriptDetector::Character:0x00007768fefde8d0 @char="6", @name="Digit", @script=:Digit>,
28
+ #<UnicodeScriptDetector::Character:0x00007768fefde830 @char="б", @name="Cyrillic", @script=:Cyrillic>
29
+ ]
28
30
  ```
29
31
 
30
32
  ## Detect if a string contains certain scripts
31
33
  ```ruby
32
34
  # This will return true because it contains Latin and Cyrillic
33
- UnicodeScriptDetector.contains? "Hellб🔥", [:Latin, :Cyrillic]
35
+ UnicodeScriptDetector.contains? "Helб🔥", [:Latin, :Cyrillic]
34
36
  ```
35
37
 
36
38
  ## Detect if a string contains only certain scripts
37
39
  ```ruby
38
40
  # This will return false because it contains an Emoji as well
39
- UnicodeScriptDetector.contains_only? "Hellб🔥", [:Latin, :Cyrillic]
41
+ UnicodeScriptDetector.contains_only? "Helб🔥", [:Latin, :Cyrillic]
42
+ ```
43
+
44
+ ## Detect all the characters of a string, grouped by the script
45
+ ```ruby
46
+ UnicodeScriptDetector.script_groups("Hel6б how are you?").each do |group|
47
+ puts "#{group.name}: #{group.text} (#{group.length} characters)"
48
+ end
49
+
50
+ #Output:
51
+
52
+ Latin: Hel (3 characters)
53
+ Digit: 6 (1 characters)
54
+ Cyrillic: б (1 characters)
55
+ Whitespace: (1 characters)
56
+ Latin: how (3 characters)
57
+ Whitespace: (1 characters)
58
+ Latin: are (3 characters)
59
+ Whitespace: (1 characters)
60
+ Latin: you (3 characters)
61
+ Punctuation: ? (1 characters)
40
62
  ```
41
63
 
42
64
  ## Development
@@ -47,4 +69,4 @@ Run the tests with `bin/test`.
47
69
  You're welcome to contribute to this project. See https://github.com/davidarendsen/unicode_script_detector.
48
70
 
49
71
  ## License
50
- This software is released under the [MIT license](LICENSE).
72
+ This software is released under the [MIT license](LICENSE).
@@ -15,5 +15,17 @@ module UnicodeScriptDetector
15
15
  def hiragana?
16
16
  @script === :Hiragana
17
17
  end
18
+
19
+ def punctuation?
20
+ @script === :Punctuation
21
+ end
22
+
23
+ def emoji?
24
+ @script === :Emoji
25
+ end
26
+
27
+ def digit?
28
+ @script === :Digit
29
+ end
18
30
  end
19
- end
31
+ end
@@ -1,45 +1,65 @@
1
1
  module UnicodeScriptDetector
2
2
  class Detector
3
- attr_reader :characters, :scripts
3
+ attr_reader :scripts
4
4
 
5
5
  def initialize(string)
6
6
  @string = string
7
- @characters = []
8
- @scripts = []
7
+ @char_scripts = []
8
+ @script_names = []
9
9
 
10
10
  detect_scripts
11
11
  end
12
12
 
13
13
  def scripts
14
- @scripts.uniq
14
+ @char_scripts.uniq
15
15
  end
16
16
 
17
- def detect_scripts
18
- @string.chars.each_with_index do |char, index|
19
- detected = false
20
- Scripts::LIST.each_with_index do |script_data, index|
21
- if char.match?(script_data[:regex])
22
- @characters << Character.new(char, script_data[:script], script_data[:name])
23
- @scripts << script_data[:script]
24
- detected = true
25
- break
26
- end
27
- end
28
- @characters << Character.new(char, :Other, "Other") unless detected
29
- @scripts << :Other unless detected
30
- end
17
+ def contains?(scripts)
18
+ return @char_scripts.include?(scripts) if scripts.is_a?(Symbol)
19
+ scripts.all? { |script| @char_scripts.include?(script) }
31
20
  end
32
21
 
33
- def contains?(scripts)
34
- return @scripts.include?(scripts) if scripts.is_a?(Symbol)
22
+ def contains_only?(scripts)
23
+ return @char_scripts.uniq == [scripts] if scripts.is_a?(Symbol)
24
+ @char_scripts.uniq.sort == scripts.uniq.sort
25
+ end
35
26
 
36
- scripts.all? { |script| @scripts.include?(script) }
27
+ def script_groups
28
+ @string.chars
29
+ .zip(@char_scripts, @script_names)
30
+ .chunk { |_, script, _| script }
31
+ .map do |script, char_data|
32
+ chars = char_data.map(&:first)
33
+ name = char_data.first[2]
34
+ ScriptGroup.new(script, chars, name)
35
+ end
37
36
  end
38
37
 
39
- def contains_only?(scripts)
40
- return @scripts.uniq == [scripts] if scripts.is_a?(Symbol)
38
+ def grouped_scripts_hash
39
+ script_groups.map { |group| [group.script, group.text] }.to_h
40
+ end
41
41
 
42
- @scripts.uniq.sort == scripts.uniq.sort
42
+ def characters
43
+ @characters ||= @string.chars.zip(@char_scripts, @script_names).map do |char, script, name|
44
+ Character.new(char, script, name)
45
+ end
43
46
  end
47
+
48
+ private
49
+ def detect_scripts
50
+ @string.chars.each do |char|
51
+ script_info = find_script_for_char(char)
52
+ @char_scripts << script_info[:script]
53
+ @script_names << script_info[:name]
54
+ end
55
+ end
56
+
57
+ def find_script_for_char(char)
58
+ Scripts::LIST.each do |script_data|
59
+ return script_data if char.match?(script_data[:regex])
60
+ end
61
+ { script: :Other, name: "Other" }
62
+ end
63
+
44
64
  end
45
- end
65
+ end
@@ -0,0 +1,20 @@
1
+ module UnicodeScriptDetector
2
+ class ScriptGroup
3
+ attr_reader :script, :text, :name
4
+
5
+ def initialize(script, chars, name)
6
+ @script = script
7
+ @chars = chars
8
+ @text = chars.join
9
+ @name = name
10
+ end
11
+
12
+ def length
13
+ @chars.length
14
+ end
15
+
16
+ def characters
17
+ @characters ||= @chars.map { |char| Character.new(char, @script, @name) }
18
+ end
19
+ end
20
+ end
@@ -837,7 +837,12 @@ module UnicodeScriptDetector
837
837
  {
838
838
  script: :Emoji,
839
839
  name: "Emoji",
840
- regex: /\p{Emoji}/,
840
+ regex: /\p{Emoji_Presentation}/,
841
+ },
842
+ {
843
+ script: :Punctuation,
844
+ name: "Punctuation",
845
+ regex: /[[:punct:]]/
841
846
  },
842
847
  {
843
848
  script: :Common,
@@ -1,3 +1,3 @@
1
1
  module UnicodeScriptDetector
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -9,6 +9,10 @@ module UnicodeScriptDetector
9
9
  UnicodeScriptDetector::Detector.new(string).characters
10
10
  end
11
11
 
12
+ def script_groups(string)
13
+ UnicodeScriptDetector::Detector.new(string).script_groups
14
+ end
15
+
12
16
  def contains?(string, scripts)
13
17
  UnicodeScriptDetector::Detector.new(string).contains?(scripts)
14
18
  end
@@ -17,4 +21,4 @@ module UnicodeScriptDetector
17
21
  UnicodeScriptDetector::Detector.new(string).contains_only?(scripts)
18
22
  end
19
23
  end
20
- end
24
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode_script_detector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Arendsen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-12-04 00:00:00.000000000 Z
11
+ date: 2026-01-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: zeitwerk
@@ -55,6 +55,7 @@ files:
55
55
  - lib/unicode_script_detector.rb
56
56
  - lib/unicode_script_detector/character.rb
57
57
  - lib/unicode_script_detector/detector.rb
58
+ - lib/unicode_script_detector/script_group.rb
58
59
  - lib/unicode_script_detector/scripts.rb
59
60
  - lib/unicode_script_detector/version.rb
60
61
  homepage: https://github.com/davidarendsen/unicode_script_detector
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
80
  - !ruby/object:Gem::Version
80
81
  version: '0'
81
82
  requirements: []
82
- rubygems_version: 3.5.23
83
+ rubygems_version: 3.5.11
83
84
  signing_key:
84
85
  specification_version: 4
85
86
  summary: Unicode Script Detector