RubyGems - unicode_script_detector - Versions diffs - 0.0.5 → 0.0.7 - Mend

unicode_script_detector 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/lib/unicode_script_detector/detector.rb +39 -29
data/lib/unicode_script_detector/script_group.rb +8 -7
data/lib/unicode_script_detector/scripts.rb +119 -75
data/lib/unicode_script_detector/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4795cdb246ac34ccb0ee5183ff0e704d25e4e67410acee321f36f4446dd28356
-  data.tar.gz: 0c7b9c4c835718f2fc7509225204e38c0a7148949c5745f1448b299c27e0e88d
+  metadata.gz: a0febe236b556e42077b401d8e117b3996c6065dcbb33c974a572a8af64d14a4
+  data.tar.gz: f8ef874b90e0ca8e387bc16d5fc947745fcbe34bbd8ea53c328f98d8c3d8eddd
 SHA512:
-  metadata.gz: 5da7422c57295f4ac3dee3ac9ccfaa99b5586418de956a88876035541da023e9fa4afe609a4aa79d4c3a1a5f9b1ffe64370984657844c06fc6a575578beb5ee2
-  data.tar.gz: aa9fecf48386b6eb5a0074cbbec8819af80153c6111e042debd9e9c312145bc11a936b3003a406370a28e8a27a48b2bd0409c0cf043b26fe465f4c58ee9669e2
+  metadata.gz: c5b931f1a8f527900f9c37ba0af67ca4e1fbfc0a0fde2f382fc1c2853a2c8813654f1d334f692e39882421eb3a4825f8e056d91d92d58fb4c2c1c22266c347b3
+  data.tar.gz: 69ac1e2314cefd944af0958cba15764afdf05e6e6eb314b68bc75172c7d1085be00725d40df5fe7f1af23abccc0243fa5d134c0b590da62d83913dce6570f0c1

data/lib/unicode_script_detector/detector.rb CHANGED Viewed

@@ -1,55 +1,65 @@
 module UnicodeScriptDetector
   class Detector
-    attr_reader :characters, :scripts
+    attr_reader :scripts
     def initialize(string)
       @string = string
-      @characters = []
-      @scripts = []
+      @char_scripts = []
+      @script_names = []
       detect_scripts
     end
     def scripts
-      @scripts.uniq
-    end
-    def detect_scripts
-      @string.chars.each_with_index do |char, index|
-        detected = false
-        Scripts::LIST.each_with_index do |script_data, index|
-          if char.match?(script_data[:regex])
-            @characters << Character.new(char, script_data[:script], script_data[:name])
-            @scripts << script_data[:script]
-            detected = true
-            break
-          end
-        end
-        @characters << Character.new(char, :Other, "Other") unless detected
-        @scripts << :Other unless detected
-      end
+      @char_scripts.uniq
     end
     def contains?(scripts)
-      return @scripts.include?(scripts) if scripts.is_a?(Symbol)
-      scripts.all? { |script| @scripts.include?(script) }
+      return @char_scripts.include?(scripts) if scripts.is_a?(Symbol)
+      scripts.all? { |script| @char_scripts.include?(script) }
     end
     def contains_only?(scripts)
-      return @scripts.uniq == [scripts] if scripts.is_a?(Symbol)
-      @scripts.uniq.sort == scripts.uniq.sort
+      return @char_scripts.uniq == [scripts] if scripts.is_a?(Symbol)
+      @char_scripts.uniq.sort == scripts.uniq.sort
     end
     def script_groups
-      @characters
-        .chunk { |char| char.script }
-        .map { |script, chars| ScriptGroup.new(script, chars) }
+      @string.chars
+        .zip(@char_scripts, @script_names)
+        .chunk { |_, script, _| script }
+        .map do |script, char_data|
+          chars = char_data.map(&:first)
+          name = char_data.first[2]
+          ScriptGroup.new(script, chars, name)
+        end
     end
     def grouped_scripts_hash
       script_groups.map { |group| [group.script, group.text] }.to_h
     end
+    def characters
+      @characters ||= @string.chars.zip(@char_scripts, @script_names).map do |char, script, name|
+        Character.new(char, script, name)
+      end
+    end
+    private
+      def detect_scripts
+        @string.chars.each do |char|
+          script_info = find_script_for_char(char)
+          @char_scripts << script_info[:script]
+          @script_names << script_info[:name]
+        end
+      end
+      def find_script_for_char(char)
+        Scripts::LIST.each do |script_data|
+          return script_data if char.match?(script_data[:regex])
+        end
+        { script: :Other, name: "Other" }
+      end
   end
 end

data/lib/unicode_script_detector/script_group.rb CHANGED Viewed

@@ -1,19 +1,20 @@
 module UnicodeScriptDetector
   class ScriptGroup
-    attr_reader :script, :characters, :text
+    attr_reader :script, :text, :name
-    def initialize(script, characters)
+    def initialize(script, chars, name)
       @script = script
-      @characters = characters
-      @text = characters.map(&:char).join
+      @chars = chars
+      @text = chars.join
+      @name = name
     end
     def length
-      @characters.length
+      @chars.length
     end
-    def name
-      @characters.first&.name
+    def characters
+      @characters ||= @chars.map { |char| Character.new(char, @script, @name) }
     end
   end
 end

data/lib/unicode_script_detector/scripts.rb CHANGED Viewed

@@ -1,21 +1,6 @@
 module UnicodeScriptDetector
   class Scripts
     LIST = [
-      {
-        script: :Whitespace,
-        name: "Whitespace",
-        regex: /\s/
-      },
-      {
-        script: :Digit,
-        name: "Digit",
-        regex: /\d/
-      },
-      {
-        script: :Punctuation,
-        name: "Punctuation",
-        regex: /[[:punct:]]/
-      },
       {
         script: :Adlam,
         name: "Adlam",
@@ -28,7 +13,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Anatolian_Hieroglyphs,
-        name: "Anatolian_Hieroglyphs",
+        name: "Anatolian Hieroglyphs",
         regex: /\p{Anatolian_Hieroglyphs}/,
       },
       {
@@ -58,7 +43,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Bassa_Vah,
-        name: "Bassa_Vah",
+        name: "Bassa Vah",
         regex: /\p{Bassa_Vah}/,
       },
       {
@@ -103,7 +88,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Canadian_Aboriginal,
-        name: "Canadian_Aboriginal",
+        name: "Canadian Aboriginal",
         regex: /\p{Canadian_Aboriginal}/,
       },
       {
@@ -113,7 +98,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Caucasian_Albanian,
-        name: "Caucasian_Albanian",
+        name: "Caucasian Albanian",
         regex: /\p{Caucasian_Albanian}/,
       },
       {
@@ -153,7 +138,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Cypro_Minoan,
-        name: "Cypro_Minoan",
+        name: "Cypro Minoan",
         regex: /\p{Cypro_Minoan}/,
       },
       {
@@ -173,7 +158,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Dives_Akuru,
-        name: "Dives_Akuru",
+        name: "Dives Akuru",
         regex: /\p{Dives_Akuru}/,
       },
       {
@@ -188,8 +173,8 @@ module UnicodeScriptDetector
       },
       {
         script: :Egyptian_Hieroglyphs,
-        name: "Egyptian_Hieroglyphs",
-        regex: /\p{Egyptian_Hieroglyphs}/,
+        name: "Egyptian Hieroglyphs",
+        regex: /\p{Egyptian_Hieroglyphs}|[\u{13460}-\u{1355F}]/,
       },
       {
         script: :Elbasan,
@@ -206,7 +191,11 @@ module UnicodeScriptDetector
         name: "Ethiopic",
         regex: /\p{Ethiopic}/,
       },
+      {
+        script: :Garay,
+        name: "Garay",
+        regex: /[\u{10D40}-\u{10D8F}]/,
+      },
       {
         script: :Georgian,
         name: "Georgian",
@@ -239,7 +228,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Gunjala_Gondi,
-        name: "Gunjala_Gondi",
+        name: "Gunjala Gondi",
         regex: /\p{Gunjala_Gondi}/,
       },
       {
@@ -247,11 +236,15 @@ module UnicodeScriptDetector
         name: "Gurmukhi",
         regex: /\p{Gurmukhi}/,
       },
+      {
+        script: :Gurung_Khema,
+        name: "Gurung Khema",
+        regex: /[\u{16100}-\u{1613F}]/,
+      },
       {
         script: :Han,
         name: "Han",
-        regex: /\p{Han}/,
+        regex: /\p{Han}|[\u{323B0}-\u{3347F}]/,
       },
       {
         script: :Hangul,
@@ -260,7 +253,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Hanifi_Rohingya,
-        name: "Hanifi_Rohingya",
+        name: "Hanifi Rohingya",
         regex: /\p{Hanifi_Rohingya}/,
       },
       {
@@ -285,7 +278,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Imperial_Aramaic,
-        name: "Imperial_Aramaic",
+        name: "Imperial Aramaic",
         regex: /\p{Imperial_Aramaic}/,
       },
       {
@@ -295,12 +288,12 @@ module UnicodeScriptDetector
       },
       {
         script: :Inscriptional_Pahlavi,
-        name: "Inscriptional_Pahlavi",
+        name: "Inscriptional Pahlavi",
         regex: /\p{Inscriptional_Pahlavi}/,
       },
       {
         script: :Inscriptional_Parthian,
-        name: "Inscriptional_Parthian",
+        name: "Inscriptional Parthian",
         regex: /\p{Inscriptional_Parthian}/,
       },
       {
@@ -323,7 +316,6 @@ module UnicodeScriptDetector
         name: "Katakana",
         regex: /\p{Katakana}/,
       },
       {
         script: :Kawi,
         name: "Kawi",
@@ -331,7 +323,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Kayah_Li,
-        name: "Kayah_Li",
+        name: "Kayah Li",
         regex: /\p{Kayah_Li}/,
       },
       {
@@ -341,7 +333,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Khitan_Small_Script,
-        name: "Khitan_Small_Script",
+        name: "Khitan Small Script",
         regex: /\p{Khitan_Small_Script}/,
       },
       {
@@ -359,7 +351,11 @@ module UnicodeScriptDetector
         name: "Khudawadi",
         regex: /\p{Khudawadi}/,
       },
+      {
+        script: :Kirat_Rai,
+        name: "Kirat Rai",
+        regex: /[\u{16D40}-\u{16D7F}]/,
+      },
       {
         script: :Lao,
         name: "Lao",
@@ -382,12 +378,12 @@ module UnicodeScriptDetector
       },
       {
         script: :Linear_A,
-        name: "Linear_A",
+        name: "Linear A",
         regex: /\p{Linear_A}/,
       },
       {
         script: :Linear_B,
-        name: "Linear_B",
+        name: "Linear B",
         regex: /\p{Linear_B}/,
       },
       {
@@ -437,7 +433,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Masaram_Gondi,
-        name: "Masaram_Gondi",
+        name: "Masaram Gondi",
         regex: /\p{Masaram_Gondi}/,
       },
       {
@@ -447,22 +443,22 @@ module UnicodeScriptDetector
       },
       {
         script: :Meetei_Mayek,
-        name: "Meetei_Mayek",
+        name: "Meetei Mayek",
         regex: /\p{Meetei_Mayek}/,
       },
       {
         script: :Mende_Kikakui,
-        name: "Mende_Kikakui",
+        name: "Mende Kikakui",
         regex: /\p{Mende_Kikakui}/,
       },
       {
         script: :Meroitic_Cursive,
-        name: "Meroitic_Cursive",
+        name: "Meroitic Cursive",
         regex: /\p{Meroitic_Cursive}/,
       },
       {
         script: :Meroitic_Hieroglyphs,
-        name: "Meroitic_Hieroglyphs",
+        name: "Meroitic Hieroglyphs",
         regex: /\p{Meroitic_Hieroglyphs}/,
       },
       {
@@ -493,7 +489,7 @@ module UnicodeScriptDetector
       {
         script: :Myanmar,
         name: "Myanmar",
-        regex: /\p{Myanmar}/,
+        regex: /\p{Myanmar}|[\u{116D0}-\u{116FF}]/,
       },
       {
         script: :Nabataean,
@@ -502,7 +498,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Nag_Mundari,
-        name: "Nag_Mundari",
+        name: "Nag Mundari",
         regex: /\p{Nag_Mundari}/,
       },
       {
@@ -512,7 +508,7 @@ module UnicodeScriptDetector
       },
       {
         script: :New_Tai_Lue,
-        name: "New_Tai_Lue",
+        name: "New Tai Lue",
         regex: /\p{New_Tai_Lue}/,
       },
       {
@@ -532,7 +528,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Nyiakeng_Puachue_Hmong,
-        name: "Nyiakeng_Puachue_Hmong",
+        name: "Nyiakeng Puachue Hmong",
         regex: /\p{Nyiakeng_Puachue_Hmong}/,
       },
       {
@@ -542,53 +538,57 @@ module UnicodeScriptDetector
       },
       {
         script: :Ol_Chiki,
-        name: "Ol_Chiki",
+        name: "Ol Chiki",
         regex: /\p{Ol_Chiki}/,
       },
+      {
+        script: :Ol_Onal,
+        name: "Ol Onal",
+        regex: /[\u{1E5D0}-\u{1E5FF}]/,
+      },
       {
         script: :Old_Hungarian,
-        name: "Old_Hungarian",
+        name: "Old Hungarian",
         regex: /\p{Old_Hungarian}/,
       },
       {
         script: :Old_Italic,
-        name: "Old_Italic",
+        name: "Old Italic",
         regex: /\p{Old_Italic}/,
       },
       {
         script: :Old_North_Arabian,
-        name: "Old_North_Arabian",
+        name: "Old North Arabian",
         regex: /\p{Old_North_Arabian}/,
       },
       {
         script: :Old_Permic,
-        name: "Old_Permic",
+        name: "Old Permic",
         regex: /\p{Old_Permic}/,
       },
       {
         script: :Old_Persian,
-        name: "Old_Persian",
+        name: "Old Persian",
         regex: /\p{Old_Persian}/,
       },
       {
         script: :Old_Sogdian,
-        name: "Old_Sogdian",
+        name: "Old Sogdian",
         regex: /\p{Old_Sogdian}/,
       },
       {
         script: :Old_South_Arabian,
-        name: "Old_South_Arabian",
+        name: "Old South Arabian",
         regex: /\p{Old_South_Arabian}/,
       },
       {
         script: :Old_Turkic,
-        name: "Old_Turkic",
+        name: "Old Turkic",
         regex: /\p{Old_Turkic}/,
       },
       {
         script: :Old_Uyghur,
-        name: "Old_Uyghur",
+        name: "Old Uyghur",
         regex: /\p{Old_Uyghur}/,
       },
       {
@@ -608,7 +608,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Pahawh_Hmong,
-        name: "Pahawh_Hmong",
+        name: "Pahawh Hmong",
         regex: /\p{Pahawh_Hmong}/,
       },
       {
@@ -618,12 +618,12 @@ module UnicodeScriptDetector
       },
       {
         script: :Pau_Cin_Hau,
-        name: "Pau_Cin_Hau",
+        name: "Pau Cin Hau",
         regex: /\p{Pau_Cin_Hau}/,
       },
       {
         script: :Phags_Pa,
-        name: "Phags_Pa",
+        name: "Phags Pa",
         regex: /\p{Phags_Pa}/,
       },
       {
@@ -633,7 +633,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Psalter_Pahlavi,
-        name: "Psalter_Pahlavi",
+        name: "Psalter Pahlavi",
         regex: /\p{Psalter_Pahlavi}/,
       },
       {
@@ -659,7 +659,7 @@ module UnicodeScriptDetector
       {
         script: :Sharada,
         name: "Sharada",
-        regex: /\p{Sharada}/,
+        regex: /\p{Sharada}|[\u{11B60}-\u{11B7F}]/,
       },
       {
         script: :Shavian,
@@ -671,6 +671,11 @@ module UnicodeScriptDetector
         name: "Siddham",
         regex: /\p{Siddham}/,
       },
+      {
+        script: :Sidetic,
+        name: "Sidetic",
+        regex: /[\u{10940}-\u{1095F}]/
+      },
       {
         script: :SignWriting,
         name: "SignWriting",
@@ -688,7 +693,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Sora_Sompeng,
-        name: "Sora_Sompeng",
+        name: "Sora Sompeng",
         regex: /\p{Sora_Sompeng}/,
       },
       {
@@ -701,10 +706,14 @@ module UnicodeScriptDetector
         name: "Sundanese",
         regex: /\p{Sundanese}/,
       },
+      {
+        script: :Sunuwar,
+        name: "Sunuwar",
+        regex: /[\u{11BC0}-\u{11BFF}]/,
+      },
       {
         script: :Syloti_Nagri,
-        name: "Syloti_Nagri",
+        name: "Syloti Nagri",
         regex: /\p{Syloti_Nagri}/,
       },
       {
@@ -724,19 +733,24 @@ module UnicodeScriptDetector
       },
       {
         script: :Tai_Le,
-        name: "Tai_Le",
+        name: "Tai Le",
         regex: /\p{Tai_Le}/,
       },
       {
         script: :Tai_Tham,
-        name: "Tai_Tham",
+        name: "Tai Tham",
         regex: /\p{Tai_Tham}/,
       },
       {
         script: :Tai_Viet,
-        name: "Tai_Viet",
+        name: "Tai Viet",
         regex: /\p{Tai_Viet}/,
       },
+      {
+        script: :Tai_Yo,
+        name: "Tai Yo",
+        regex: /[\u{1E6C0}-\u{1E6FF}]/,
+      },
       {
         script: :Takri,
         name: "Takri",
@@ -755,7 +769,7 @@ module UnicodeScriptDetector
       {
         script: :Tangut,
         name: "Tangut",
-        regex: /\p{Tangut}/,
+        regex: /\p{Tangut}|[\u{18D80}-\u{18DFF}]/,
       },
       {
         script: :Telugu,
@@ -787,13 +801,26 @@ module UnicodeScriptDetector
         name: "Tirhuta",
         regex: /\p{Tirhuta}/,
       },
+      {
+        script: :Todhri,
+        name: "Todhri",
+        regex: /[\u{105C0}-\u{105FF}]/,
+      },
+      {
+        script: :Tolong_Siki,
+        name: "Tolong Siki",
+        regex: /[\u{11DB0}-\u{11DEF}]/,
+      },
       {
         script: :Toto,
         name: "Toto",
         regex: /\p{Toto}/,
       },
+      {
+        script: :Tulu_Tigalari,
+        name: "Tulu Tigalari",
+        regex: /[\u{11380}-\u{113FF}]/,
+      },
       {
         script: :Ugaritic,
         name: "Ugaritic",
@@ -821,7 +848,7 @@ module UnicodeScriptDetector
       },
       {
         script: :Warang_Citi,
-        name: "Warang_Citi",
+        name: "Warang Citi",
         regex: /\p{Warang_Citi}/,
       },
       {
@@ -836,18 +863,35 @@ module UnicodeScriptDetector
       },
       {
         script: :Zanabazar_Square,
-        name: "Zanabazar_Square",
+        name: "Zanabazar Square",
         regex: /\p{Zanabazar_Square}/,
       },
+      #Special characters
+      {
+        script: :Whitespace,
+        name: "Whitespace",
+        regex: /\s/
+      },
+      {
+        script: :Digit,
+        name: "Digit",
+        regex: /\d/
+      },
       {
         script: :Emoji,
         name: "Emoji",
-        regex: /\p{Emoji}/,
+        regex: /\p{Emoji_Presentation}/,
+      },
+      {
+        script: :Punctuation,
+        name: "Punctuation",
+        regex: /[[:punct:]]/
       },
       {
         script: :Common,
         name: "Common",
-        regex: /\p{Common}/,
+        regex: /\p{Common}|[\u{1CEC0}-\u{1CEFF}]|[\u{1CC00}-\u{1CEBF}]/,
       },
     ]
   end

data/lib/unicode_script_detector/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module UnicodeScriptDetector
-  VERSION = "0.0.5"
+  VERSION = "0.0.7"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unicode_script_detector
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.7
 platform: ruby
 authors:
 - David Arendsen
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-12-31 00:00:00.000000000 Z
+date: 2026-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: zeitwerk