RubyGems - langdetect-ruby - Versions diffs - 0.1.1 → 0.2.0 - Mend

langdetect-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

checksums.yaml +4 -4
data/README.md +24 -13
data/langdetect-ruby.gemspec +1 -1
data/lib/lingua_ruby/configuration.rb +4 -1
data/lib/lingua_ruby/detector.rb +59 -1
data/lib/lingua_ruby/profile_loader.rb +26 -6
data/lib/lingua_ruby/profiles/am.json +193 -0
data/lib/lingua_ruby/profiles/bg.json +290 -0
data/lib/lingua_ruby/profiles/bn.json +211 -0
data/lib/lingua_ruby/profiles/cs.json +302 -0
data/lib/lingua_ruby/profiles/da.json +302 -0
data/lib/lingua_ruby/profiles/de.json +302 -0
data/lib/lingua_ruby/profiles/el.json +302 -0
data/lib/lingua_ruby/profiles/es.json +302 -0
data/lib/lingua_ruby/profiles/et.json +289 -0
data/lib/lingua_ruby/profiles/fa.json +234 -0
data/lib/lingua_ruby/profiles/fi.json +284 -0
data/lib/lingua_ruby/profiles/fr.json +302 -0
data/lib/lingua_ruby/profiles/ha.json +302 -0
data/lib/lingua_ruby/profiles/hi.json +255 -0
data/lib/lingua_ruby/profiles/hr.json +302 -0
data/lib/lingua_ruby/profiles/hu.json +302 -0
data/lib/lingua_ruby/profiles/it.json +302 -0
data/lib/lingua_ruby/profiles/lt.json +294 -0
data/lib/lingua_ruby/profiles/lv.json +302 -0
data/lib/lingua_ruby/profiles/my.json +200 -0
data/lib/lingua_ruby/profiles/no.json +297 -0
data/lib/lingua_ruby/profiles/pl.json +302 -0
data/lib/lingua_ruby/profiles/pt.json +302 -0
data/lib/lingua_ruby/profiles/ro.json +302 -0
data/lib/lingua_ruby/profiles/ru.json +297 -0
data/lib/lingua_ruby/profiles/sk.json +302 -0
data/lib/lingua_ruby/profiles/sv.json +302 -0
data/lib/lingua_ruby/profiles/sw.json +268 -0
data/lib/lingua_ruby/profiles/ta.json +235 -0
data/lib/lingua_ruby/profiles/te.json +254 -0
data/lib/lingua_ruby/profiles/th.json +251 -0
data/lib/lingua_ruby/profiles/tl.json +302 -0
data/lib/lingua_ruby/profiles/tr.json +302 -0
data/lib/lingua_ruby/profiles/uk.json +302 -0
data/lib/lingua_ruby/profiles/ur.json +232 -0
data/lib/lingua_ruby/profiles/vi.json +277 -0
data/lib/lingua_ruby/profiles/yo.json +245 -0
data/lib/lingua_ruby/profiles/zu.json +302 -0
data/lib/lingua_ruby/result.rb +13 -26
data/lib/lingua_ruby/version.rb +1 -1
data/lib/lingua_ruby.rb +4 -0
metadata +41 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4e5394fb4ef82d685b4d202a1d621f17aad0630f7afb635f86888d41c59a0e8a
-  data.tar.gz: 07ef998b77fb36d9aa37ca93e9a3ab195b148ff7b610fb0eb31c1cbd6c699eaa
+  metadata.gz: bc3f5643c02cc99f8c8c9dbc5ddf183a6ba1cedc52e3fec442df40e8d93f14a2
+  data.tar.gz: e933d5754d9ccb59b767484b38a3dad6cb0ef7dd4c7ce75ef0825cf222ff0b08
 SHA512:
-  metadata.gz: c179fc508bfec2d93849613674bfb1cacad889b81a2975ca703553a7133f1bb2d04688137c229b8ed4c97a2e25ab973de10554ea17c557933e98376963fdffa6
-  data.tar.gz: '096782d37ce17b2dbdfbc4bccfb056658abc23076fafc5b923a97fb52a62f0c226a830fea7d8bf35e48d493cb557d1d4122f14d418380df0ce627ddb60acf01a'
+  metadata.gz: 2b06daef47d7aa93ef113206299a5f202a938d2a69cb913bb066fae5bf0c301ea7a3fdc846bf44a6f97c80a7c0ae366965e573e3c18f0d10d5221e8fe181adeb
+  data.tar.gz: d02fa38feff5929c523c9ee1db261a0c788f29d073fba6ca7a27bbca3f63b772eab85751aa59bdb2c10300435a0990c52201f7a9bba9b60705359ea8b9fb90f9

data/README.md CHANGED Viewed

@@ -1,11 +1,11 @@
 # langdetect-ruby
-Pure Ruby language detection using character n-gram frequency profiles.
+Language detection for Ruby using n-gram profiles. Supports 10+ languages with script-based fast paths for CJK, Arabic, Thai, and Devanagari.
 ## Installation
 ```ruby
-gem "langdetect-ruby", "~> 0.1"
+gem "langdetect-ruby"
 ```
 ## Usage
@@ -13,20 +13,31 @@ gem "langdetect-ruby", "~> 0.1"
 ```ruby
 require "lingua_ruby"
-result = LinguaRuby.detect("Selamat pagi, apa kabar?")
-result.language   # => :id
-result.confidence # => 0.94
-result.name       # => "Indonesian"
-results = LinguaRuby.detect_all("Good morning everyone")
-detector = LinguaRuby::Detector.new(languages: [:id, :en, :ms])
-detector.detect("Ini bahasa apa?")
+# Single detection
+result = LinguaRuby.detect("This is an English sentence")
+result.language  # => :en
+result.confidence  # => 0.92
+# Batch detection
+results = LinguaRuby.detect_batch([
+  "Hello world",
+  "Halo dunia",
+  "こんにちは世界"
+])
+# Restrict to specific languages
+detector = LinguaRuby::Detector.new(languages: [:en, :id, :ms])
+result = detector.detect("Selamat pagi")
 ```
-## Supported Languages
+## Features
-Indonesian, English, Malay, Javanese, Sundanese, Dutch, Arabic, Chinese, Japanese, Korean
+- N-gram profile comparison with normalized confidence (0.0-1.0)
+- CJK/Arabic/Thai/Devanagari script fast-path detection
+- Short text mode (< 20 chars) with higher n-gram orders
+- Batch detection with single profile load
+- Indonesian/Malay/Sundanese differentiation
+- Input validation and error handling
 ## License

data/langdetect-ruby.gemspec CHANGED Viewed

@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
   spec.authors = ["Johannes Dwi Cahyo"]
   spec.email = ["johannes@example.com"]
   spec.summary = "Language detection for Ruby using n-gram profiles"
-  spec.description = "Pure Ruby language detection library using character n-gram frequency profiles. Detects 50+ languages with high accuracy."
+  spec.description = "Pure Ruby language detection library using character n-gram frequency profiles. Detects 48 languages including European, Asian, and African languages with script-based fast-path and mixed-language segment detection."
   spec.homepage = "https://github.com/johannesdwicahyo/lingua-ruby"
   spec.license = "MIT"
   spec.required_ruby_version = ">= 3.0.0"

data/lib/lingua_ruby/configuration.rb CHANGED Viewed

@@ -3,13 +3,16 @@
 module LinguaRuby
   class Configuration
     attr_accessor :default_languages, :min_confidence, :ngram_range,
-                  :max_profile_size
+                  :max_profile_size, :custom_profiles_dir,
+                  :use_sigmoid_calibration
     def initialize
       @default_languages = nil
       @min_confidence = 0.1
       @ngram_range = 1..3
       @max_profile_size = 300
+      @custom_profiles_dir = nil
+      @use_sigmoid_calibration = false
     end
   end
 end

data/lib/lingua_ruby/detector.rb CHANGED Viewed

@@ -48,7 +48,8 @@ module LinguaRuby
       @min_confidence = min_confidence || config.min_confidence
       @ngram_range = ngram_range || config.ngram_range
       @max_profile_size = max_profile_size || config.max_profile_size
-      @loader = ProfileLoader.new
+      @use_sigmoid = config.use_sigmoid_calibration
+      @loader = ProfileLoader.new(custom_profiles_dir: config.custom_profiles_dir)
       @profiles = @loader.load_all(languages: @languages)
     end
@@ -58,6 +59,36 @@ module LinguaRuby
       results.first
     end
+    # Detect language segments within mixed-language text
+    def detect_segments(text, min_segment_length: 20)
+      validate_text!(text)
+      return [] if text.nil? || text.strip.empty?
+      sentences = text.split(/(?<=[.!?。！？])\s+/)
+      segments = []
+      current_lang = nil
+      current_text = ""
+      sentences.each do |sentence|
+        next if sentence.strip.empty?
+        result = detect(sentence)
+        lang = result&.language
+        if lang == current_lang || current_text.length < min_segment_length
+          current_text += " " unless current_text.empty?
+          current_text += sentence
+          current_lang ||= lang
+        else
+          segments << { language: current_lang, text: current_text, confidence: detect(current_text)&.confidence } unless current_text.empty?
+          current_lang = lang
+          current_text = sentence
+        end
+      end
+      segments << { language: current_lang, text: current_text, confidence: detect(current_text)&.confidence } unless current_text.empty?
+      segments
+    end
     def detect_all(text)
       validate_text!(text)
       return [] if text.nil? || text.strip.empty?
@@ -98,6 +129,11 @@ module LinguaRuby
         # Map [min_distance, max_distance] → [1.0, 0.0]
         confidence = 1.0 - ((dist - min_distance) / (max_distance - min_distance))
+        # Apply sigmoid calibration if enabled
+        if @use_sigmoid
+          confidence = sigmoid_calibrate(confidence)
+        end
         # For short texts, reduce confidence to reflect unreliability
         if short_text
           confidence *= 0.7
@@ -110,6 +146,22 @@ module LinguaRuby
              .select { |r| r.confidence >= @min_confidence }
     end
+    # Score profile quality (0.0–1.0) based on n-gram diversity and coverage
+    def self.profile_quality(profile)
+      return 0.0 unless profile.is_a?(Profile)
+      ngram_count = profile.ngrams.size
+      return 0.0 if ngram_count == 0
+      # Measure n-gram order diversity
+      orders = profile.ngrams.keys.map(&:length).uniq.sort
+      order_diversity = orders.size / 5.0  # max 5 orders
+      # Size score (more ngrams = better, up to 300)
+      size_score = [ngram_count / 300.0, 1.0].min
+      ((order_diversity + size_score) / 2.0).round(4)
+    end
     private
     def validate_text!(text)
@@ -210,5 +262,11 @@ module LinguaRuby
     def script_result(lang, confidence: 0.99)
       [Result.new(language: lang, confidence: confidence)]
     end
+    # Sigmoid calibration: maps linear [0,1] to sigmoid-shaped [0,1]
+    # Steeper around the middle, compresses extremes
+    def sigmoid_calibrate(x, k: 10.0)
+      1.0 / (1.0 + Math.exp(-k * (x - 0.5)))
+    end
   end
 end

data/lib/lingua_ruby/profile_loader.rb CHANGED Viewed

@@ -6,8 +6,9 @@ module LinguaRuby
   class ProfileLoader
     PROFILES_DIR = File.join(__dir__, "profiles")
-    def initialize
+    def initialize(custom_profiles_dir: nil)
       @cache = {}
+      @custom_dir = custom_profiles_dir
     end
     def load(language)
@@ -21,16 +22,35 @@ module LinguaRuby
     end
     def available_languages
-      Dir.glob(File.join(PROFILES_DIR, "*.json")).map do |f|
-        File.basename(f, ".json").to_sym
-      end.sort
+      dirs = [PROFILES_DIR]
+      dirs << @custom_dir if @custom_dir
+      dirs.flat_map do |dir|
+        Dir.glob(File.join(dir, "*.json")).map { |f| File.basename(f, ".json").to_sym }
+      end.uniq.sort
+    end
+    # Load a custom profile from a hash or file
+    def load_custom(language, ngrams: nil, path: nil)
+      if path
+        data = JSON.parse(File.read(path))
+        @cache[language.to_sym] = Profile.new(language: language, ngrams: data)
+      elsif ngrams
+        @cache[language.to_sym] = Profile.new(language: language, ngrams: ngrams)
+      else
+        raise ArgumentError, "Either ngrams: or path: must be provided"
+      end
     end
     private
     def load_from_file(language)
-      path = File.join(PROFILES_DIR, "#{language}.json")
-      raise Error, "No profile found for language: #{language}" unless File.exist?(path)
+      # Check custom dir first, then built-in
+      paths = []
+      paths << File.join(@custom_dir, "#{language}.json") if @custom_dir
+      paths << File.join(PROFILES_DIR, "#{language}.json")
+      path = paths.find { |p| File.exist?(p) }
+      raise Error, "No profile found for language: #{language}" unless path
       data = JSON.parse(File.read(path))
       Profile.new(language: language, ngrams: data)

data/lib/lingua_ruby/profiles/am.json ADDED Viewed

@@ -0,0 +1,193 @@
+{
+  "ት": 0,
+  "ያ": 1,
+  " አ": 2,
+  "ያ ": 3,
+  "ጵያ": 4,
+  "ዮጵ": 5,
+  "ትዮ": 6,
+  "ኢት": 7,
+  "ኢትዮ": 8,
+  "ትዮጵ": 9,
+  "ቋ": 10,
+  "ዮጵያ": 11,
+  "ና": 12,
+  "ጵያ ": 13,
+  "ዮ": 14,
+  "ጵ": 15,
+  "አ": 16,
+  "ኢ": 17,
+  "የ": 18,
+  "ገ": 19,
+  "ር": 20,
+  "ም": 21,
+  " የ": 22,
+  " ናት": 23,
+  "ት ": 24,
+  "ናት": 25,
+  " ና": 26,
+  "ቋንቋ": 27,
+  " ቋን": 28,
+  "ናት ": 29,
+  "ር ና": 30,
+  "ገር ": 31,
+  "ሀገር": 32,
+  "ንቋ ": 33,
+  " ሀ": 34,
+  "ሀገ": 35,
+  "ገር": 36,
+  "ር ": 37,
+  " የኢ": 38,
+  "የኢት": 39,
+  " ቋ": 40,
+  "ቋን": 41,
+  "ንቋ": 42,
+  "ቋ ": 43,
+  "የኢ": 44,
+  " ሀገ": 45,
+  "በ": 46,
+  "ስ": 47,
+  "ሀ": 48,
+  "ሪ": 49,
+  "ማ": 50,
+  "ን": 51,
+  "ራ": 52,
+  "አፍሪ": 53,
+  " አፍ": 54,
+  "ቅ አ": 55,
+  "ራቅ ": 56,
+  "ቋ የ": 57,
+  "ላ": 58,
+  "ያ ብ": 59,
+  "ምስራ": 60,
+  "ስራቅ": 61,
+  "ና ከ": 62,
+  "ፍሪካ": 63,
+  "ሪካ ": 64,
+  "ካ የ": 65,
+  " የም": 66,
+  "የምት": 67,
+  "ኛ ቋ": 68,
+  "ርኛ ": 69,
+  "ምትገ": 70,
+  "ማርኛ": 71,
+  "አማር": 72,
+  " አማ": 73,
+  "ት አ": 74,
+  "ትገኝ": 75,
+  "ገኝ ": 76,
+  "ኝ ሀ": 77,
+  " ከተ": 78,
+  "ከተማ": 79,
+  "ተማ ": 80,
+  "ማ ና": 81,
+  "ት ኢ": 82,
+  " ኢት": 83,
+  "ያ ረ": 84,
+  " ረጅ": 85,
+  "ረጅም": 86,
+  "ጅም ": 87,
+  "ም ታ": 88,
+  " ታሪ": 89,
+  "ታሪክ": 90,
+  "ሪክ ": 91,
+  "ክ ያ": 92,
+  " ያላ": 93,
+  "ያላት": 94,
+  "ላት ": 95,
+  "ት ሀ": 96,
+  " ብሔ": 97,
+  "ብሔራ": 98,
+  "ሔራዊ": 99,
+  "ራዊ ": 100,
+  "ዊ ቋ": 101,
+  "ቋ ነ": 102,
+  " ነው": 103,
+  "ነው ": 104,
+  "ው አ": 105,
+  " አዲ": 106,
+  "አዲስ": 107,
+  "ዲስ ": 108,
+  "ስ አ": 109,
+  " አበ": 110,
+  "አበባ": 111,
+  "በባ ": 112,
+  "ባ የ": 113,
+  "ያ ዋ": 114,
+  " ዋና": 115,
+  "ዋና ": 116,
+  " በ": 117,
+  "በም": 118,
+  "ምስ": 119,
+  "ስራ": 120,
+  "ራቅ": 121,
+  "ቅ ": 122,
+  "አፍ": 123,
+  "ፍሪ": 124,
+  "ሪካ": 125,
+  "ካ ": 126,
+  "የም": 127,
+  "ምት": 128,
+  "ትገ": 129,
+  "ገኝ": 130,
+  "ኝ ": 131,
+  "አማ": 132,
+  "ማር": 133,
+  " ከ": 134,
+  "ቅ": 135,
+  "ፍ": 136,
+  "ካ": 137,
+  "ኝ": 138,
+  "ኛ": 139,
+  "ብ": 140,
+  "ሔ": 141,
+  "ዊ": 142,
+  "ነ": 143,
+  "ው": 144,
+  "ዲ": 145,
+  "ባ": 146,
+  "ዋ": 147,
+  "ከ": 148,
+  "ተ": 149,
+  "ረ": 150,
+  "ጅ": 151,
+  "ታ": 152,
+  "ክ": 153,
+  "ከተ": 154,
+  "ተማ": 155,
+  "ማ ": 156,
+  " ኢ": 157,
+  " ረ": 158,
+  "ረጅ": 159,
+  "ጅም": 160,
+  "ም ": 161,
+  " ታ": 162,
+  "ታሪ": 163,
+  "ሪክ": 164,
+  "ክ ": 165,
+  " ያ": 166,
+  "ያላ": 167,
+  "ላት": 168,
+  "ያ በ": 169,
+  " በም": 170,
+  "በምስ": 171,
+  "ርኛ": 172,
+  "ኛ ": 173,
+  " ብ": 174,
+  "ብሔ": 175,
+  "ሔራ": 176,
+  "ራዊ": 177,
+  "ዊ ": 178,
+  " ነ": 179,
+  "ነው": 180,
+  "ው ": 181,
+  "አዲ": 182,
+  "ዲስ": 183,
+  "ስ ": 184,
+  "አበ": 185,
+  "በባ": 186,
+  "ባ ": 187,
+  " ዋ": 188,
+  "ዋና": 189,
+  "ና ": 190
+}