langdetect-ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4e5394fb4ef82d685b4d202a1d621f17aad0630f7afb635f86888d41c59a0e8a
4
+ data.tar.gz: 07ef998b77fb36d9aa37ca93e9a3ab195b148ff7b610fb0eb31c1cbd6c699eaa
5
+ SHA512:
6
+ metadata.gz: c179fc508bfec2d93849613674bfb1cacad889b81a2975ca703553a7133f1bb2d04688137c229b8ed4c97a2e25ab973de10554ea17c557933e98376963fdffa6
7
+ data.tar.gz: '096782d37ce17b2dbdfbc4bccfb056658abc23076fafc5b923a97fb52a62f0c226a830fea7d8bf35e48d493cb557d1d4122f14d418380df0ce627ddb60acf01a'
data/CHANGELOG.md ADDED
@@ -0,0 +1,9 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 (2026-03-09)
4
+
5
+ - Initial release
6
+ - Language detection using character n-gram profiles
7
+ - Support for 10 languages: Indonesian, English, Malay, Javanese, Sundanese, Dutch, Arabic, Chinese, Japanese, Korean
8
+ - Module-level `detect` and `detect_all` API
9
+ - Configurable language set, confidence threshold, and n-gram range
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Johannes Dwi Cahyo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # langdetect-ruby
2
+
3
+ Pure Ruby language detection using character n-gram frequency profiles.
4
+
5
+ ## Installation
6
+
7
+ ```ruby
8
+ gem "langdetect-ruby", "~> 0.1"
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```ruby
14
+ require "lingua_ruby"
15
+
16
+ result = LinguaRuby.detect("Selamat pagi, apa kabar?")
17
+ result.language # => :id
18
+ result.confidence # => 0.94
19
+ result.name # => "Indonesian"
20
+
21
+ results = LinguaRuby.detect_all("Good morning everyone")
22
+
23
+ detector = LinguaRuby::Detector.new(languages: [:id, :en, :ms])
24
+ detector.detect("Ini bahasa apa?")
25
+ ```
26
+
27
+ ## Supported Languages
28
+
29
+ Indonesian, English, Malay, Javanese, Sundanese, Dutch, Arabic, Chinese, Japanese, Korean
30
+
31
+ ## License
32
+
33
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,11 @@
# frozen_string_literal: true

require "rake/testtask"

# Configure the Minitest runner: load paths for test helpers and lib code,
# and discover every test/**/test_*.rb file.
Rake::TestTask.new(:test) do |task|
  task.libs.push("test", "lib")
  task.test_files = FileList["test/**/test_*.rb"]
end

task default: :test
@@ -0,0 +1,33 @@
# frozen_string_literal: true

require_relative "lib/lingua_ruby/version"

Gem::Specification.new do |spec|
  spec.name = "langdetect-ruby"
  spec.version = LinguaRuby::VERSION
  spec.authors = ["Johannes Dwi Cahyo"]
  spec.email = ["johannes@example.com"]
  spec.summary = "Language detection for Ruby using n-gram profiles"
  # NOTE: description previously claimed "50+ languages"; the gem ships
  # profiles for 10 languages per the README and CHANGELOG.
  spec.description = "Pure Ruby language detection library using character n-gram frequency profiles. Detects 10 languages with high accuracy."
  spec.homepage = "https://github.com/johannesdwicahyo/lingua-ruby"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.0.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Ship library code, bundled JSON profiles, and top-level docs only.
  spec.files = Dir[
    "lib/**/*.rb",
    "lib/**/*.json",
    "README.md",
    "LICENSE",
    "CHANGELOG.md",
    "Rakefile",
    "langdetect-ruby.gemspec"
  ]
  spec.require_paths = ["lib"]

  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "rake", "~> 13.0"
end
@@ -0,0 +1,15 @@
# frozen_string_literal: true

module LinguaRuby
  # Library-wide default settings, consulted by Detector when it is built
  # without explicit options.
  class Configuration
    attr_accessor :default_languages, :min_confidence, :ngram_range,
                  :max_profile_size

    def initialize
      self.default_languages = nil # nil => use every available profile
      self.min_confidence = 0.1    # results below this are filtered out
      self.ngram_range = (1..3)    # n-gram orders used for profiling
      self.max_profile_size = 300  # top-N grams retained per profile
    end
  end
end
@@ -0,0 +1,214 @@
# frozen_string_literal: true

module LinguaRuby
  # Language detector. Combines a Unicode-script fast path (for scripts
  # that pin the language down, e.g. kana => Japanese) with character
  # n-gram profile matching for everything else.
  class Detector
    # Language symbols accepted by the +languages:+ option.
    VALID_LANGUAGES = Result::LANGUAGE_NAMES.keys.freeze

    # Unicode codepoint ranges for script-based fast-path detection.
    CJK_RANGES = [
      (0x4E00..0x9FFF),   # CJK Unified Ideographs
      (0x3400..0x4DBF),   # CJK Unified Ideographs Extension A
      (0x20000..0x2A6DF), # CJK Unified Ideographs Extension B
      (0x2A700..0x2B73F), # CJK Unified Ideographs Extension C
      (0x2B740..0x2B81F), # CJK Unified Ideographs Extension D
      (0xF900..0xFAFF),   # CJK Compatibility Ideographs
      (0x2F800..0x2FA1F)  # CJK Compatibility Ideographs Supplement
    ].freeze

    HIRAGANA_RANGE = (0x3040..0x309F)
    KATAKANA_RANGE = (0x30A0..0x30FF)
    HANGUL_RANGES = [
      (0xAC00..0xD7AF), # Hangul Syllables
      (0x1100..0x11FF), # Hangul Jamo
      (0x3130..0x318F)  # Hangul Compatibility Jamo
    ].freeze

    ARABIC_RANGES = [
      (0x0600..0x06FF), # Arabic
      (0x0750..0x077F), # Arabic Supplement
      (0x08A0..0x08FF), # Arabic Extended-A
      (0xFB50..0xFDFF), # Arabic Presentation Forms-A
      (0xFE70..0xFEFF)  # Arabic Presentation Forms-B
    ].freeze

    THAI_RANGE = (0x0E00..0x0E7F)
    DEVANAGARI_RANGE = (0x0900..0x097F)

    # Texts shorter than this (after strip) are matched with higher n-gram
    # orders and get their confidence discounted.
    SHORT_TEXT_THRESHOLD = 20

    # @param languages [Array<Symbol,String>, nil] restrict detection to
    #   these languages; nil means every available profile.
    # @param min_confidence [Float, nil] override the configured threshold.
    # @param ngram_range [Range, nil] override the configured n-gram orders.
    # @param max_profile_size [Integer, nil] override the profile size cap.
    # @raise [ArgumentError] if +languages+ is not an Array of known languages.
    def initialize(languages: nil, min_confidence: nil, ngram_range: nil, max_profile_size: nil)
      config = LinguaRuby.configuration

      # Fix: build the loader BEFORE validating so validate_languages! can
      # reuse it. Previously @loader was still nil at validation time, so a
      # throwaway ProfileLoader was constructed and the profiles directory
      # globbed a second time.
      @loader = ProfileLoader.new

      validate_languages!(languages) if languages

      @languages = languages
      @min_confidence = min_confidence || config.min_confidence
      @ngram_range = ngram_range || config.ngram_range
      @max_profile_size = max_profile_size || config.max_profile_size
      @profiles = @loader.load_all(languages: @languages)
    end

    # Return the single best Result for +text+, or nil when nothing clears
    # the confidence threshold. Validation happens inside detect_all.
    def detect(text)
      detect_all(text).first
    end

    # Return all candidate Results, best first, filtered by min confidence.
    # Blank or undecidable input yields an empty array.
    # @raise [ArgumentError] if +text+ is neither nil nor a String.
    def detect_all(text)
      validate_text!(text)
      return [] if text.nil? || text.strip.empty?

      # Script-based fast path for unambiguous writing systems.
      script_result = detect_by_script(text)
      return script_result if script_result

      short_text = text.strip.length < SHORT_TEXT_THRESHOLD

      # Short texts carry little signal per gram; widen to higher orders.
      ngram_range = if short_text
                      [@ngram_range.begin, 2].max..[@ngram_range.end, 5].max
                    else
                      @ngram_range
                    end

      text_profile = Ngram.profile(text, range: ngram_range, max_size: @max_profile_size)
      return [] if text_profile.empty?

      distances = @profiles.map { |lang, profile| [lang, profile.distance(text_profile)] }
      return [] if distances.empty?

      # Min-max normalize: closest profile maps to confidence 1.0, farthest
      # to 0.0 (inverse distance).
      min_distance = distances.map(&:last).min.to_f
      max_distance = distances.map(&:last).max.to_f

      # All distances identical => no way to discriminate between languages.
      return [] if min_distance == max_distance

      results = distances.map do |lang, dist|
        confidence = 1.0 - ((dist - min_distance) / (max_distance - min_distance))
        confidence *= 0.7 if short_text # short input is inherently unreliable

        Result.new(language: lang, confidence: confidence.round(4))
      end

      results.sort
             .select { |r| r.confidence >= @min_confidence }
    end

    private

    # Reject non-String, non-nil input early with a clear error.
    def validate_text!(text)
      return if text.nil? || text.is_a?(String)

      raise ArgumentError, "text must be a String, got #{text.class}"
    end

    # Ensure +languages+ is an Array whose entries name bundled profiles.
    def validate_languages!(languages)
      unless languages.is_a?(Array)
        raise ArgumentError, "languages must be an Array, got #{languages.class}"
      end

      available = @loader.available_languages

      languages.each do |lang|
        unless available.include?(lang.to_sym)
          raise ArgumentError, "unknown language: #{lang.inspect}. Available: #{available.join(', ')}"
        end
      end
    end

    # Fast path: classify by Unicode script when the script alone identifies
    # the language. Returns a one-element result array, or nil to fall back
    # to n-gram matching.
    def detect_by_script(text)
      # Count only letters: drop whitespace, punctuation, symbols, digits.
      chars = text.strip.chars.reject { |c| c.match?(/[\s\p{P}\p{S}0-9]/) }
      return nil if chars.empty?

      cjk_count = 0
      hiragana_count = 0
      katakana_count = 0
      hangul_count = 0
      arabic_count = 0
      thai_count = 0
      devanagari_count = 0

      chars.each do |char|
        cp = char.ord
        if CJK_RANGES.any? { |r| r.include?(cp) }
          cjk_count += 1
        elsif HIRAGANA_RANGE.include?(cp)
          hiragana_count += 1
        elsif KATAKANA_RANGE.include?(cp)
          katakana_count += 1
        elsif HANGUL_RANGES.any? { |r| r.include?(cp) }
          hangul_count += 1
        elsif ARABIC_RANGES.any? { |r| r.include?(cp) }
          arabic_count += 1
        elsif THAI_RANGE.include?(cp)
          thai_count += 1
        elsif DEVANAGARI_RANGE.include?(cp)
          devanagari_count += 1
        end
      end

      total = chars.length.to_f
      threshold = 0.5 # at least half the letters must belong to the script

      # Japanese: kana is unique to Japanese, so a lower 20% bar suffices
      # (Japanese mixes kana with CJK ideographs).
      if (hiragana_count + katakana_count) / total >= 0.2
        return script_result(:ja) if language_available?(:ja)
      end

      # Korean: hangul.
      if hangul_count / total >= threshold
        return script_result(:ko) if language_available?(:ko)
      end

      # Chinese: CJK ideographs with no kana or hangul present at all.
      if cjk_count / total >= threshold && hiragana_count.zero? && katakana_count.zero? && hangul_count.zero?
        return script_result(:zh) if language_available?(:zh)
      end

      # Arabic script.
      if arabic_count / total >= threshold
        return script_result(:ar) if language_available?(:ar)
      end

      # Thai script.
      if thai_count / total >= threshold
        return script_result(:th) if language_available?(:th)
      end

      # Hindi (Devanagari script).
      if devanagari_count / total >= threshold
        return script_result(:hi) if language_available?(:hi)
      end

      nil
    end

    # True when +lang+ is in the configured language set (or no set given).
    def language_available?(lang)
      @languages.nil? || @languages.map(&:to_sym).include?(lang)
    end

    # Wrap a script-detected language in a near-certain single result.
    def script_result(lang, confidence: 0.99)
      [Result.new(language: lang, confidence: confidence)]
    end
  end
end
@@ -0,0 +1,34 @@
# frozen_string_literal: true

module LinguaRuby
  # Character n-gram extraction and ranked-profile construction.
  module Ngram
    # Count character n-grams of every order in +range+ over the normalized
    # text. Grams that are pure whitespace are skipped.
    #
    # @param text [String]
    # @param range [Range] n-gram orders, e.g. 1..3
    # @return [Hash{String => Integer}] gram => occurrence count
    def self.extract(text, range: 1..3)
      text = normalize(text)
      # Hoisted out of the loop: the char array is invariant across orders
      # (previously rebuilt for every n in the range).
      chars = text.chars
      ngrams = Hash.new(0)

      range.each do |n|
        (0..chars.length - n).each do |i|
          gram = chars[i, n].join
          ngrams[gram] += 1 unless gram.strip.empty?
        end
      end

      ngrams
    end

    # Build a rank profile: the +max_size+ most frequent grams mapped to
    # their rank (0 = most frequent).
    #
    # @return [Hash{String => Integer}] gram => rank
    def self.profile(text, range: 1..3, max_size: 300)
      counts = extract(text, range: range)
      sorted = counts.sort_by { |_, count| -count }
      sorted.first(max_size).each_with_index.map { |(gram, _), idx| [gram, idx] }.to_h
    end

    # Lowercase, strip digits, replace punctuation/symbols with spaces, and
    # collapse runs of whitespace.
    def self.normalize(text)
      text.downcase
          .gsub(/[0-9]/, "")
          .gsub(/[\p{P}\p{S}]/, " ")
          .gsub(/\s+/, " ")
          .strip
    end
  end
end
@@ -0,0 +1,31 @@
# frozen_string_literal: true

module LinguaRuby
  # A ranked character n-gram profile for a single language.
  class Profile
    attr_reader :language, :ngrams

    # @param language [Symbol, String] language identifier (stored as Symbol)
    # @param ngrams [Hash{String => Integer}] gram => rank (0 = most frequent)
    def initialize(language:, ngrams:)
      @language = language.to_sym
      @ngrams = ngrams
    end

    # Out-of-place rank distance: for each gram in +other_profile+, add the
    # absolute rank difference when this profile also ranks it, otherwise a
    # fixed penalty equal to this profile's size. Lower is more similar.
    def distance(other_profile)
      penalty = ngrams.size

      other_profile.sum do |gram, other_rank|
        if ngrams.key?(gram)
          (ngrams[gram] - other_rank).abs
        else
          penalty
        end
      end
    end

    # Plain-hash representation of this profile.
    def to_h
      { language: @language, ngrams: @ngrams }
    end
  end
end
@@ -0,0 +1,39 @@
# frozen_string_literal: true

require "json"

module LinguaRuby
  # Loads per-language n-gram profiles from bundled JSON files, memoizing
  # each parsed profile after its first load.
  class ProfileLoader
    PROFILES_DIR = File.join(__dir__, "profiles")

    def initialize
      @cache = {}
    end

    # Load (and cache) the profile for one language.
    def load(language)
      key = language.to_sym
      @cache[key] ||= load_from_file(language)
    end

    # Load every requested profile, keyed by language symbol. When
    # +languages+ is given, only those that are actually available are
    # loaded, preserving the caller's ordering.
    def load_all(languages: nil)
      wanted =
        if languages
          languages.map(&:to_sym) & available_languages
        else
          available_languages
        end

      wanted.each_with_object({}) { |lang, acc| acc[lang] = load(lang) }
    end

    # Sorted symbols for every profile JSON shipped with the gem.
    def available_languages
      Dir.glob(File.join(PROFILES_DIR, "*.json"))
         .map { |path| File.basename(path, ".json").to_sym }
         .sort
    end

    private

    # Parse one profile file into a Profile, raising when it is missing.
    def load_from_file(language)
      path = File.join(PROFILES_DIR, "#{language}.json")
      raise Error, "No profile found for language: #{language}" unless File.exist?(path)

      Profile.new(language: language, ngrams: JSON.parse(File.read(path)))
    end
  end
end