langdetect-ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4e5394fb4ef82d685b4d202a1d621f17aad0630f7afb635f86888d41c59a0e8a
4
+ data.tar.gz: 07ef998b77fb36d9aa37ca93e9a3ab195b148ff7b610fb0eb31c1cbd6c699eaa
5
+ SHA512:
6
+ metadata.gz: c179fc508bfec2d93849613674bfb1cacad889b81a2975ca703553a7133f1bb2d04688137c229b8ed4c97a2e25ab973de10554ea17c557933e98376963fdffa6
7
+ data.tar.gz: '096782d37ce17b2dbdfbc4bccfb056658abc23076fafc5b923a97fb52a62f0c226a830fea7d8bf35e48d493cb557d1d4122f14d418380df0ce627ddb60acf01a'
data/CHANGELOG.md ADDED
@@ -0,0 +1,9 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 (2026-03-09)
4
+
5
+ - Initial release
6
+ - Language detection using character n-gram profiles
7
+ - Support for 10 languages: Indonesian, English, Malay, Javanese, Sundanese, Dutch, Arabic, Chinese, Japanese, Korean
8
+ - Module-level `detect` and `detect_all` API
9
+ - Configurable language set, confidence threshold, and n-gram range
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Johannes Dwi Cahyo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # langdetect-ruby
2
+
3
+ Pure Ruby language detection using character n-gram frequency profiles.
4
+
5
+ ## Installation
6
+
7
+ ```ruby
8
+ gem "langdetect-ruby", "~> 0.1"
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```ruby
14
+ require "lingua_ruby"
15
+
16
+ result = LinguaRuby.detect("Selamat pagi, apa kabar?")
17
+ result.language # => :id
18
+ result.confidence # => 0.94
19
+ result.name # => "Indonesian"
20
+
21
+ results = LinguaRuby.detect_all("Good morning everyone")
22
+
23
+ detector = LinguaRuby::Detector.new(languages: [:id, :en, :ms])
24
+ detector.detect("Ini bahasa apa?")
25
+ ```
26
+
27
+ ## Supported Languages
28
+
29
+ Indonesian, English, Malay, Javanese, Sundanese, Dutch, Arabic, Chinese, Japanese, Korean
30
+
31
+ ## License
32
+
33
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,11 @@
# frozen_string_literal: true

require "rake/testtask"

# Configure the Minitest runner: load paths for test helpers and lib code,
# and discover every test/**/test_*.rb file.
Rake::TestTask.new(:test) do |task|
  task.libs.push("test", "lib")
  task.test_files = FileList["test/**/test_*.rb"]
end

task default: :test
@@ -0,0 +1,33 @@
# frozen_string_literal: true

require_relative "lib/lingua_ruby/version"

Gem::Specification.new do |spec|
  spec.name = "langdetect-ruby"
  spec.version = LinguaRuby::VERSION
  spec.authors = ["Johannes Dwi Cahyo"]
  spec.email = ["johannes@example.com"]
  spec.summary = "Language detection for Ruby using n-gram profiles"
  # NOTE: description previously claimed "50+ languages"; the gem ships
  # profiles for 10 languages per the README and CHANGELOG.
  spec.description = "Pure Ruby language detection library using character n-gram frequency profiles. Detects 10 languages with high accuracy."
  spec.homepage = "https://github.com/johannesdwicahyo/lingua-ruby"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.0.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Ship library code, bundled JSON profiles, and top-level docs only.
  spec.files = Dir[
    "lib/**/*.rb",
    "lib/**/*.json",
    "README.md",
    "LICENSE",
    "CHANGELOG.md",
    "Rakefile",
    "langdetect-ruby.gemspec"
  ]
  spec.require_paths = ["lib"]

  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "rake", "~> 13.0"
end
@@ -0,0 +1,15 @@
# frozen_string_literal: true

module LinguaRuby
  # Library-wide default settings, consulted by Detector when it is built
  # without explicit options.
  class Configuration
    attr_accessor :default_languages, :min_confidence, :ngram_range,
                  :max_profile_size

    def initialize
      self.default_languages = nil # nil => use every available profile
      self.min_confidence = 0.1    # results below this are filtered out
      self.ngram_range = (1..3)    # n-gram orders used for profiling
      self.max_profile_size = 300  # top-N grams retained per profile
    end
  end
end
@@ -0,0 +1,214 @@
# frozen_string_literal: true

module LinguaRuby
  # Language detector. Combines a Unicode-script fast path (for scripts
  # that pin the language down, e.g. kana => Japanese) with character
  # n-gram profile matching for everything else.
  class Detector
    # Language symbols accepted by the +languages:+ option.
    VALID_LANGUAGES = Result::LANGUAGE_NAMES.keys.freeze

    # Unicode codepoint ranges for script-based fast-path detection.
    CJK_RANGES = [
      (0x4E00..0x9FFF),   # CJK Unified Ideographs
      (0x3400..0x4DBF),   # CJK Unified Ideographs Extension A
      (0x20000..0x2A6DF), # CJK Unified Ideographs Extension B
      (0x2A700..0x2B73F), # CJK Unified Ideographs Extension C
      (0x2B740..0x2B81F), # CJK Unified Ideographs Extension D
      (0xF900..0xFAFF),   # CJK Compatibility Ideographs
      (0x2F800..0x2FA1F)  # CJK Compatibility Ideographs Supplement
    ].freeze

    HIRAGANA_RANGE = (0x3040..0x309F)
    KATAKANA_RANGE = (0x30A0..0x30FF)
    HANGUL_RANGES = [
      (0xAC00..0xD7AF), # Hangul Syllables
      (0x1100..0x11FF), # Hangul Jamo
      (0x3130..0x318F)  # Hangul Compatibility Jamo
    ].freeze

    ARABIC_RANGES = [
      (0x0600..0x06FF), # Arabic
      (0x0750..0x077F), # Arabic Supplement
      (0x08A0..0x08FF), # Arabic Extended-A
      (0xFB50..0xFDFF), # Arabic Presentation Forms-A
      (0xFE70..0xFEFF)  # Arabic Presentation Forms-B
    ].freeze

    THAI_RANGE = (0x0E00..0x0E7F)
    DEVANAGARI_RANGE = (0x0900..0x097F)

    # Texts shorter than this (after strip) are matched with higher n-gram
    # orders and get their confidence discounted.
    SHORT_TEXT_THRESHOLD = 20

    # @param languages [Array<Symbol,String>, nil] restrict detection to
    #   these languages; nil means every available profile.
    # @param min_confidence [Float, nil] override the configured threshold.
    # @param ngram_range [Range, nil] override the configured n-gram orders.
    # @param max_profile_size [Integer, nil] override the profile size cap.
    # @raise [ArgumentError] if +languages+ is not an Array of known languages.
    def initialize(languages: nil, min_confidence: nil, ngram_range: nil, max_profile_size: nil)
      config = LinguaRuby.configuration

      # Fix: build the loader BEFORE validating so validate_languages! can
      # reuse it. Previously @loader was still nil at validation time, so a
      # throwaway ProfileLoader was constructed and the profiles directory
      # globbed a second time.
      @loader = ProfileLoader.new

      validate_languages!(languages) if languages

      @languages = languages
      @min_confidence = min_confidence || config.min_confidence
      @ngram_range = ngram_range || config.ngram_range
      @max_profile_size = max_profile_size || config.max_profile_size
      @profiles = @loader.load_all(languages: @languages)
    end

    # Return the single best Result for +text+, or nil when nothing clears
    # the confidence threshold. Validation happens inside detect_all.
    def detect(text)
      detect_all(text).first
    end

    # Return all candidate Results, best first, filtered by min confidence.
    # Blank or undecidable input yields an empty array.
    # @raise [ArgumentError] if +text+ is neither nil nor a String.
    def detect_all(text)
      validate_text!(text)
      return [] if text.nil? || text.strip.empty?

      # Script-based fast path for unambiguous writing systems.
      script_result = detect_by_script(text)
      return script_result if script_result

      short_text = text.strip.length < SHORT_TEXT_THRESHOLD

      # Short texts carry little signal per gram; widen to higher orders.
      ngram_range = if short_text
                      [@ngram_range.begin, 2].max..[@ngram_range.end, 5].max
                    else
                      @ngram_range
                    end

      text_profile = Ngram.profile(text, range: ngram_range, max_size: @max_profile_size)
      return [] if text_profile.empty?

      distances = @profiles.map { |lang, profile| [lang, profile.distance(text_profile)] }
      return [] if distances.empty?

      # Min-max normalize: closest profile maps to confidence 1.0, farthest
      # to 0.0 (inverse distance).
      min_distance = distances.map(&:last).min.to_f
      max_distance = distances.map(&:last).max.to_f

      # All distances identical => no way to discriminate between languages.
      return [] if min_distance == max_distance

      results = distances.map do |lang, dist|
        confidence = 1.0 - ((dist - min_distance) / (max_distance - min_distance))
        confidence *= 0.7 if short_text # short input is inherently unreliable

        Result.new(language: lang, confidence: confidence.round(4))
      end

      results.sort
             .select { |r| r.confidence >= @min_confidence }
    end

    private

    # Reject non-String, non-nil input early with a clear error.
    def validate_text!(text)
      return if text.nil? || text.is_a?(String)

      raise ArgumentError, "text must be a String, got #{text.class}"
    end

    # Ensure +languages+ is an Array whose entries name bundled profiles.
    def validate_languages!(languages)
      unless languages.is_a?(Array)
        raise ArgumentError, "languages must be an Array, got #{languages.class}"
      end

      available = @loader.available_languages

      languages.each do |lang|
        unless available.include?(lang.to_sym)
          raise ArgumentError, "unknown language: #{lang.inspect}. Available: #{available.join(', ')}"
        end
      end
    end

    # Fast path: classify by Unicode script when the script alone identifies
    # the language. Returns a one-element result array, or nil to fall back
    # to n-gram matching.
    def detect_by_script(text)
      # Count only letters: drop whitespace, punctuation, symbols, digits.
      chars = text.strip.chars.reject { |c| c.match?(/[\s\p{P}\p{S}0-9]/) }
      return nil if chars.empty?

      cjk_count = 0
      hiragana_count = 0
      katakana_count = 0
      hangul_count = 0
      arabic_count = 0
      thai_count = 0
      devanagari_count = 0

      chars.each do |char|
        cp = char.ord
        if CJK_RANGES.any? { |r| r.include?(cp) }
          cjk_count += 1
        elsif HIRAGANA_RANGE.include?(cp)
          hiragana_count += 1
        elsif KATAKANA_RANGE.include?(cp)
          katakana_count += 1
        elsif HANGUL_RANGES.any? { |r| r.include?(cp) }
          hangul_count += 1
        elsif ARABIC_RANGES.any? { |r| r.include?(cp) }
          arabic_count += 1
        elsif THAI_RANGE.include?(cp)
          thai_count += 1
        elsif DEVANAGARI_RANGE.include?(cp)
          devanagari_count += 1
        end
      end

      total = chars.length.to_f
      threshold = 0.5 # at least half the letters must belong to the script

      # Japanese: kana is unique to Japanese, so a lower 20% bar suffices
      # (Japanese mixes kana with CJK ideographs).
      if (hiragana_count + katakana_count) / total >= 0.2
        return script_result(:ja) if language_available?(:ja)
      end

      # Korean: hangul.
      if hangul_count / total >= threshold
        return script_result(:ko) if language_available?(:ko)
      end

      # Chinese: CJK ideographs with no kana or hangul present at all.
      if cjk_count / total >= threshold && hiragana_count.zero? && katakana_count.zero? && hangul_count.zero?
        return script_result(:zh) if language_available?(:zh)
      end

      # Arabic script.
      if arabic_count / total >= threshold
        return script_result(:ar) if language_available?(:ar)
      end

      # Thai script.
      if thai_count / total >= threshold
        return script_result(:th) if language_available?(:th)
      end

      # Hindi (Devanagari script).
      if devanagari_count / total >= threshold
        return script_result(:hi) if language_available?(:hi)
      end

      nil
    end

    # True when +lang+ is in the configured language set (or no set given).
    def language_available?(lang)
      @languages.nil? || @languages.map(&:to_sym).include?(lang)
    end

    # Wrap a script-detected language in a near-certain single result.
    def script_result(lang, confidence: 0.99)
      [Result.new(language: lang, confidence: confidence)]
    end
  end
end
@@ -0,0 +1,34 @@
# frozen_string_literal: true

module LinguaRuby
  # Character n-gram extraction and ranked-profile construction.
  module Ngram
    # Count character n-grams of every order in +range+ over the normalized
    # text. Grams that are pure whitespace are skipped.
    #
    # @param text [String]
    # @param range [Range] n-gram orders, e.g. 1..3
    # @return [Hash{String => Integer}] gram => occurrence count
    def self.extract(text, range: 1..3)
      text = normalize(text)
      # Hoisted out of the loop: the char array is invariant across orders
      # (previously rebuilt for every n in the range).
      chars = text.chars
      ngrams = Hash.new(0)

      range.each do |n|
        (0..chars.length - n).each do |i|
          gram = chars[i, n].join
          ngrams[gram] += 1 unless gram.strip.empty?
        end
      end

      ngrams
    end

    # Build a rank profile: the +max_size+ most frequent grams mapped to
    # their rank (0 = most frequent).
    #
    # @return [Hash{String => Integer}] gram => rank
    def self.profile(text, range: 1..3, max_size: 300)
      counts = extract(text, range: range)
      sorted = counts.sort_by { |_, count| -count }
      sorted.first(max_size).each_with_index.map { |(gram, _), idx| [gram, idx] }.to_h
    end

    # Lowercase, strip digits, replace punctuation/symbols with spaces, and
    # collapse runs of whitespace.
    def self.normalize(text)
      text.downcase
          .gsub(/[0-9]/, "")
          .gsub(/[\p{P}\p{S}]/, " ")
          .gsub(/\s+/, " ")
          .strip
    end
  end
end
@@ -0,0 +1,31 @@
# frozen_string_literal: true

module LinguaRuby
  # A ranked character n-gram profile for a single language.
  class Profile
    attr_reader :language, :ngrams

    # @param language [Symbol, String] language identifier (stored as Symbol)
    # @param ngrams [Hash{String => Integer}] gram => rank (0 = most frequent)
    def initialize(language:, ngrams:)
      @language = language.to_sym
      @ngrams = ngrams
    end

    # Out-of-place rank distance: for each gram in +other_profile+, add the
    # absolute rank difference when this profile also ranks it, otherwise a
    # fixed penalty equal to this profile's size. Lower is more similar.
    def distance(other_profile)
      penalty = ngrams.size

      other_profile.sum do |gram, other_rank|
        if ngrams.key?(gram)
          (ngrams[gram] - other_rank).abs
        else
          penalty
        end
      end
    end

    # Plain-hash representation of this profile.
    def to_h
      { language: @language, ngrams: @ngrams }
    end
  end
end
@@ -0,0 +1,39 @@
# frozen_string_literal: true

require "json"

module LinguaRuby
  # Loads per-language n-gram profiles from bundled JSON files, memoizing
  # each parsed profile after its first load.
  class ProfileLoader
    PROFILES_DIR = File.join(__dir__, "profiles")

    def initialize
      @cache = {}
    end

    # Load (and cache) the profile for one language.
    def load(language)
      key = language.to_sym
      @cache[key] ||= load_from_file(language)
    end

    # Load every requested profile, keyed by language symbol. When
    # +languages+ is given, only those that are actually available are
    # loaded, preserving the caller's ordering.
    def load_all(languages: nil)
      wanted =
        if languages
          languages.map(&:to_sym) & available_languages
        else
          available_languages
        end

      wanted.each_with_object({}) { |lang, acc| acc[lang] = load(lang) }
    end

    # Sorted symbols for every profile JSON shipped with the gem.
    def available_languages
      Dir.glob(File.join(PROFILES_DIR, "*.json"))
         .map { |path| File.basename(path, ".json").to_sym }
         .sort
    end

    private

    # Parse one profile file into a Profile, raising when it is missing.
    def load_from_file(language)
      path = File.join(PROFILES_DIR, "#{language}.json")
      raise Error, "No profile found for language: #{language}" unless File.exist?(path)

      Profile.new(language: language, ngrams: JSON.parse(File.read(path)))
    end
  end
end