langdetect-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +24 -13
  3. data/langdetect-ruby.gemspec +1 -1
  4. data/lib/lingua_ruby/configuration.rb +4 -1
  5. data/lib/lingua_ruby/detector.rb +59 -1
  6. data/lib/lingua_ruby/profile_loader.rb +26 -6
  7. data/lib/lingua_ruby/profiles/am.json +193 -0
  8. data/lib/lingua_ruby/profiles/bg.json +290 -0
  9. data/lib/lingua_ruby/profiles/bn.json +211 -0
  10. data/lib/lingua_ruby/profiles/cs.json +302 -0
  11. data/lib/lingua_ruby/profiles/da.json +302 -0
  12. data/lib/lingua_ruby/profiles/de.json +302 -0
  13. data/lib/lingua_ruby/profiles/el.json +302 -0
  14. data/lib/lingua_ruby/profiles/es.json +302 -0
  15. data/lib/lingua_ruby/profiles/et.json +289 -0
  16. data/lib/lingua_ruby/profiles/fa.json +234 -0
  17. data/lib/lingua_ruby/profiles/fi.json +284 -0
  18. data/lib/lingua_ruby/profiles/fr.json +302 -0
  19. data/lib/lingua_ruby/profiles/ha.json +302 -0
  20. data/lib/lingua_ruby/profiles/hi.json +255 -0
  21. data/lib/lingua_ruby/profiles/hr.json +302 -0
  22. data/lib/lingua_ruby/profiles/hu.json +302 -0
  23. data/lib/lingua_ruby/profiles/it.json +302 -0
  24. data/lib/lingua_ruby/profiles/lt.json +294 -0
  25. data/lib/lingua_ruby/profiles/lv.json +302 -0
  26. data/lib/lingua_ruby/profiles/my.json +200 -0
  27. data/lib/lingua_ruby/profiles/no.json +297 -0
  28. data/lib/lingua_ruby/profiles/pl.json +302 -0
  29. data/lib/lingua_ruby/profiles/pt.json +302 -0
  30. data/lib/lingua_ruby/profiles/ro.json +302 -0
  31. data/lib/lingua_ruby/profiles/ru.json +297 -0
  32. data/lib/lingua_ruby/profiles/sk.json +302 -0
  33. data/lib/lingua_ruby/profiles/sv.json +302 -0
  34. data/lib/lingua_ruby/profiles/sw.json +268 -0
  35. data/lib/lingua_ruby/profiles/ta.json +235 -0
  36. data/lib/lingua_ruby/profiles/te.json +254 -0
  37. data/lib/lingua_ruby/profiles/th.json +251 -0
  38. data/lib/lingua_ruby/profiles/tl.json +302 -0
  39. data/lib/lingua_ruby/profiles/tr.json +302 -0
  40. data/lib/lingua_ruby/profiles/uk.json +302 -0
  41. data/lib/lingua_ruby/profiles/ur.json +232 -0
  42. data/lib/lingua_ruby/profiles/vi.json +277 -0
  43. data/lib/lingua_ruby/profiles/yo.json +245 -0
  44. data/lib/lingua_ruby/profiles/zu.json +302 -0
  45. data/lib/lingua_ruby/result.rb +13 -26
  46. data/lib/lingua_ruby/version.rb +1 -1
  47. data/lib/lingua_ruby.rb +4 -0
  48. metadata +41 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4e5394fb4ef82d685b4d202a1d621f17aad0630f7afb635f86888d41c59a0e8a
4
- data.tar.gz: 07ef998b77fb36d9aa37ca93e9a3ab195b148ff7b610fb0eb31c1cbd6c699eaa
3
+ metadata.gz: bc3f5643c02cc99f8c8c9dbc5ddf183a6ba1cedc52e3fec442df40e8d93f14a2
4
+ data.tar.gz: e933d5754d9ccb59b767484b38a3dad6cb0ef7dd4c7ce75ef0825cf222ff0b08
5
5
  SHA512:
6
- metadata.gz: c179fc508bfec2d93849613674bfb1cacad889b81a2975ca703553a7133f1bb2d04688137c229b8ed4c97a2e25ab973de10554ea17c557933e98376963fdffa6
7
- data.tar.gz: '096782d37ce17b2dbdfbc4bccfb056658abc23076fafc5b923a97fb52a62f0c226a830fea7d8bf35e48d493cb557d1d4122f14d418380df0ce627ddb60acf01a'
6
+ metadata.gz: 2b06daef47d7aa93ef113206299a5f202a938d2a69cb913bb066fae5bf0c301ea7a3fdc846bf44a6f97c80a7c0ae366965e573e3c18f0d10d5221e8fe181adeb
7
+ data.tar.gz: d02fa38feff5929c523c9ee1db261a0c788f29d073fba6ca7a27bbca3f63b772eab85751aa59bdb2c10300435a0990c52201f7a9bba9b60705359ea8b9fb90f9
data/README.md CHANGED
@@ -1,11 +1,11 @@
1
1
  # langdetect-ruby
2
2
 
3
- Pure Ruby language detection using character n-gram frequency profiles.
3
+ Language detection for Ruby using n-gram profiles. Supports 10+ languages with script-based fast paths for CJK, Arabic, Thai, and Devanagari.
4
4
 
5
5
  ## Installation
6
6
 
7
7
  ```ruby
8
- gem "langdetect-ruby", "~> 0.1"
8
+ gem "langdetect-ruby"
9
9
  ```
10
10
 
11
11
  ## Usage
@@ -13,20 +13,31 @@ gem "langdetect-ruby", "~> 0.1"
13
13
  ```ruby
14
14
  require "lingua_ruby"
15
15
 
16
- result = LinguaRuby.detect("Selamat pagi, apa kabar?")
17
- result.language # => :id
18
- result.confidence # => 0.94
19
- result.name # => "Indonesian"
20
-
21
- results = LinguaRuby.detect_all("Good morning everyone")
22
-
23
- detector = LinguaRuby::Detector.new(languages: [:id, :en, :ms])
24
- detector.detect("Ini bahasa apa?")
16
+ # Single detection
17
+ result = LinguaRuby.detect("This is an English sentence")
18
+ result.language # => :en
19
+ result.confidence # => 0.92
20
+
21
+ # Batch detection
22
+ results = LinguaRuby.detect_batch([
23
+ "Hello world",
24
+ "Halo dunia",
25
+ "こんにちは世界"
26
+ ])
27
+
28
+ # Restrict to specific languages
29
+ detector = LinguaRuby::Detector.new(languages: [:en, :id, :ms])
30
+ result = detector.detect("Selamat pagi")
25
31
  ```
26
32
 
27
- ## Supported Languages
33
+ ## Features
28
34
 
29
- Indonesian, English, Malay, Javanese, Sundanese, Dutch, Arabic, Chinese, Japanese, Korean
35
+ - N-gram profile comparison with normalized confidence (0.0-1.0)
36
+ - CJK/Arabic/Thai/Devanagari script fast-path detection
37
+ - Short text mode (< 20 chars) with higher n-gram orders
38
+ - Batch detection with single profile load
39
+ - Indonesian/Malay/Sundanese differentiation
40
+ - Input validation and error handling
30
41
 
31
42
  ## License
32
43
 
data/langdetect-ruby.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |spec|
8
8
  spec.authors = ["Johannes Dwi Cahyo"]
9
9
  spec.email = ["johannes@example.com"]
10
10
  spec.summary = "Language detection for Ruby using n-gram profiles"
11
- spec.description = "Pure Ruby language detection library using character n-gram frequency profiles. Detects 50+ languages with high accuracy."
11
+ spec.description = "Pure Ruby language detection library using character n-gram frequency profiles. Detects 48 languages including European, Asian, and African languages with script-based fast-path and mixed-language segment detection."
12
12
  spec.homepage = "https://github.com/johannesdwicahyo/lingua-ruby"
13
13
  spec.license = "MIT"
14
14
  spec.required_ruby_version = ">= 3.0.0"
data/lib/lingua_ruby/configuration.rb CHANGED
@@ -3,13 +3,16 @@
3
3
  module LinguaRuby
4
4
  class Configuration
5
5
  attr_accessor :default_languages, :min_confidence, :ngram_range,
6
- :max_profile_size
6
+ :max_profile_size, :custom_profiles_dir,
7
+ :use_sigmoid_calibration
7
8
 
8
9
  def initialize
9
10
  @default_languages = nil
10
11
  @min_confidence = 0.1
11
12
  @ngram_range = 1..3
12
13
  @max_profile_size = 300
14
+ @custom_profiles_dir = nil
15
+ @use_sigmoid_calibration = false
13
16
  end
14
17
  end
15
18
  end
data/lib/lingua_ruby/detector.rb CHANGED
@@ -48,7 +48,8 @@ module LinguaRuby
48
48
  @min_confidence = min_confidence || config.min_confidence
49
49
  @ngram_range = ngram_range || config.ngram_range
50
50
  @max_profile_size = max_profile_size || config.max_profile_size
51
- @loader = ProfileLoader.new
51
+ @use_sigmoid = config.use_sigmoid_calibration
52
+ @loader = ProfileLoader.new(custom_profiles_dir: config.custom_profiles_dir)
52
53
  @profiles = @loader.load_all(languages: @languages)
53
54
  end
54
55
 
@@ -58,6 +59,36 @@ module LinguaRuby
58
59
  results.first
59
60
  end
60
61
 
62
+ # Detect language segments within mixed-language text
63
+ def detect_segments(text, min_segment_length: 20)
64
+ validate_text!(text)
65
+ return [] if text.nil? || text.strip.empty?
66
+
67
+ sentences = text.split(/(?<=[.!?。!?])\s+/)
68
+ segments = []
69
+ current_lang = nil
70
+ current_text = ""
71
+
72
+ sentences.each do |sentence|
73
+ next if sentence.strip.empty?
74
+ result = detect(sentence)
75
+ lang = result&.language
76
+
77
+ if lang == current_lang || current_text.length < min_segment_length
78
+ current_text += " " unless current_text.empty?
79
+ current_text += sentence
80
+ current_lang ||= lang
81
+ else
82
+ segments << { language: current_lang, text: current_text, confidence: detect(current_text)&.confidence } unless current_text.empty?
83
+ current_lang = lang
84
+ current_text = sentence
85
+ end
86
+ end
87
+
88
+ segments << { language: current_lang, text: current_text, confidence: detect(current_text)&.confidence } unless current_text.empty?
89
+ segments
90
+ end
91
+
61
92
  def detect_all(text)
62
93
  validate_text!(text)
63
94
  return [] if text.nil? || text.strip.empty?
@@ -98,6 +129,11 @@ module LinguaRuby
98
129
  # Map [min_distance, max_distance] → [1.0, 0.0]
99
130
  confidence = 1.0 - ((dist - min_distance) / (max_distance - min_distance))
100
131
 
132
+ # Apply sigmoid calibration if enabled
133
+ if @use_sigmoid
134
+ confidence = sigmoid_calibrate(confidence)
135
+ end
136
+
101
137
  # For short texts, reduce confidence to reflect unreliability
102
138
  if short_text
103
139
  confidence *= 0.7
@@ -110,6 +146,22 @@ module LinguaRuby
110
146
  .select { |r| r.confidence >= @min_confidence }
111
147
  end
112
148
 
149
+ # Score profile quality (0.0–1.0) based on n-gram diversity and coverage
150
+ def self.profile_quality(profile)
151
+ return 0.0 unless profile.is_a?(Profile)
152
+ ngram_count = profile.ngrams.size
153
+ return 0.0 if ngram_count == 0
154
+
155
+ # Measure n-gram order diversity
156
+ orders = profile.ngrams.keys.map(&:length).uniq.sort
157
+ order_diversity = orders.size / 5.0 # max 5 orders
158
+
159
+ # Size score (more ngrams = better, up to 300)
160
+ size_score = [ngram_count / 300.0, 1.0].min
161
+
162
+ ((order_diversity + size_score) / 2.0).round(4)
163
+ end
164
+
113
165
  private
114
166
 
115
167
  def validate_text!(text)
@@ -210,5 +262,11 @@ module LinguaRuby
210
262
  def script_result(lang, confidence: 0.99)
211
263
  [Result.new(language: lang, confidence: confidence)]
212
264
  end
265
+
266
+ # Sigmoid calibration: maps linear [0,1] to sigmoid-shaped [0,1]
267
+ # Steeper around the middle, compresses extremes
268
+ def sigmoid_calibrate(x, k: 10.0)
269
+ 1.0 / (1.0 + Math.exp(-k * (x - 0.5)))
270
+ end
213
271
  end
214
272
  end
data/lib/lingua_ruby/profile_loader.rb CHANGED
@@ -6,8 +6,9 @@ module LinguaRuby
6
6
  class ProfileLoader
7
7
  PROFILES_DIR = File.join(__dir__, "profiles")
8
8
 
9
- def initialize
9
+ def initialize(custom_profiles_dir: nil)
10
10
  @cache = {}
11
+ @custom_dir = custom_profiles_dir
11
12
  end
12
13
 
13
14
  def load(language)
@@ -21,16 +22,35 @@ module LinguaRuby
21
22
  end
22
23
 
23
24
  def available_languages
24
- Dir.glob(File.join(PROFILES_DIR, "*.json")).map do |f|
25
- File.basename(f, ".json").to_sym
26
- end.sort
25
+ dirs = [PROFILES_DIR]
26
+ dirs << @custom_dir if @custom_dir
27
+ dirs.flat_map do |dir|
28
+ Dir.glob(File.join(dir, "*.json")).map { |f| File.basename(f, ".json").to_sym }
29
+ end.uniq.sort
30
+ end
31
+
32
+ # Load a custom profile from a hash or file
33
+ def load_custom(language, ngrams: nil, path: nil)
34
+ if path
35
+ data = JSON.parse(File.read(path))
36
+ @cache[language.to_sym] = Profile.new(language: language, ngrams: data)
37
+ elsif ngrams
38
+ @cache[language.to_sym] = Profile.new(language: language, ngrams: ngrams)
39
+ else
40
+ raise ArgumentError, "Either ngrams: or path: must be provided"
41
+ end
27
42
  end
28
43
 
29
44
  private
30
45
 
31
46
  def load_from_file(language)
32
- path = File.join(PROFILES_DIR, "#{language}.json")
33
- raise Error, "No profile found for language: #{language}" unless File.exist?(path)
47
+ # Check custom dir first, then built-in
48
+ paths = []
49
+ paths << File.join(@custom_dir, "#{language}.json") if @custom_dir
50
+ paths << File.join(PROFILES_DIR, "#{language}.json")
51
+
52
+ path = paths.find { |p| File.exist?(p) }
53
+ raise Error, "No profile found for language: #{language}" unless path
34
54
 
35
55
  data = JSON.parse(File.read(path))
36
56
  Profile.new(language: language, ngrams: data)
data/lib/lingua_ruby/profiles/am.json ADDED
@@ -0,0 +1,193 @@
1
+ {
2
+ "ት": 0,
3
+ "ያ": 1,
4
+ " አ": 2,
5
+ "ያ ": 3,
6
+ "ጵያ": 4,
7
+ "ዮጵ": 5,
8
+ "ትዮ": 6,
9
+ "ኢት": 7,
10
+ "ኢትዮ": 8,
11
+ "ትዮጵ": 9,
12
+ "ቋ": 10,
13
+ "ዮጵያ": 11,
14
+ "ና": 12,
15
+ "ጵያ ": 13,
16
+ "ዮ": 14,
17
+ "ጵ": 15,
18
+ "አ": 16,
19
+ "ኢ": 17,
20
+ "የ": 18,
21
+ "ገ": 19,
22
+ "ር": 20,
23
+ "ም": 21,
24
+ " የ": 22,
25
+ " ናት": 23,
26
+ "ት ": 24,
27
+ "ናት": 25,
28
+ " ና": 26,
29
+ "ቋንቋ": 27,
30
+ " ቋን": 28,
31
+ "ናት ": 29,
32
+ "ር ና": 30,
33
+ "ገር ": 31,
34
+ "ሀገር": 32,
35
+ "ንቋ ": 33,
36
+ " ሀ": 34,
37
+ "ሀገ": 35,
38
+ "ገር": 36,
39
+ "ር ": 37,
40
+ " የኢ": 38,
41
+ "የኢት": 39,
42
+ " ቋ": 40,
43
+ "ቋን": 41,
44
+ "ንቋ": 42,
45
+ "ቋ ": 43,
46
+ "የኢ": 44,
47
+ " ሀገ": 45,
48
+ "በ": 46,
49
+ "ስ": 47,
50
+ "ሀ": 48,
51
+ "ሪ": 49,
52
+ "ማ": 50,
53
+ "ን": 51,
54
+ "ራ": 52,
55
+ "አፍሪ": 53,
56
+ " አፍ": 54,
57
+ "ቅ አ": 55,
58
+ "ራቅ ": 56,
59
+ "ቋ የ": 57,
60
+ "ላ": 58,
61
+ "ያ ብ": 59,
62
+ "ምስራ": 60,
63
+ "ስራቅ": 61,
64
+ "ና ከ": 62,
65
+ "ፍሪካ": 63,
66
+ "ሪካ ": 64,
67
+ "ካ የ": 65,
68
+ " የም": 66,
69
+ "የምት": 67,
70
+ "ኛ ቋ": 68,
71
+ "ርኛ ": 69,
72
+ "ምትገ": 70,
73
+ "ማርኛ": 71,
74
+ "አማር": 72,
75
+ " አማ": 73,
76
+ "ት አ": 74,
77
+ "ትገኝ": 75,
78
+ "ገኝ ": 76,
79
+ "ኝ ሀ": 77,
80
+ " ከተ": 78,
81
+ "ከተማ": 79,
82
+ "ተማ ": 80,
83
+ "ማ ና": 81,
84
+ "ት ኢ": 82,
85
+ " ኢት": 83,
86
+ "ያ ረ": 84,
87
+ " ረጅ": 85,
88
+ "ረጅም": 86,
89
+ "ጅም ": 87,
90
+ "ም ታ": 88,
91
+ " ታሪ": 89,
92
+ "ታሪክ": 90,
93
+ "ሪክ ": 91,
94
+ "ክ ያ": 92,
95
+ " ያላ": 93,
96
+ "ያላት": 94,
97
+ "ላት ": 95,
98
+ "ት ሀ": 96,
99
+ " ብሔ": 97,
100
+ "ብሔራ": 98,
101
+ "ሔራዊ": 99,
102
+ "ራዊ ": 100,
103
+ "ዊ ቋ": 101,
104
+ "ቋ ነ": 102,
105
+ " ነው": 103,
106
+ "ነው ": 104,
107
+ "ው አ": 105,
108
+ " አዲ": 106,
109
+ "አዲስ": 107,
110
+ "ዲስ ": 108,
111
+ "ስ አ": 109,
112
+ " አበ": 110,
113
+ "አበባ": 111,
114
+ "በባ ": 112,
115
+ "ባ የ": 113,
116
+ "ያ ዋ": 114,
117
+ " ዋና": 115,
118
+ "ዋና ": 116,
119
+ " በ": 117,
120
+ "በም": 118,
121
+ "ምስ": 119,
122
+ "ስራ": 120,
123
+ "ራቅ": 121,
124
+ "ቅ ": 122,
125
+ "አፍ": 123,
126
+ "ፍሪ": 124,
127
+ "ሪካ": 125,
128
+ "ካ ": 126,
129
+ "የም": 127,
130
+ "ምት": 128,
131
+ "ትገ": 129,
132
+ "ገኝ": 130,
133
+ "ኝ ": 131,
134
+ "አማ": 132,
135
+ "ማር": 133,
136
+ " ከ": 134,
137
+ "ቅ": 135,
138
+ "ፍ": 136,
139
+ "ካ": 137,
140
+ "ኝ": 138,
141
+ "ኛ": 139,
142
+ "ብ": 140,
143
+ "ሔ": 141,
144
+ "ዊ": 142,
145
+ "ነ": 143,
146
+ "ው": 144,
147
+ "ዲ": 145,
148
+ "ባ": 146,
149
+ "ዋ": 147,
150
+ "ከ": 148,
151
+ "ተ": 149,
152
+ "ረ": 150,
153
+ "ጅ": 151,
154
+ "ታ": 152,
155
+ "ክ": 153,
156
+ "ከተ": 154,
157
+ "ተማ": 155,
158
+ "ማ ": 156,
159
+ " ኢ": 157,
160
+ " ረ": 158,
161
+ "ረጅ": 159,
162
+ "ጅም": 160,
163
+ "ም ": 161,
164
+ " ታ": 162,
165
+ "ታሪ": 163,
166
+ "ሪክ": 164,
167
+ "ክ ": 165,
168
+ " ያ": 166,
169
+ "ያላ": 167,
170
+ "ላት": 168,
171
+ "ያ በ": 169,
172
+ " በም": 170,
173
+ "በምስ": 171,
174
+ "ርኛ": 172,
175
+ "ኛ ": 173,
176
+ " ብ": 174,
177
+ "ብሔ": 175,
178
+ "ሔራ": 176,
179
+ "ራዊ": 177,
180
+ "ዊ ": 178,
181
+ " ነ": 179,
182
+ "ነው": 180,
183
+ "ው ": 181,
184
+ "አዲ": 182,
185
+ "ዲስ": 183,
186
+ "ስ ": 184,
187
+ "አበ": 185,
188
+ "በባ": 186,
189
+ "ባ ": 187,
190
+ " ዋ": 188,
191
+ "ዋና": 189,
192
+ "ና ": 190
193
+ }