langdetect-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +24 -13
  3. data/langdetect-ruby.gemspec +1 -1
  4. data/lib/lingua_ruby/configuration.rb +4 -1
  5. data/lib/lingua_ruby/detector.rb +59 -1
  6. data/lib/lingua_ruby/profile_loader.rb +26 -6
  7. data/lib/lingua_ruby/profiles/am.json +193 -0
  8. data/lib/lingua_ruby/profiles/bg.json +290 -0
  9. data/lib/lingua_ruby/profiles/bn.json +211 -0
  10. data/lib/lingua_ruby/profiles/cs.json +302 -0
  11. data/lib/lingua_ruby/profiles/da.json +302 -0
  12. data/lib/lingua_ruby/profiles/de.json +302 -0
  13. data/lib/lingua_ruby/profiles/el.json +302 -0
  14. data/lib/lingua_ruby/profiles/es.json +302 -0
  15. data/lib/lingua_ruby/profiles/et.json +289 -0
  16. data/lib/lingua_ruby/profiles/fa.json +234 -0
  17. data/lib/lingua_ruby/profiles/fi.json +284 -0
  18. data/lib/lingua_ruby/profiles/fr.json +302 -0
  19. data/lib/lingua_ruby/profiles/ha.json +302 -0
  20. data/lib/lingua_ruby/profiles/hi.json +255 -0
  21. data/lib/lingua_ruby/profiles/hr.json +302 -0
  22. data/lib/lingua_ruby/profiles/hu.json +302 -0
  23. data/lib/lingua_ruby/profiles/it.json +302 -0
  24. data/lib/lingua_ruby/profiles/lt.json +294 -0
  25. data/lib/lingua_ruby/profiles/lv.json +302 -0
  26. data/lib/lingua_ruby/profiles/my.json +200 -0
  27. data/lib/lingua_ruby/profiles/no.json +297 -0
  28. data/lib/lingua_ruby/profiles/pl.json +302 -0
  29. data/lib/lingua_ruby/profiles/pt.json +302 -0
  30. data/lib/lingua_ruby/profiles/ro.json +302 -0
  31. data/lib/lingua_ruby/profiles/ru.json +297 -0
  32. data/lib/lingua_ruby/profiles/sk.json +302 -0
  33. data/lib/lingua_ruby/profiles/sv.json +302 -0
  34. data/lib/lingua_ruby/profiles/sw.json +268 -0
  35. data/lib/lingua_ruby/profiles/ta.json +235 -0
  36. data/lib/lingua_ruby/profiles/te.json +254 -0
  37. data/lib/lingua_ruby/profiles/th.json +251 -0
  38. data/lib/lingua_ruby/profiles/tl.json +302 -0
  39. data/lib/lingua_ruby/profiles/tr.json +302 -0
  40. data/lib/lingua_ruby/profiles/uk.json +302 -0
  41. data/lib/lingua_ruby/profiles/ur.json +232 -0
  42. data/lib/lingua_ruby/profiles/vi.json +277 -0
  43. data/lib/lingua_ruby/profiles/yo.json +245 -0
  44. data/lib/lingua_ruby/profiles/zu.json +302 -0
  45. data/lib/lingua_ruby/result.rb +13 -26
  46. data/lib/lingua_ruby/version.rb +1 -1
  47. data/lib/lingua_ruby.rb +4 -0
  48. metadata +41 -2
@@ -0,0 +1,302 @@
1
+ {
2
+ "i": 0,
3
+ "a": 1,
4
+ "n": 2,
5
+ "u": 3,
6
+ "e": 4,
7
+ "l": 5,
8
+ "k": 6,
9
+ "m": 7,
10
+ "z": 8,
11
+ "in": 9,
12
+ "u ": 10,
13
+ "ni": 11,
14
+ "ng": 12,
15
+ "g": 13,
16
+ "a ": 14,
17
+ "zi": 15,
18
+ "iz": 16,
19
+ "s": 17,
20
+ "o": 18,
21
+ "im": 19,
22
+ "ing": 20,
23
+ " a": 21,
24
+ "gi": 22,
25
+ "en": 23,
26
+ "lu": 24,
27
+ "i ": 25,
28
+ "ngi": 26,
29
+ "nin": 27,
30
+ "b": 28,
31
+ "r": 29,
32
+ "eni": 30,
33
+ "ka": 31,
34
+ "ba": 32,
35
+ "ul": 33,
36
+ "u a": 34,
37
+ "ri": 35,
38
+ "izi": 36,
39
+ "mu": 37,
40
+ "h": 38,
41
+ "f": 39,
42
+ "giz": 40,
43
+ "ka ": 41,
44
+ "ika": 42,
45
+ "rik": 43,
46
+ "fri": 44,
47
+ "afr": 45,
48
+ " af": 46,
49
+ "mu ": 47,
50
+ "imu": 48,
51
+ "zim": 49,
52
+ "af": 50,
53
+ "ik": 51,
54
+ "fr": 52,
55
+ " e": 53,
56
+ "y": 54,
57
+ " i": 55,
58
+ "a i": 56,
59
+ "w": 57,
60
+ "ulu": 58,
61
+ "t": 59,
62
+ "an": 60,
63
+ "ab": 61,
64
+ "we": 62,
65
+ "se": 63,
66
+ "si": 64,
67
+ "e ": 65,
68
+ "ban": 66,
69
+ "aba": 67,
70
+ "li": 68,
71
+ "is": 69,
72
+ "il": 70,
73
+ "sen": 71,
74
+ "la": 72,
75
+ " k": 73,
76
+ "d": 74,
77
+ "em": 75,
78
+ " y": 76,
79
+ "i e": 77,
80
+ "as": 78,
81
+ "ase": 79,
82
+ "ntu": 80,
83
+ "ol": 81,
84
+ "kh": 82,
85
+ "hu": 83,
86
+ "hul": 84,
87
+ "khu": 85,
88
+ "mi": 86,
89
+ "tu ": 87,
90
+ "nt": 88,
91
+ "tu": 89,
92
+ "mi ": 90,
93
+ "imi": 91,
94
+ "lim": 92,
95
+ "lu ": 93,
96
+ "ni ": 94,
97
+ " ba": 95,
98
+ "akh": 96,
99
+ "ku": 97,
100
+ "wen": 98,
101
+ "ezi": 99,
102
+ " ya": 100,
103
+ "gi ": 101,
104
+ " is": 102,
105
+ "isi": 103,
106
+ "siz": 104,
107
+ "izu": 105,
108
+ "zul": 106,
109
+ " ab": 107,
110
+ "ant": 108,
111
+ "lo": 109,
112
+ "zu": 110,
113
+ "p": 111,
114
+ " b": 112,
115
+ "o ": 113,
116
+ "ez": 114,
117
+ "lw": 115,
118
+ " l": 116,
119
+ "ak": 117,
120
+ "ya": 118,
121
+ "go": 119,
122
+ " iy": 120,
123
+ "iyi": 121,
124
+ " em": 122,
125
+ "kak": 123,
126
+ " ka": 124,
127
+ "e k": 125,
128
+ "ise": 126,
129
+ "ok": 127,
130
+ "i y": 128,
131
+ "ye ": 129,
132
+ " en": 130,
133
+ "lum": 131,
134
+ " n": 132,
135
+ "uma": 133,
136
+ "ma ": 134,
137
+ "olu": 135,
138
+ "i o": 136,
139
+ "la ": 137,
140
+ "ma": 138,
141
+ "um": 139,
142
+ "lul": 140,
143
+ "uk": 141,
144
+ " o": 142,
145
+ " si": 143,
146
+ "u s": 144,
147
+ "et": 145,
148
+ " s": 146,
149
+ "ani": 147,
150
+ "ye": 148,
151
+ "el": 149,
152
+ "yi": 150,
153
+ " z": 151,
154
+ "sin": 152,
155
+ "ngo": 153,
156
+ "gol": 154,
157
+ "le": 155,
158
+ " lw": 156,
159
+ "bas": 157,
160
+ "lwe": 158,
161
+ "ud": 159,
162
+ "al": 160,
163
+ "lok": 161,
164
+ "th": 162,
165
+ "zin": 163,
166
+ "ini": 164,
167
+ " ez": 165,
168
+ "ili": 166,
169
+ "zil": 167,
170
+ "iy": 168,
171
+ " uk": 169,
172
+ "het": 170,
173
+ "eth": 171,
174
+ "thw": 172,
175
+ "the": 173,
176
+ "hwe": 174,
177
+ "ala": 175,
178
+ "mth": 176,
179
+ "emt": 177,
180
+ "a u": 178,
181
+ "eng": 179,
182
+ "oli": 180,
183
+ "jen": 181,
184
+ "i l": 182,
185
+ "lwa": 183,
186
+ "wab": 184,
187
+ "abo": 185,
188
+ "nje": 186,
189
+ " nj": 187,
190
+ "u n": 188,
191
+ "bak": 189,
192
+ "bo ": 190,
193
+ "a b": 191,
194
+ "j": 192,
195
+ "o l": 193,
196
+ " lo": 194,
197
+ "oku": 195,
198
+ "i b": 196,
199
+ "kuq": 197,
200
+ "a a": 198,
201
+ "uqa": 199,
202
+ "qal": 200,
203
+ "q": 201,
204
+ "v": 202,
205
+ "i z": 203,
206
+ " za": 204,
207
+ "zas": 205,
208
+ "a z": 206,
209
+ " zi": 207,
210
+ "ine": 208,
211
+ "nem": 209,
212
+ "emv": 210,
213
+ "mve": 211,
214
+ "vel": 212,
215
+ "ela": 213,
216
+ "lap": 214,
217
+ "aph": 215,
218
+ "phi": 216,
219
+ "hi ": 217,
220
+ "yaz": 218,
221
+ "azo": 219,
222
+ "zo ": 220,
223
+ "o e": 221,
224
+ "eml": 222,
225
+ "mla": 223,
226
+ "lan": 224,
227
+ "and": 225,
228
+ "ndw": 226,
229
+ "dwe": 227,
230
+ " om": 228,
231
+ "omu": 229,
232
+ "mud": 230,
233
+ "ude": 231,
234
+ "de ": 232,
235
+ "uku": 233,
236
+ "kud": 234,
237
+ "udl": 235,
238
+ "dla": 236,
239
+ "a k": 237,
240
+ " kw": 238,
241
+ "kwe": 239,
242
+ "wes": 240,
243
+ "esi": 241,
244
+ "int": 242,
245
+ "u k": 243,
246
+ " ku": 244,
247
+ "kub": 245,
248
+ "uba": 246,
249
+ "bal": 247,
250
+ "alu": 248,
251
+ "ule": 249,
252
+ "lek": 250,
253
+ "eki": 251,
254
+ "kil": 252,
255
+ "ile": 253,
256
+ "le ": 254,
257
+ "u e": 255,
258
+ "emp": 256,
259
+ "mpi": 257,
260
+ "pil": 258,
261
+ "ilw": 259,
262
+ "yab": 260,
263
+ "u b": 261,
264
+ " iz": 262,
265
+ "nd": 263,
266
+ "dw": 264,
267
+ "om": 265,
268
+ "de": 266,
269
+ "he": 267,
270
+ "mt": 268,
271
+ "ny": 269,
272
+ "un": 270,
273
+ "ha": 271,
274
+ "bh": 272,
275
+ "ob": 273,
276
+ "do": 274,
277
+ " d": 275,
278
+ "yiz": 276,
279
+ "izw": 277,
280
+ "zwe": 278,
281
+ "we ": 279,
282
+ "e e": 280,
283
+ " el": 281,
284
+ "eli": 282,
285
+ "lis": 283,
286
+ "ko": 284,
287
+ "u y": 285,
288
+ " ye": 286,
289
+ "e a": 287,
290
+ "ria": 288,
291
+ "nj": 289,
292
+ "je": 290,
293
+ "wa": 291,
294
+ "bo": 292,
295
+ "uq": 293,
296
+ "qa": 294,
297
+ " u": 295,
298
+ "dl": 296,
299
+ "hw": 297,
300
+ "kw": 298,
301
+ "es": 299
302
+ }
data/lib/lingua_ruby/result.rb CHANGED
@@ -5,32 +5,19 @@ module LinguaRuby
5
5
  attr_reader :language, :confidence, :name
6
6
 
7
7
  LANGUAGE_NAMES = {
8
- id: "Indonesian",
9
- en: "English",
10
- ms: "Malay",
11
- jv: "Javanese",
12
- su: "Sundanese",
13
- nl: "Dutch",
14
- ar: "Arabic",
15
- zh: "Chinese",
16
- ja: "Japanese",
17
- ko: "Korean",
18
- fr: "French",
19
- de: "German",
20
- es: "Spanish",
21
- pt: "Portuguese",
22
- it: "Italian",
23
- ru: "Russian",
24
- tr: "Turkish",
25
- pl: "Polish",
26
- sv: "Swedish",
27
- da: "Danish",
28
- no: "Norwegian",
29
- fi: "Finnish",
30
- th: "Thai",
31
- vi: "Vietnamese",
32
- tl: "Tagalog",
33
- hi: "Hindi"
8
+ id: "Indonesian", en: "English", ms: "Malay", jv: "Javanese",
9
+ su: "Sundanese", nl: "Dutch", ar: "Arabic", zh: "Chinese",
10
+ ja: "Japanese", ko: "Korean", fr: "French", de: "German",
11
+ es: "Spanish", pt: "Portuguese", it: "Italian", ru: "Russian",
12
+ tr: "Turkish", pl: "Polish", sv: "Swedish", da: "Danish",
13
+ no: "Norwegian", fi: "Finnish", th: "Thai", vi: "Vietnamese",
14
+ tl: "Tagalog", hi: "Hindi", bn: "Bengali", ta: "Tamil",
15
+ te: "Telugu", ur: "Urdu", fa: "Persian", my: "Burmese",
16
+ cs: "Czech", ro: "Romanian", hu: "Hungarian", el: "Greek",
17
+ bg: "Bulgarian", hr: "Croatian", sk: "Slovak", uk: "Ukrainian",
18
+ lt: "Lithuanian", lv: "Latvian", et: "Estonian",
19
+ sw: "Swahili", ha: "Hausa", yo: "Yoruba", am: "Amharic",
20
+ zu: "Zulu"
34
21
  }.freeze
35
22
 
36
23
  def initialize(language:, confidence:)
data/lib/lingua_ruby/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LinguaRuby
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/lingua_ruby.rb CHANGED
@@ -40,6 +40,10 @@ module LinguaRuby
40
40
  texts.map { |text| detector.detect(text) }
41
41
  end
42
42
 
43
+ def detect_segments(text, min_segment_length: 20)
44
+ default_detector.detect_segments(text, min_segment_length: min_segment_length)
45
+ end
46
+
43
47
  private
44
48
 
45
49
  def default_detector
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: langdetect-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo
@@ -38,7 +38,8 @@ dependencies:
38
38
  - !ruby/object:Gem::Version
39
39
  version: '13.0'
40
40
  description: Pure Ruby language detection library using character n-gram frequency
41
- profiles. Detects 50+ languages with high accuracy.
41
+ profiles. Detects 48 languages including European, Asian, and African languages
42
+ with script-based fast-path and mixed-language segment detection.
42
43
  email:
43
44
  - johannes@example.com
44
45
  executables: []
@@ -56,16 +57,54 @@ files:
56
57
  - lib/lingua_ruby/ngram.rb
57
58
  - lib/lingua_ruby/profile.rb
58
59
  - lib/lingua_ruby/profile_loader.rb
60
+ - lib/lingua_ruby/profiles/am.json
59
61
  - lib/lingua_ruby/profiles/ar.json
62
+ - lib/lingua_ruby/profiles/bg.json
63
+ - lib/lingua_ruby/profiles/bn.json
64
+ - lib/lingua_ruby/profiles/cs.json
65
+ - lib/lingua_ruby/profiles/da.json
66
+ - lib/lingua_ruby/profiles/de.json
67
+ - lib/lingua_ruby/profiles/el.json
60
68
  - lib/lingua_ruby/profiles/en.json
69
+ - lib/lingua_ruby/profiles/es.json
70
+ - lib/lingua_ruby/profiles/et.json
71
+ - lib/lingua_ruby/profiles/fa.json
72
+ - lib/lingua_ruby/profiles/fi.json
73
+ - lib/lingua_ruby/profiles/fr.json
74
+ - lib/lingua_ruby/profiles/ha.json
75
+ - lib/lingua_ruby/profiles/hi.json
76
+ - lib/lingua_ruby/profiles/hr.json
77
+ - lib/lingua_ruby/profiles/hu.json
61
78
  - lib/lingua_ruby/profiles/id.json
79
+ - lib/lingua_ruby/profiles/it.json
62
80
  - lib/lingua_ruby/profiles/ja.json
63
81
  - lib/lingua_ruby/profiles/jv.json
64
82
  - lib/lingua_ruby/profiles/ko.json
83
+ - lib/lingua_ruby/profiles/lt.json
84
+ - lib/lingua_ruby/profiles/lv.json
65
85
  - lib/lingua_ruby/profiles/ms.json
86
+ - lib/lingua_ruby/profiles/my.json
66
87
  - lib/lingua_ruby/profiles/nl.json
88
+ - lib/lingua_ruby/profiles/no.json
89
+ - lib/lingua_ruby/profiles/pl.json
90
+ - lib/lingua_ruby/profiles/pt.json
91
+ - lib/lingua_ruby/profiles/ro.json
92
+ - lib/lingua_ruby/profiles/ru.json
93
+ - lib/lingua_ruby/profiles/sk.json
67
94
  - lib/lingua_ruby/profiles/su.json
95
+ - lib/lingua_ruby/profiles/sv.json
96
+ - lib/lingua_ruby/profiles/sw.json
97
+ - lib/lingua_ruby/profiles/ta.json
98
+ - lib/lingua_ruby/profiles/te.json
99
+ - lib/lingua_ruby/profiles/th.json
100
+ - lib/lingua_ruby/profiles/tl.json
101
+ - lib/lingua_ruby/profiles/tr.json
102
+ - lib/lingua_ruby/profiles/uk.json
103
+ - lib/lingua_ruby/profiles/ur.json
104
+ - lib/lingua_ruby/profiles/vi.json
105
+ - lib/lingua_ruby/profiles/yo.json
68
106
  - lib/lingua_ruby/profiles/zh.json
107
+ - lib/lingua_ruby/profiles/zu.json
69
108
  - lib/lingua_ruby/result.rb
70
109
  - lib/lingua_ruby/version.rb
71
110
  homepage: https://github.com/johannesdwicahyo/lingua-ruby