langdetect-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +24 -13
- data/langdetect-ruby.gemspec +1 -1
- data/lib/lingua_ruby/configuration.rb +4 -1
- data/lib/lingua_ruby/detector.rb +59 -1
- data/lib/lingua_ruby/profile_loader.rb +26 -6
- data/lib/lingua_ruby/profiles/am.json +193 -0
- data/lib/lingua_ruby/profiles/bg.json +290 -0
- data/lib/lingua_ruby/profiles/bn.json +211 -0
- data/lib/lingua_ruby/profiles/cs.json +302 -0
- data/lib/lingua_ruby/profiles/da.json +302 -0
- data/lib/lingua_ruby/profiles/de.json +302 -0
- data/lib/lingua_ruby/profiles/el.json +302 -0
- data/lib/lingua_ruby/profiles/es.json +302 -0
- data/lib/lingua_ruby/profiles/et.json +289 -0
- data/lib/lingua_ruby/profiles/fa.json +234 -0
- data/lib/lingua_ruby/profiles/fi.json +284 -0
- data/lib/lingua_ruby/profiles/fr.json +302 -0
- data/lib/lingua_ruby/profiles/ha.json +302 -0
- data/lib/lingua_ruby/profiles/hi.json +255 -0
- data/lib/lingua_ruby/profiles/hr.json +302 -0
- data/lib/lingua_ruby/profiles/hu.json +302 -0
- data/lib/lingua_ruby/profiles/it.json +302 -0
- data/lib/lingua_ruby/profiles/lt.json +294 -0
- data/lib/lingua_ruby/profiles/lv.json +302 -0
- data/lib/lingua_ruby/profiles/my.json +200 -0
- data/lib/lingua_ruby/profiles/no.json +297 -0
- data/lib/lingua_ruby/profiles/pl.json +302 -0
- data/lib/lingua_ruby/profiles/pt.json +302 -0
- data/lib/lingua_ruby/profiles/ro.json +302 -0
- data/lib/lingua_ruby/profiles/ru.json +297 -0
- data/lib/lingua_ruby/profiles/sk.json +302 -0
- data/lib/lingua_ruby/profiles/sv.json +302 -0
- data/lib/lingua_ruby/profiles/sw.json +268 -0
- data/lib/lingua_ruby/profiles/ta.json +235 -0
- data/lib/lingua_ruby/profiles/te.json +254 -0
- data/lib/lingua_ruby/profiles/th.json +251 -0
- data/lib/lingua_ruby/profiles/tl.json +302 -0
- data/lib/lingua_ruby/profiles/tr.json +302 -0
- data/lib/lingua_ruby/profiles/uk.json +302 -0
- data/lib/lingua_ruby/profiles/ur.json +232 -0
- data/lib/lingua_ruby/profiles/vi.json +277 -0
- data/lib/lingua_ruby/profiles/yo.json +245 -0
- data/lib/lingua_ruby/profiles/zu.json +302 -0
- data/lib/lingua_ruby/result.rb +13 -26
- data/lib/lingua_ruby/version.rb +1 -1
- data/lib/lingua_ruby.rb +4 -0
- metadata +41 -2
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
{
|
|
2
|
+
"i": 0,
|
|
3
|
+
"a": 1,
|
|
4
|
+
"n": 2,
|
|
5
|
+
"u": 3,
|
|
6
|
+
"e": 4,
|
|
7
|
+
"l": 5,
|
|
8
|
+
"k": 6,
|
|
9
|
+
"m": 7,
|
|
10
|
+
"z": 8,
|
|
11
|
+
"in": 9,
|
|
12
|
+
"u ": 10,
|
|
13
|
+
"ni": 11,
|
|
14
|
+
"ng": 12,
|
|
15
|
+
"g": 13,
|
|
16
|
+
"a ": 14,
|
|
17
|
+
"zi": 15,
|
|
18
|
+
"iz": 16,
|
|
19
|
+
"s": 17,
|
|
20
|
+
"o": 18,
|
|
21
|
+
"im": 19,
|
|
22
|
+
"ing": 20,
|
|
23
|
+
" a": 21,
|
|
24
|
+
"gi": 22,
|
|
25
|
+
"en": 23,
|
|
26
|
+
"lu": 24,
|
|
27
|
+
"i ": 25,
|
|
28
|
+
"ngi": 26,
|
|
29
|
+
"nin": 27,
|
|
30
|
+
"b": 28,
|
|
31
|
+
"r": 29,
|
|
32
|
+
"eni": 30,
|
|
33
|
+
"ka": 31,
|
|
34
|
+
"ba": 32,
|
|
35
|
+
"ul": 33,
|
|
36
|
+
"u a": 34,
|
|
37
|
+
"ri": 35,
|
|
38
|
+
"izi": 36,
|
|
39
|
+
"mu": 37,
|
|
40
|
+
"h": 38,
|
|
41
|
+
"f": 39,
|
|
42
|
+
"giz": 40,
|
|
43
|
+
"ka ": 41,
|
|
44
|
+
"ika": 42,
|
|
45
|
+
"rik": 43,
|
|
46
|
+
"fri": 44,
|
|
47
|
+
"afr": 45,
|
|
48
|
+
" af": 46,
|
|
49
|
+
"mu ": 47,
|
|
50
|
+
"imu": 48,
|
|
51
|
+
"zim": 49,
|
|
52
|
+
"af": 50,
|
|
53
|
+
"ik": 51,
|
|
54
|
+
"fr": 52,
|
|
55
|
+
" e": 53,
|
|
56
|
+
"y": 54,
|
|
57
|
+
" i": 55,
|
|
58
|
+
"a i": 56,
|
|
59
|
+
"w": 57,
|
|
60
|
+
"ulu": 58,
|
|
61
|
+
"t": 59,
|
|
62
|
+
"an": 60,
|
|
63
|
+
"ab": 61,
|
|
64
|
+
"we": 62,
|
|
65
|
+
"se": 63,
|
|
66
|
+
"si": 64,
|
|
67
|
+
"e ": 65,
|
|
68
|
+
"ban": 66,
|
|
69
|
+
"aba": 67,
|
|
70
|
+
"li": 68,
|
|
71
|
+
"is": 69,
|
|
72
|
+
"il": 70,
|
|
73
|
+
"sen": 71,
|
|
74
|
+
"la": 72,
|
|
75
|
+
" k": 73,
|
|
76
|
+
"d": 74,
|
|
77
|
+
"em": 75,
|
|
78
|
+
" y": 76,
|
|
79
|
+
"i e": 77,
|
|
80
|
+
"as": 78,
|
|
81
|
+
"ase": 79,
|
|
82
|
+
"ntu": 80,
|
|
83
|
+
"ol": 81,
|
|
84
|
+
"kh": 82,
|
|
85
|
+
"hu": 83,
|
|
86
|
+
"hul": 84,
|
|
87
|
+
"khu": 85,
|
|
88
|
+
"mi": 86,
|
|
89
|
+
"tu ": 87,
|
|
90
|
+
"nt": 88,
|
|
91
|
+
"tu": 89,
|
|
92
|
+
"mi ": 90,
|
|
93
|
+
"imi": 91,
|
|
94
|
+
"lim": 92,
|
|
95
|
+
"lu ": 93,
|
|
96
|
+
"ni ": 94,
|
|
97
|
+
" ba": 95,
|
|
98
|
+
"akh": 96,
|
|
99
|
+
"ku": 97,
|
|
100
|
+
"wen": 98,
|
|
101
|
+
"ezi": 99,
|
|
102
|
+
" ya": 100,
|
|
103
|
+
"gi ": 101,
|
|
104
|
+
" is": 102,
|
|
105
|
+
"isi": 103,
|
|
106
|
+
"siz": 104,
|
|
107
|
+
"izu": 105,
|
|
108
|
+
"zul": 106,
|
|
109
|
+
" ab": 107,
|
|
110
|
+
"ant": 108,
|
|
111
|
+
"lo": 109,
|
|
112
|
+
"zu": 110,
|
|
113
|
+
"p": 111,
|
|
114
|
+
" b": 112,
|
|
115
|
+
"o ": 113,
|
|
116
|
+
"ez": 114,
|
|
117
|
+
"lw": 115,
|
|
118
|
+
" l": 116,
|
|
119
|
+
"ak": 117,
|
|
120
|
+
"ya": 118,
|
|
121
|
+
"go": 119,
|
|
122
|
+
" iy": 120,
|
|
123
|
+
"iyi": 121,
|
|
124
|
+
" em": 122,
|
|
125
|
+
"kak": 123,
|
|
126
|
+
" ka": 124,
|
|
127
|
+
"e k": 125,
|
|
128
|
+
"ise": 126,
|
|
129
|
+
"ok": 127,
|
|
130
|
+
"i y": 128,
|
|
131
|
+
"ye ": 129,
|
|
132
|
+
" en": 130,
|
|
133
|
+
"lum": 131,
|
|
134
|
+
" n": 132,
|
|
135
|
+
"uma": 133,
|
|
136
|
+
"ma ": 134,
|
|
137
|
+
"olu": 135,
|
|
138
|
+
"i o": 136,
|
|
139
|
+
"la ": 137,
|
|
140
|
+
"ma": 138,
|
|
141
|
+
"um": 139,
|
|
142
|
+
"lul": 140,
|
|
143
|
+
"uk": 141,
|
|
144
|
+
" o": 142,
|
|
145
|
+
" si": 143,
|
|
146
|
+
"u s": 144,
|
|
147
|
+
"et": 145,
|
|
148
|
+
" s": 146,
|
|
149
|
+
"ani": 147,
|
|
150
|
+
"ye": 148,
|
|
151
|
+
"el": 149,
|
|
152
|
+
"yi": 150,
|
|
153
|
+
" z": 151,
|
|
154
|
+
"sin": 152,
|
|
155
|
+
"ngo": 153,
|
|
156
|
+
"gol": 154,
|
|
157
|
+
"le": 155,
|
|
158
|
+
" lw": 156,
|
|
159
|
+
"bas": 157,
|
|
160
|
+
"lwe": 158,
|
|
161
|
+
"ud": 159,
|
|
162
|
+
"al": 160,
|
|
163
|
+
"lok": 161,
|
|
164
|
+
"th": 162,
|
|
165
|
+
"zin": 163,
|
|
166
|
+
"ini": 164,
|
|
167
|
+
" ez": 165,
|
|
168
|
+
"ili": 166,
|
|
169
|
+
"zil": 167,
|
|
170
|
+
"iy": 168,
|
|
171
|
+
" uk": 169,
|
|
172
|
+
"het": 170,
|
|
173
|
+
"eth": 171,
|
|
174
|
+
"thw": 172,
|
|
175
|
+
"the": 173,
|
|
176
|
+
"hwe": 174,
|
|
177
|
+
"ala": 175,
|
|
178
|
+
"mth": 176,
|
|
179
|
+
"emt": 177,
|
|
180
|
+
"a u": 178,
|
|
181
|
+
"eng": 179,
|
|
182
|
+
"oli": 180,
|
|
183
|
+
"jen": 181,
|
|
184
|
+
"i l": 182,
|
|
185
|
+
"lwa": 183,
|
|
186
|
+
"wab": 184,
|
|
187
|
+
"abo": 185,
|
|
188
|
+
"nje": 186,
|
|
189
|
+
" nj": 187,
|
|
190
|
+
"u n": 188,
|
|
191
|
+
"bak": 189,
|
|
192
|
+
"bo ": 190,
|
|
193
|
+
"a b": 191,
|
|
194
|
+
"j": 192,
|
|
195
|
+
"o l": 193,
|
|
196
|
+
" lo": 194,
|
|
197
|
+
"oku": 195,
|
|
198
|
+
"i b": 196,
|
|
199
|
+
"kuq": 197,
|
|
200
|
+
"a a": 198,
|
|
201
|
+
"uqa": 199,
|
|
202
|
+
"qal": 200,
|
|
203
|
+
"q": 201,
|
|
204
|
+
"v": 202,
|
|
205
|
+
"i z": 203,
|
|
206
|
+
" za": 204,
|
|
207
|
+
"zas": 205,
|
|
208
|
+
"a z": 206,
|
|
209
|
+
" zi": 207,
|
|
210
|
+
"ine": 208,
|
|
211
|
+
"nem": 209,
|
|
212
|
+
"emv": 210,
|
|
213
|
+
"mve": 211,
|
|
214
|
+
"vel": 212,
|
|
215
|
+
"ela": 213,
|
|
216
|
+
"lap": 214,
|
|
217
|
+
"aph": 215,
|
|
218
|
+
"phi": 216,
|
|
219
|
+
"hi ": 217,
|
|
220
|
+
"yaz": 218,
|
|
221
|
+
"azo": 219,
|
|
222
|
+
"zo ": 220,
|
|
223
|
+
"o e": 221,
|
|
224
|
+
"eml": 222,
|
|
225
|
+
"mla": 223,
|
|
226
|
+
"lan": 224,
|
|
227
|
+
"and": 225,
|
|
228
|
+
"ndw": 226,
|
|
229
|
+
"dwe": 227,
|
|
230
|
+
" om": 228,
|
|
231
|
+
"omu": 229,
|
|
232
|
+
"mud": 230,
|
|
233
|
+
"ude": 231,
|
|
234
|
+
"de ": 232,
|
|
235
|
+
"uku": 233,
|
|
236
|
+
"kud": 234,
|
|
237
|
+
"udl": 235,
|
|
238
|
+
"dla": 236,
|
|
239
|
+
"a k": 237,
|
|
240
|
+
" kw": 238,
|
|
241
|
+
"kwe": 239,
|
|
242
|
+
"wes": 240,
|
|
243
|
+
"esi": 241,
|
|
244
|
+
"int": 242,
|
|
245
|
+
"u k": 243,
|
|
246
|
+
" ku": 244,
|
|
247
|
+
"kub": 245,
|
|
248
|
+
"uba": 246,
|
|
249
|
+
"bal": 247,
|
|
250
|
+
"alu": 248,
|
|
251
|
+
"ule": 249,
|
|
252
|
+
"lek": 250,
|
|
253
|
+
"eki": 251,
|
|
254
|
+
"kil": 252,
|
|
255
|
+
"ile": 253,
|
|
256
|
+
"le ": 254,
|
|
257
|
+
"u e": 255,
|
|
258
|
+
"emp": 256,
|
|
259
|
+
"mpi": 257,
|
|
260
|
+
"pil": 258,
|
|
261
|
+
"ilw": 259,
|
|
262
|
+
"yab": 260,
|
|
263
|
+
"u b": 261,
|
|
264
|
+
" iz": 262,
|
|
265
|
+
"nd": 263,
|
|
266
|
+
"dw": 264,
|
|
267
|
+
"om": 265,
|
|
268
|
+
"de": 266,
|
|
269
|
+
"he": 267,
|
|
270
|
+
"mt": 268,
|
|
271
|
+
"ny": 269,
|
|
272
|
+
"un": 270,
|
|
273
|
+
"ha": 271,
|
|
274
|
+
"bh": 272,
|
|
275
|
+
"ob": 273,
|
|
276
|
+
"do": 274,
|
|
277
|
+
" d": 275,
|
|
278
|
+
"yiz": 276,
|
|
279
|
+
"izw": 277,
|
|
280
|
+
"zwe": 278,
|
|
281
|
+
"we ": 279,
|
|
282
|
+
"e e": 280,
|
|
283
|
+
" el": 281,
|
|
284
|
+
"eli": 282,
|
|
285
|
+
"lis": 283,
|
|
286
|
+
"ko": 284,
|
|
287
|
+
"u y": 285,
|
|
288
|
+
" ye": 286,
|
|
289
|
+
"e a": 287,
|
|
290
|
+
"ria": 288,
|
|
291
|
+
"nj": 289,
|
|
292
|
+
"je": 290,
|
|
293
|
+
"wa": 291,
|
|
294
|
+
"bo": 292,
|
|
295
|
+
"uq": 293,
|
|
296
|
+
"qa": 294,
|
|
297
|
+
" u": 295,
|
|
298
|
+
"dl": 296,
|
|
299
|
+
"hw": 297,
|
|
300
|
+
"kw": 298,
|
|
301
|
+
"es": 299
|
|
302
|
+
}
|
data/lib/lingua_ruby/result.rb
CHANGED
|
@@ -5,32 +5,19 @@ module LinguaRuby
|
|
|
5
5
|
attr_reader :language, :confidence, :name
|
|
6
6
|
|
|
7
7
|
LANGUAGE_NAMES = {
|
|
8
|
-
id: "Indonesian",
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
pt: "Portuguese",
|
|
22
|
-
it: "Italian",
|
|
23
|
-
ru: "Russian",
|
|
24
|
-
tr: "Turkish",
|
|
25
|
-
pl: "Polish",
|
|
26
|
-
sv: "Swedish",
|
|
27
|
-
da: "Danish",
|
|
28
|
-
no: "Norwegian",
|
|
29
|
-
fi: "Finnish",
|
|
30
|
-
th: "Thai",
|
|
31
|
-
vi: "Vietnamese",
|
|
32
|
-
tl: "Tagalog",
|
|
33
|
-
hi: "Hindi"
|
|
8
|
+
id: "Indonesian", en: "English", ms: "Malay", jv: "Javanese",
|
|
9
|
+
su: "Sundanese", nl: "Dutch", ar: "Arabic", zh: "Chinese",
|
|
10
|
+
ja: "Japanese", ko: "Korean", fr: "French", de: "German",
|
|
11
|
+
es: "Spanish", pt: "Portuguese", it: "Italian", ru: "Russian",
|
|
12
|
+
tr: "Turkish", pl: "Polish", sv: "Swedish", da: "Danish",
|
|
13
|
+
no: "Norwegian", fi: "Finnish", th: "Thai", vi: "Vietnamese",
|
|
14
|
+
tl: "Tagalog", hi: "Hindi", bn: "Bengali", ta: "Tamil",
|
|
15
|
+
te: "Telugu", ur: "Urdu", fa: "Persian", my: "Burmese",
|
|
16
|
+
cs: "Czech", ro: "Romanian", hu: "Hungarian", el: "Greek",
|
|
17
|
+
bg: "Bulgarian", hr: "Croatian", sk: "Slovak", uk: "Ukrainian",
|
|
18
|
+
lt: "Lithuanian", lv: "Latvian", et: "Estonian",
|
|
19
|
+
sw: "Swahili", ha: "Hausa", yo: "Yoruba", am: "Amharic",
|
|
20
|
+
zu: "Zulu"
|
|
34
21
|
}.freeze
|
|
35
22
|
|
|
36
23
|
def initialize(language:, confidence:)
|
data/lib/lingua_ruby/version.rb
CHANGED
data/lib/lingua_ruby.rb
CHANGED
|
@@ -40,6 +40,10 @@ module LinguaRuby
|
|
|
40
40
|
texts.map { |text| detector.detect(text) }
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
+
def detect_segments(text, min_segment_length: 20)
|
|
44
|
+
default_detector.detect_segments(text, min_segment_length: min_segment_length)
|
|
45
|
+
end
|
|
46
|
+
|
|
43
47
|
private
|
|
44
48
|
|
|
45
49
|
def default_detector
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: langdetect-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Johannes Dwi Cahyo
|
|
@@ -38,7 +38,8 @@ dependencies:
|
|
|
38
38
|
- !ruby/object:Gem::Version
|
|
39
39
|
version: '13.0'
|
|
40
40
|
description: Pure Ruby language detection library using character n-gram frequency
|
|
41
|
-
profiles. Detects
|
|
41
|
+
profiles. Detects 48 languages including European, Asian, and African languages
|
|
42
|
+
with script-based fast-path and mixed-language segment detection.
|
|
42
43
|
email:
|
|
43
44
|
- johannes@example.com
|
|
44
45
|
executables: []
|
|
@@ -56,16 +57,54 @@ files:
|
|
|
56
57
|
- lib/lingua_ruby/ngram.rb
|
|
57
58
|
- lib/lingua_ruby/profile.rb
|
|
58
59
|
- lib/lingua_ruby/profile_loader.rb
|
|
60
|
+
- lib/lingua_ruby/profiles/am.json
|
|
59
61
|
- lib/lingua_ruby/profiles/ar.json
|
|
62
|
+
- lib/lingua_ruby/profiles/bg.json
|
|
63
|
+
- lib/lingua_ruby/profiles/bn.json
|
|
64
|
+
- lib/lingua_ruby/profiles/cs.json
|
|
65
|
+
- lib/lingua_ruby/profiles/da.json
|
|
66
|
+
- lib/lingua_ruby/profiles/de.json
|
|
67
|
+
- lib/lingua_ruby/profiles/el.json
|
|
60
68
|
- lib/lingua_ruby/profiles/en.json
|
|
69
|
+
- lib/lingua_ruby/profiles/es.json
|
|
70
|
+
- lib/lingua_ruby/profiles/et.json
|
|
71
|
+
- lib/lingua_ruby/profiles/fa.json
|
|
72
|
+
- lib/lingua_ruby/profiles/fi.json
|
|
73
|
+
- lib/lingua_ruby/profiles/fr.json
|
|
74
|
+
- lib/lingua_ruby/profiles/ha.json
|
|
75
|
+
- lib/lingua_ruby/profiles/hi.json
|
|
76
|
+
- lib/lingua_ruby/profiles/hr.json
|
|
77
|
+
- lib/lingua_ruby/profiles/hu.json
|
|
61
78
|
- lib/lingua_ruby/profiles/id.json
|
|
79
|
+
- lib/lingua_ruby/profiles/it.json
|
|
62
80
|
- lib/lingua_ruby/profiles/ja.json
|
|
63
81
|
- lib/lingua_ruby/profiles/jv.json
|
|
64
82
|
- lib/lingua_ruby/profiles/ko.json
|
|
83
|
+
- lib/lingua_ruby/profiles/lt.json
|
|
84
|
+
- lib/lingua_ruby/profiles/lv.json
|
|
65
85
|
- lib/lingua_ruby/profiles/ms.json
|
|
86
|
+
- lib/lingua_ruby/profiles/my.json
|
|
66
87
|
- lib/lingua_ruby/profiles/nl.json
|
|
88
|
+
- lib/lingua_ruby/profiles/no.json
|
|
89
|
+
- lib/lingua_ruby/profiles/pl.json
|
|
90
|
+
- lib/lingua_ruby/profiles/pt.json
|
|
91
|
+
- lib/lingua_ruby/profiles/ro.json
|
|
92
|
+
- lib/lingua_ruby/profiles/ru.json
|
|
93
|
+
- lib/lingua_ruby/profiles/sk.json
|
|
67
94
|
- lib/lingua_ruby/profiles/su.json
|
|
95
|
+
- lib/lingua_ruby/profiles/sv.json
|
|
96
|
+
- lib/lingua_ruby/profiles/sw.json
|
|
97
|
+
- lib/lingua_ruby/profiles/ta.json
|
|
98
|
+
- lib/lingua_ruby/profiles/te.json
|
|
99
|
+
- lib/lingua_ruby/profiles/th.json
|
|
100
|
+
- lib/lingua_ruby/profiles/tl.json
|
|
101
|
+
- lib/lingua_ruby/profiles/tr.json
|
|
102
|
+
- lib/lingua_ruby/profiles/uk.json
|
|
103
|
+
- lib/lingua_ruby/profiles/ur.json
|
|
104
|
+
- lib/lingua_ruby/profiles/vi.json
|
|
105
|
+
- lib/lingua_ruby/profiles/yo.json
|
|
68
106
|
- lib/lingua_ruby/profiles/zh.json
|
|
107
|
+
- lib/lingua_ruby/profiles/zu.json
|
|
69
108
|
- lib/lingua_ruby/result.rb
|
|
70
109
|
- lib/lingua_ruby/version.rb
|
|
71
110
|
homepage: https://github.com/johannesdwicahyo/lingua-ruby
|