language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,301 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
|
6
|
+
#define ENCODINGS_PUBLIC_ENCODINGS_H_
|
7
|
+
|
8
|
+
// This interface defines the Encoding enum and various functions that
|
9
|
+
// depend only on Encoding values.
|
10
|
+
|
11
|
+
// A hash-function for Encoding, hash<Encoding>, is defined in
|
12
|
+
// i18n/encodings/public/encodings-hash.h
|
13
|
+
|
14
|
+
// On some Windows projects, UNICODE may be defined, which would prevent the
|
15
|
+
// Encoding enum below from compiling. Note that this is a quick fix that does
|
16
|
+
// not break any existing projects. The UNICODE enum may someday be changed
|
17
|
+
// to something more specific and non-colliding, but this involves careful
|
18
|
+
// testing of changes in many other projects.
|
19
|
+
#undef UNICODE
|
20
|
+
|
21
|
+
// NOTE: The Encoding enum must always start at 0. This assumption has
|
22
|
+
// been made and used.
|
23
|
+
|
24
|
+
#ifndef SWIG
|
25
|
+
|
26
|
+
#include "encodings/proto/encodings.pb.h"
|
27
|
+
|
28
|
+
// We must have this for compatibility.
|
29
|
+
// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
|
30
|
+
//using namespace i18n::encodings;
|
31
|
+
|
32
|
+
#else
|
33
|
+
|
34
|
+
// Special proto SWIG workaround header file.
|
35
|
+
#include "i18n/encodings/internal/encodings_proto_wrapper.h"
|
36
|
+
|
37
|
+
#endif
|
38
|
+
|
39
|
+
const int kNumEncodings = NUM_ENCODINGS;
|
40
|
+
|
41
|
+
// some of the popular encoding aliases
|
42
|
+
// TODO(jrm) Make these static const Encoding values instead of macros.
|
43
|
+
#define LATIN1 ISO_8859_1
|
44
|
+
#define LATIN2 ISO_8859_2
|
45
|
+
#define LATIN3 ISO_8859_3
|
46
|
+
#define LATIN4 ISO_8859_4
|
47
|
+
#define CYRILLIC ISO_8859_5
|
48
|
+
#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
|
49
|
+
#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
|
50
|
+
#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
|
51
|
+
#define LATIN5 ISO_8859_9
|
52
|
+
#define LATIN6 ISO_8859_10
|
53
|
+
#define KOREAN_HANGUL KOREAN_EUC_KR
|
54
|
+
|
55
|
+
// The default Encoding (LATIN1).
|
56
|
+
Encoding default_encoding();
|
57
|
+
|
58
|
+
|
59
|
+
|
60
|
+
// *************************************************************
|
61
|
+
// Encoding predicates
|
62
|
+
// IsValidEncoding()
|
63
|
+
// IsEncEncCompatible
|
64
|
+
// IsSupersetOfAscii7Bit
|
65
|
+
// Is8BitEncoding
|
66
|
+
// IsCJKEncoding
|
67
|
+
// IsHebrewEncoding
|
68
|
+
// IsRightToLeftEncoding
|
69
|
+
// IsLogicalRightToLeftEncoding
|
70
|
+
// IsVisualRightToLeftEncoding
|
71
|
+
// IsIso2022Encoding
|
72
|
+
// IsIso2022JpOrVariant
|
73
|
+
// IsShiftJisOrVariant
|
74
|
+
// IsJapaneseCellPhoneCarrierSpecificEncoding
|
75
|
+
// *************************************************************
|
76
|
+
|
77
|
+
// IsValidEncoding
|
78
|
+
// ===================================
|
79
|
+
//
|
80
|
+
// Function to check if the input encoding enum is within range.
|
81
|
+
//
|
82
|
+
|
83
|
+
bool IsValidEncoding(Encoding enc);
|
84
|
+
|
85
|
+
//
|
86
|
+
// IsEncEncCompatible
|
87
|
+
// ------------------
|
88
|
+
//
|
89
|
+
// This function is to determine whether or not converting from the
|
90
|
+
// first encoding to the second requires any changes to the underlying
|
91
|
+
// text (e.g. ASCII_7BIT is a subset of UTF8).
|
92
|
+
//
|
93
|
+
// TODO(someone more familiar with i18n): the current implementation
|
94
|
+
// is likely incomplete. It would be good to consider the full matrix
|
95
|
+
// of all pairs of encodings and to fish out all compatible pairs.
|
96
|
+
//
|
97
|
+
bool IsEncEncCompatible(const Encoding from, const Encoding to);
|
98
|
+
|
99
|
+
// To be a superset of 7-bit Ascii means that bytes 0...127 in the given
|
100
|
+
// encoding represent the same characters as they do in ISO_8859_1.
|
101
|
+
|
102
|
+
// WARNING: This function does not currently return true for all encodings that
|
103
|
+
// are supersets of Ascii 7-bit.
|
104
|
+
bool IsSupersetOfAscii7Bit(Encoding e);
|
105
|
+
|
106
|
+
// To be an 8-bit encoding means that there are fewer than 256 symbols.
|
107
|
+
// Each byte determines a new character; there are no multi-byte sequences.
|
108
|
+
|
109
|
+
// WARNING: This function does not currently return true for all encodings that
|
110
|
+
// are 8-bit encodings.
|
111
|
+
bool Is8BitEncoding(Encoding e);
|
112
|
+
|
113
|
+
// IsCJKEncoding
|
114
|
+
// -------------
|
115
|
+
//
|
116
|
+
// This function returns true if the encoding is either Chinese
|
117
|
+
// (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
|
118
|
+
// considered a CJK encoding.
|
119
|
+
bool IsCJKEncoding(Encoding e);
|
120
|
+
|
121
|
+
// IsHebrewEncoding
|
122
|
+
// -------------
|
123
|
+
//
|
124
|
+
// This function returns true if the encoding is a Hebrew specific
|
125
|
+
// encoding (not UTF8, etc).
|
126
|
+
bool IsHebrewEncoding(Encoding e);
|
127
|
+
|
128
|
+
// IsRightToLeftEncoding
|
129
|
+
// ---------------------
|
130
|
+
//
|
131
|
+
// Returns true if the encoding is a right-to-left encoding.
|
132
|
+
//
|
133
|
+
// Note that the name of this function is somewhat misleading. There is nothing
|
134
|
+
// "right to left" about these encodings. They merely contain code points for
|
135
|
+
// characters in RTL languages such as Hebrew and Arabic. But this is also
|
136
|
+
// true for UTF-8.
|
137
|
+
//
|
138
|
+
// TODO(benjy): Get rid of this function. The only special cases we
|
139
|
+
// should need to worry about are visual encodings. Anything we
|
140
|
+
// need to do for all 'RTL' encodings we need to do for UTF-8 as well.
|
141
|
+
bool IsRightToLeftEncoding(Encoding enc);
|
142
|
+
|
143
|
+
// IsLogicalRightToLeftEncoding
|
144
|
+
// ----------------------------
|
145
|
+
//
|
146
|
+
// Returns true if the encoding is a logical right-to-left encoding.
|
147
|
+
// Logical right-to-left encodings are those that the browser renders
|
148
|
+
// right-to-left and applies the BiDi algorithm to. Therefore the characters
|
149
|
+
// appear in reading order in the file, and indexing, snippet generation etc.
|
150
|
+
// should all just work with no special processing.
|
151
|
+
//
|
152
|
+
// TODO(benjy): Get rid of this function. The only special cases we
|
153
|
+
// should need to worry about are visual encodings.
|
154
|
+
bool IsLogicalRightToLeftEncoding(Encoding enc);
|
155
|
+
|
156
|
+
// IsVisualRightToLeftEncoding
|
157
|
+
// ---------------------------
|
158
|
+
//
|
159
|
+
// Returns true if the encoding is a visual right-to-left encoding.
|
160
|
+
// Visual right-to-left encodings are those that the browser renders
|
161
|
+
// left-to-right and does not apply the BiDi algorithm to. Therefore each
|
162
|
+
// line appears in reverse order in the file, lines are manually wrapped
|
163
|
+
// by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
|
164
|
+
// the prehistoric days when browsers couldn't render right-to-left, but
|
165
|
+
// unfortunately some visual pages persist to this day. These documents require
|
166
|
+
// special processing so that we don't index or snippet them with each line
|
167
|
+
// reversed.
|
168
|
+
bool IsVisualRightToLeftEncoding(Encoding enc);
|
169
|
+
|
170
|
+
// IsIso2022Encoding
|
171
|
+
// -----------------
|
172
|
+
//
|
173
|
+
// Returns true if the encoding is a kind of ISO 2022 such as
|
174
|
+
// ISO-2022-JP.
|
175
|
+
bool IsIso2022Encoding(Encoding enc);
|
176
|
+
|
177
|
+
// IsIso2022JpOrVariant
|
178
|
+
// --------------------
|
179
|
+
//
|
180
|
+
// Returns true if the encoding is ISO-2022-JP or a variant such as
|
181
|
+
// KDDI's ISO-2022-JP.
|
182
|
+
bool IsIso2022JpOrVariant(Encoding enc);
|
183
|
+
|
184
|
+
// IsShiftJisOrVariant
|
185
|
+
// --------------------
|
186
|
+
//
|
187
|
+
// Returns true if the encoding is Shift_JIS or a variant such as
|
188
|
+
// KDDI's Shift_JIS.
|
189
|
+
bool IsShiftJisOrVariant(Encoding enc);
|
190
|
+
|
191
|
+
// IsJapaneseCellPhoneCarrierSpecificEncoding
|
192
|
+
// -----------------------------------------
|
193
|
+
//
|
194
|
+
// Returns true if it's a Japanese cell-phone-carrier-specific encoding
|
195
|
+
// such as KDDI_SHIFT_JIS.
|
196
|
+
bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
|
197
|
+
|
198
|
+
|
199
|
+
|
200
|
+
// *************************************************************
|
201
|
+
// ENCODING NAMES
|
202
|
+
//
|
203
|
+
// This interface defines a standard name for each valid encoding, and
|
204
|
+
// a standard name for invalid encodings. (Some names use all upper
|
205
|
+
// case, but others use mixed case.)
|
206
|
+
//
|
207
|
+
// EncodingName() [Encoding to name]
|
208
|
+
// MimeEncodingName() [Encoding to name]
|
209
|
+
// EncodingFromName() [name to Encoding]
|
210
|
+
// EncodingNameAliasToEncoding() [name to Encoding]
|
211
|
+
// default_encoding_name()
|
212
|
+
// invalid_encoding_name()
|
213
|
+
// *************************************************************
|
214
|
+
|
215
|
+
// EncodingName
|
216
|
+
// ------------
|
217
|
+
//
|
218
|
+
// Given the encoding, returns its standard name.
|
219
|
+
// Return invalid_encoding_name() if the encoding is invalid.
|
220
|
+
//
|
221
|
+
const char* EncodingName(Encoding enc);
|
222
|
+
|
223
|
+
//
|
224
|
+
// MimeEncodingName
|
225
|
+
// ----------------
|
226
|
+
//
|
227
|
+
// Return the "preferred MIME name" of an encoding.
|
228
|
+
//
|
229
|
+
// This name is suitable for using in HTTP headers, HTML tags,
|
230
|
+
// and as the "charset" parameter of a MIME Content-Type.
|
231
|
+
const char* MimeEncodingName(Encoding enc);
|
232
|
+
|
233
|
+
|
234
|
+
// The maximum length of an encoding name
|
235
|
+
const int kMaxEncodingNameSize = 50;
|
236
|
+
|
237
|
+
// The standard name of the default encoding.
|
238
|
+
const char* default_encoding_name();
|
239
|
+
|
240
|
+
// The name used for an invalid encoding.
|
241
|
+
const char* invalid_encoding_name();
|
242
|
+
|
243
|
+
// EncodingFromName
|
244
|
+
// ----------------
|
245
|
+
//
|
246
|
+
// If enc_name matches the standard name of an Encoding, using a
|
247
|
+
// case-insensitive comparison, set *encoding to that Encoding and
|
248
|
+
// return true. Otherwise set *encoding to UNKNOWN_ENCODING and
|
249
|
+
// return false.
|
250
|
+
//
|
251
|
+
// REQUIRES: encoding must not be NULL.
|
252
|
+
//
|
253
|
+
bool EncodingFromName(const char* enc_name, Encoding *encoding);
|
254
|
+
|
255
|
+
//
|
256
|
+
// EncodingNameAliasToEncoding
|
257
|
+
// ---------------------------
|
258
|
+
//
|
259
|
+
// If enc_name matches the standard name or an alias of an Encoding,
|
260
|
+
// using a case-insensitive comparison, return that
|
261
|
+
// Encoding. Otherwise, return UNKNOWN_ENCODING.
|
262
|
+
//
|
263
|
+
// Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
|
264
|
+
// GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
|
265
|
+
// common variations with hyphens and underscores (e.g., "koi8-u" and
|
266
|
+
// "koi8u" for RUSSIAN_KOI8_R).
|
267
|
+
|
268
|
+
Encoding EncodingNameAliasToEncoding(const char *enc_name);
|
269
|
+
|
270
|
+
|
271
|
+
// *************************************************************
|
272
|
+
// Miscellany
|
273
|
+
// *************************************************************
|
274
|
+
|
275
|
+
// PreferredWebOutputEncoding
|
276
|
+
// --------------------------
|
277
|
+
//
|
278
|
+
// Some multi-byte encodings use byte values that coincide with the
|
279
|
+
// ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
|
280
|
+
// can misinterpret these, as indicated in an external XSS report from
|
281
|
+
// 2007-02-15. Here, we map these dangerous encodings to safer ones. We
|
282
|
+
// also use UTF8 instead of encodings that we don't support in our
|
283
|
+
// output, and we generally try to be conservative in what we send out.
|
284
|
+
// Where the client asks for single- or double-byte encodings that are
|
285
|
+
// not as common, we substitute a more common single- or double-byte
|
286
|
+
// encoding, if there is one, thereby preserving the client's intent
|
287
|
+
// to use less space than UTF-8. This also means that characters
|
288
|
+
// outside the destination set will be converted to HTML NCRs (&#NNN;)
|
289
|
+
// if requested.
|
290
|
+
Encoding PreferredWebOutputEncoding(Encoding enc);
|
291
|
+
|
292
|
+
|
293
|
+
// InitEncodings
|
294
|
+
// -------------
|
295
|
+
//
|
296
|
+
// Ensures the encodings module has been initialized. Normally this happens
|
297
|
+
// during InitGoogle, but this allows access for scripts that don't
|
298
|
+
// support InitGoogle.
|
299
|
+
void InitEncodings();
|
300
|
+
|
301
|
+
#endif // ENCODINGS_PUBLIC_ENCODINGS_H_
|
data/ext/cld/extconf.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# TODO: Generate Makefile
|
@@ -0,0 +1,88 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "encodings/compact_lang_det/compact_lang_det.h"
|
4
|
+
#include "encodings/compact_lang_det/ext_lang_enc.h"
|
5
|
+
#include "encodings/proto/encodings.pb.h"
|
6
|
+
|
7
|
+
// One entry of the per-language breakdown returned by language_detection().
// The field order is part of the C ABI seen by callers; do not reorder.
typedef struct LanguageDetail {
  const char *name;     // filled from LanguageName() for this language
  const char *code;     // filled from ExtLanguageCode() for this language
  int         percent;  // copied from the detector's percent3[] output
  double      score;    // copied from the detector's normalized_score3[] output
} LanguageDetail;
|
13
|
+
|
14
|
+
typedef struct {
|
15
|
+
const char *name;
|
16
|
+
const char *code;
|
17
|
+
bool reliable;
|
18
|
+
int text_bytes;
|
19
|
+
LanguageDetail *details;
|
20
|
+
} DetectedLanguage;
|
21
|
+
|
22
|
+
extern "C" {
|
23
|
+
DetectedLanguage language_detection(const char * src, bool is_plain_text) {
|
24
|
+
bool do_allow_extended_languages = true;
|
25
|
+
bool do_pick_summary_language = false;
|
26
|
+
bool do_remove_weak_matches = false;
|
27
|
+
|
28
|
+
bool is_reliable;
|
29
|
+
|
30
|
+
// "id" boosts Indonesian
|
31
|
+
//
|
32
|
+
const char* tld_hint = NULL;
|
33
|
+
|
34
|
+
// SJS boosts Japanese
|
35
|
+
//
|
36
|
+
int encoding_hint = UNKNOWN_ENCODING;
|
37
|
+
|
38
|
+
// ITALIAN boosts it
|
39
|
+
//
|
40
|
+
Language language_hint = UNKNOWN_LANGUAGE;
|
41
|
+
|
42
|
+
double normalized_score3[3];
|
43
|
+
Language language3[3];
|
44
|
+
int percent3[3];
|
45
|
+
int text_bytes;
|
46
|
+
|
47
|
+
Language lang;
|
48
|
+
lang = CompactLangDet::DetectLanguage(0,
|
49
|
+
src, strlen(src),
|
50
|
+
is_plain_text,
|
51
|
+
do_allow_extended_languages,
|
52
|
+
do_pick_summary_language,
|
53
|
+
do_remove_weak_matches,
|
54
|
+
tld_hint,
|
55
|
+
encoding_hint,
|
56
|
+
language_hint,
|
57
|
+
language3,
|
58
|
+
percent3,
|
59
|
+
normalized_score3,
|
60
|
+
&text_bytes,
|
61
|
+
&is_reliable);
|
62
|
+
|
63
|
+
|
64
|
+
DetectedLanguage detected_language;
|
65
|
+
LanguageDetail * details = new LanguageDetail [3];
|
66
|
+
|
67
|
+
detected_language.name = LanguageName(lang);
|
68
|
+
detected_language.code = ExtLanguageCode(lang);
|
69
|
+
detected_language.reliable = is_reliable;
|
70
|
+
detected_language.text_bytes = text_bytes;
|
71
|
+
|
72
|
+
for(int i = 0; i < 3; i++) {
|
73
|
+
Language lang = language3[i];
|
74
|
+
LanguageDetail detail;
|
75
|
+
|
76
|
+
detail.name = LanguageName(lang);
|
77
|
+
detail.code = ExtLanguageCode(lang);
|
78
|
+
detail.percent = percent3[i];
|
79
|
+
detail.score = normalized_score3[i];
|
80
|
+
|
81
|
+
details[i] = detail;
|
82
|
+
}
|
83
|
+
|
84
|
+
detected_language.details = details;
|
85
|
+
|
86
|
+
return detected_language;
|
87
|
+
}
|
88
|
+
}
|