language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
|
|
7
|
+
|
|
8
|
+
#include "base/basictypes.h"
|
|
9
|
+
|
|
10
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
|
|
7
|
+
|
|
8
|
+
#if !defined(CLD_WINDOWS)
|
|
9
|
+
|
|
10
|
+
#include "base/commandlineflags.h"
|
|
11
|
+
|
|
12
|
+
#else
|
|
13
|
+
|
|
14
|
+
#undef DEFINE_bool
|
|
15
|
+
#define DEFINE_bool(name, default_value, comment) \
|
|
16
|
+
const bool FLAGS_##name = default_value;
|
|
17
|
+
#undef DEFINE_int32
|
|
18
|
+
#define DEFINE_int32(name, default_value, comment) \
|
|
19
|
+
const int32 FLAGS_##name = default_value;
|
|
20
|
+
|
|
21
|
+
#undef DECLARE_bool
|
|
22
|
+
#define DECLARE_bool(name) extern const bool FLAGS_##name;
|
|
23
|
+
#undef DECLARE_int32
|
|
24
|
+
// Must carry the same const qualifier as the object DEFINE_int32 creates.
#define DECLARE_int32(name) extern const int32 FLAGS_##name;
|
|
25
|
+
|
|
26
|
+
#endif
|
|
27
|
+
|
|
28
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
|
|
7
|
+
|
|
8
|
+
#if !defined(CLD_WINDOWS)
|
|
9
|
+
|
|
10
|
+
#include "base/google.h"
|
|
11
|
+
|
|
12
|
+
#else
|
|
13
|
+
|
|
14
|
+
// Include nothing
|
|
15
|
+
|
|
16
|
+
#endif
|
|
17
|
+
|
|
18
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
|
|
7
|
+
|
|
8
|
+
// Src points to '&'
|
|
9
|
+
// Writes entity value to dst. Returns take(src), put(dst) byte counts
|
|
10
|
+
void EntityToBuffer(const char* src, int len, char* dst,
|
|
11
|
+
int* tlen, int* plen);
|
|
12
|
+
|
|
13
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Author: alekseys@google.com (Aleksey Shlyapnikov)
|
|
6
|
+
|
|
7
|
+
// This code is not actually used, it was copied here for the reference only.
|
|
8
|
+
// See cld_htmlutils_windows.cc for Windows version of this code.
|
|
9
|
+
|
|
10
|
+
#include "cld/encodings/compact_lang_det/win/cld_htmlutils.h"
|
|
11
|
+
|
|
12
|
+
#include "cld/third_party/utf/utf.h" // for runetochar
|
|
13
|
+
#include "cld/webutil/html/htmlutils.h" // for ReadEntity
|
|
14
|
+
|
|
15
|
+
// Copied from getonescriptspan.cc
|
|
16
|
+
|
|
17
|
+
// Src points to '&'
|
|
18
|
+
// Writes entity value to dst. Returns take(src), put(dst) byte counts
|
|
19
|
+
void EntityToBuffer(const char* src, int len, char* dst,
|
|
20
|
+
int* tlen, int* plen) {
|
|
21
|
+
char32 entval = HtmlUtils::ReadEntity(src, len, tlen);
|
|
22
|
+
// ReadEntity does this already: entval = FixUnicodeValue(entval);
|
|
23
|
+
|
|
24
|
+
if (entval > 0) {
|
|
25
|
+
*plen = runetochar(dst, &entval);
|
|
26
|
+
} else {
|
|
27
|
+
// Illegal entity; ignore the '&'
|
|
28
|
+
*tlen = 1;
|
|
29
|
+
*plen = 0;
|
|
30
|
+
}
|
|
31
|
+
// fprintf(stderr,"t%d p%d]\n", *tlen, *plen);
|
|
32
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/cld_htmlutils.h"
|
|
6
|
+
|
|
7
|
+
// Src points to '&'; len is the number of bytes available at src.
// Skips one entity-like run (the '&' up to and including the next ';') and
// emits a single placeholder space for it.
//   tlen [out]: bytes taken from src. At least 1 (the '&'); otherwise never
//               more than len, and never counts a terminating NUL.
//   plen [out]: bytes put to dst; always 1.
void EntityToBuffer(const char* src, int len, char* dst,
                    int* tlen, int* plen) {
  // On Windows we do not have to do anything, browser expands HTML entities
  // for us, so text we're retrieving from it is ready for translation as it is.
  // But:

  // This is a temporary solution to let us continue the development without
  // having a real DOM text scraping in place. For now the full HTML is fed
  // to CLD for language detection and just ignoring entities is good enough
  // for testing. Later entities will be expanded by browser itself.

  // Skip entity in the source. The scan is bounded by len so that a missing
  // ';' can neither run past the caller's buffer nor report more bytes taken
  // than were actually available.
  int taken = 1;  // the leading '&'
  while (taken < len && src[taken] != '\0') {
    const char c = src[taken];
    ++taken;
    if (c == ';') {
      break;
    }
  }
  *tlen = taken;

  // Report a bogus entity (space).
  *dst = ' ';
  *plen = 1;
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
|
|
7
|
+
|
|
8
|
+
#if !defined(CLD_WINDOWS)
|
|
9
|
+
|
|
10
|
+
#include "base/logging.h"
|
|
11
|
+
|
|
12
|
+
#else
|
|
13
|
+
|
|
14
|
+
#undef CHECK
|
|
15
|
+
#define CHECK(expr)
|
|
16
|
+
#undef DCHECK
|
|
17
|
+
#define DCHECK(expr)
|
|
18
|
+
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
|
|
7
|
+
|
|
8
|
+
#include "base/macros.h"
|
|
9
|
+
|
|
10
|
+
// Checks for Win32 result and if it indicates failure, returns it.
// Expands to one statement WITHOUT a trailing semicolon; the caller writes
// `RETURN_IF_ERROR(cmd);`, which keeps the macro usable as the body of an
// if/else without braces.
#define RETURN_IF_ERROR(cmd) \
  do { \
    DWORD result_ = (cmd); \
    if (0 != result_) \
      return result_; \
  } \
  while (0)
|
|
18
|
+
|
|
19
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
|
|
7
|
+
|
|
8
|
+
#if !defined(CLD_WINDOWS)
|
|
9
|
+
|
|
10
|
+
//#include "cld/base/strtoint.h"
|
|
11
|
+
|
|
12
|
+
#else
|
|
13
|
+
|
|
14
|
+
#include <stdlib.h>
|
|
15
|
+
|
|
16
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
17
|
+
|
|
18
|
+
// This implementation is not as good as the one in base/strtoint.h,
|
|
19
|
+
// but it's sufficient for our purposes.
|
|
20
|
+
inline int32 strto32(const char *nptr, char **endptr, int base) {
|
|
21
|
+
return static_cast<int32>(strtol(nptr, endptr, base));
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
#endif
|
|
25
|
+
|
|
26
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/cld_unicodetext.h"
|
|
6
|
+
|
|
7
|
+
#include <string>
|
|
8
|
+
#include <vector> // to compile bar/common/component.h
|
|
9
|
+
|
|
10
|
+
#include "encodings/compact_lang_det/compact_lang_det.h"
|
|
11
|
+
#include "base/string_util.h"
|
|
12
|
+
#include "unicode/normlzr.h"
|
|
13
|
+
#include "unicode/unistr.h"
|
|
14
|
+
#include "unicode/ustring.h"
|
|
15
|
+
|
|
16
|
+
/*
|
|
17
|
+
std::string NormalizeText(const UChar* text) {
|
|
18
|
+
// To avoid a copy, use the read-only aliasing ctor.
|
|
19
|
+
icu::UnicodeString source(1, text, -1);
|
|
20
|
+
icu::UnicodeString normalized;
|
|
21
|
+
UErrorCode status = U_ZERO_ERROR;
|
|
22
|
+
icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
|
|
23
|
+
if (U_FAILURE(status))
|
|
24
|
+
return std::string();
|
|
25
|
+
normalized.toLower();
|
|
26
|
+
std::string utf8;
|
|
27
|
+
// Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
|
|
28
|
+
// for most web pages) and does pre-flighting followed by malloc for larger
|
|
29
|
+
// strings. We have to switch to obtaining the buffer with the maximum size
|
|
30
|
+
// (UTF-16 length * 3) without pre-flighting if necessary.
|
|
31
|
+
return normalized.toUTF8String(utf8);
|
|
32
|
+
}
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
// Detects a language of the UTF-16 encoded zero-terminated text.
|
|
37
|
+
// Returns: Language enum.
|
|
38
|
+
Language DetectLanguageOfUnicodeText(
|
|
39
|
+
const CompactLangDet::DetectionTables* detection_tables,
|
|
40
|
+
const UChar* text, bool is_plain_text,
|
|
41
|
+
bool* is_reliable, int* num_languages,
|
|
42
|
+
int* error_code, int* text_bytes) {
|
|
43
|
+
if (!text || !num_languages)
|
|
44
|
+
return NUM_LANGUAGES;
|
|
45
|
+
// Normalize text to NFC, lowercase and convert to UTF-8.
|
|
46
|
+
std::string utf8_encoded = NormalizeText(text);
|
|
47
|
+
if (utf8_encoded.empty())
|
|
48
|
+
return NUM_LANGUAGES;
|
|
49
|
+
|
|
50
|
+
// Engage core CLD library language detection.
|
|
51
|
+
Language language3[3] = {
|
|
52
|
+
UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
|
|
53
|
+
};
|
|
54
|
+
int percent3[3] = { 0, 0, 0 };
|
|
55
|
+
int text_bytes_tmp = 0;
|
|
56
|
+
// We ignore return value here due to the problem described in bug 1800161.
|
|
57
|
+
// For example, translate.google.com was detected as Indonesian. It happened
|
|
58
|
+
// due to the heuristic in CLD, which ignores English as a top language
|
|
59
|
+
// in the presence of another reliably detected language.
|
|
60
|
+
// See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
|
|
61
|
+
// language3 array is always set according to the detection results and
|
|
62
|
+
// is not affected by this heuristic.
|
|
63
|
+
CompactLangDet::DetectLanguageSummary(detection_tables,
|
|
64
|
+
utf8_encoded.c_str(),
|
|
65
|
+
utf8_encoded.length(),
|
|
66
|
+
is_plain_text, language3, percent3,
|
|
67
|
+
&text_bytes_tmp, is_reliable);
|
|
68
|
+
|
|
69
|
+
// Calcualte a number of languages detected in more than 20% of the text.
|
|
70
|
+
const int kMinTextPercentToCountLanguage = 20;
|
|
71
|
+
*num_languages = 0;
|
|
72
|
+
if (text_bytes)
|
|
73
|
+
*text_bytes = text_bytes_tmp;
|
|
74
|
+
COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
|
|
75
|
+
language3_and_percent3_should_be_of_the_same_size);
|
|
76
|
+
for (int i = 0; i < arraysize(language3); ++i) {
|
|
77
|
+
if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
|
|
78
|
+
percent3[i] >= kMinTextPercentToCountLanguage) {
|
|
79
|
+
++*num_languages;
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
return language3[0];
|
|
84
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
|
|
7
|
+
|
|
8
|
+
#include "languages/public/languages.h"
|
|
9
|
+
#include "unicode/utypes.h"
|
|
10
|
+
|
|
11
|
+
namespace CompactLangDet {
|
|
12
|
+
struct DetectionTables;
|
|
13
|
+
} // namespace CompactLangDet
|
|
14
|
+
|
|
15
|
+
// Detects a language of the UTF-16 encoded zero-terminated text.
|
|
16
|
+
// [in] detection_tables - internal CLD data tables (see compact_lang_det.h).
|
|
17
|
+
// Can be NULL, in this case CLD will fall back to builtin static tables.
|
|
18
|
+
// [in] text - UTF-16 encoded text to detect a language of.
|
|
19
|
+
// [in] is_plain_text - true if plain text, false otherwise (e.g. HTML).
|
|
20
|
+
// [out] is_reliable - true, if returned language was detected reliably.
|
|
21
|
+
// See compact_lang_det.h for details.
|
|
22
|
+
// [out] num_languages - set to the number of languages detected on the page.
|
|
23
|
+
// Language counts only if it's detected in more than 20% of the text.
|
|
24
|
+
// [out, optional] error_code - set to 0 in case of success, Windows
|
|
25
|
+
// GetLastError() code otherwise. Pass NULL, if not interested in errors.
|
|
26
|
+
// See encodings/compact_lang_det/compact_lang_det.h,
|
|
27
|
+
// CompactLangDet::DetectLanguage() description for other input parameters
|
|
28
|
+
// description.
|
|
29
|
+
// Returns: Language enum.
|
|
30
|
+
// Returns NUM_LANGUAGES in case of any error.
|
|
31
|
+
// See googleclient/languages/internal/languages.cc
|
|
32
|
+
// for details.
|
|
33
|
+
Language DetectLanguageOfUnicodeText(
|
|
34
|
+
const CompactLangDet::DetectionTables* detection_tables,
|
|
35
|
+
const UChar* text, bool is_plain_text,
|
|
36
|
+
bool* is_reliable, int* num_languages,
|
|
37
|
+
int* error_code, int* text_bytes);
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
|
|
7
|
+
|
|
8
|
+
namespace cld_UniLib {
|
|
9
|
+
|
|
10
|
+
// Return length of a single UTF-8 source character
|
|
11
|
+
int OneCharLen(const char* src);
|
|
12
|
+
|
|
13
|
+
} // namespace cld_UniLib
|
|
14
|
+
|
|
15
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// This code is not actually used, it was copied here for the reference only.
|
|
6
|
+
// See cld_unilib_windows.cc for Windows version of this code.
|
|
7
|
+
|
|
8
|
+
#include "i18n/encodings/compact_lang_det/cld_unilib.h"
|
|
9
|
+
|
|
10
|
+
#include "util/utf8/unilib.h"
|
|
11
|
+
|
|
12
|
+
namespace cld_UniLib {
|
|
13
|
+
|
|
14
|
+
int OneCharLen(const char* src) {
|
|
15
|
+
return UniLib::OneCharLen(src);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
} // namespace cld_UniLib
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
|
6
|
+
|
|
7
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
8
|
+
|
|
9
|
+
namespace cld_UniLib {
|
|
10
|
+
|
|
11
|
+
// Table of UTF-8 character lengths, based on first byte
|
|
12
|
+
static const unsigned char kUTF8LenTbl[256] = {
|
|
13
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
14
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
15
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
16
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
17
|
+
|
|
18
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
19
|
+
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
|
|
20
|
+
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
|
|
21
|
+
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
// Return length of a single UTF-8 source character
|
|
25
|
+
int OneCharLen(const char* src) {
|
|
26
|
+
return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
} // namespace cld_UniLib
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
|
|
7
|
+
|
|
8
|
+
#if !defined(CLD_WINDOWS)
|
|
9
|
+
|
|
10
|
+
//#include "third_party/utf/utf.h"
|
|
11
|
+
|
|
12
|
+
#else
|
|
13
|
+
|
|
14
|
+
enum {
|
|
15
|
+
UTFmax = 4, // maximum bytes per rune
|
|
16
|
+
Runesync = 0x80, // cannot represent part of a UTF sequence (<)
|
|
17
|
+
Runeself = 0x80, // rune and UTF sequences are the same (<)
|
|
18
|
+
Runeerror = 0xFFFD, // decoding error in UTF
|
|
19
|
+
Runemax = 0x10FFFF, // maximum rune value
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
#endif
|
|
23
|
+
|
|
24
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"

#include <string.h>  // memcpy
|
|
6
|
+
|
|
7
|
+
// Return true if current Tbl pointer is within state0 range
|
|
8
|
+
// Note that unsigned compare checks both ends of range simultaneously
|
|
9
|
+
static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
|
|
10
|
+
const uint8* Tbl0 = &st->state_table[st->state0];
|
|
11
|
+
return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
// Look up property of one UTF-8 character and advance over it
|
|
16
|
+
// Return 0 if input length is zero
|
|
17
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
18
|
+
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
|
19
|
+
const uint8** src,
|
|
20
|
+
int* srclen) {
|
|
21
|
+
if (*srclen <= 0) {
|
|
22
|
+
return 0;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
const uint8* lsrc = *src;
|
|
26
|
+
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
27
|
+
const uint8* Tbl = Tbl_0;
|
|
28
|
+
int e;
|
|
29
|
+
int eshift = st->entry_shift;
|
|
30
|
+
|
|
31
|
+
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
32
|
+
unsigned char c = lsrc[0];
|
|
33
|
+
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
34
|
+
e = Tbl[c];
|
|
35
|
+
*src += 1;
|
|
36
|
+
*srclen -= 1;
|
|
37
|
+
} else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
|
|
38
|
+
e = Tbl[c];
|
|
39
|
+
Tbl = &Tbl_0[e << eshift];
|
|
40
|
+
e = Tbl[lsrc[1]];
|
|
41
|
+
*src += 2;
|
|
42
|
+
*srclen -= 2;
|
|
43
|
+
} else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
|
|
44
|
+
e = Tbl[c];
|
|
45
|
+
Tbl = &Tbl_0[e << eshift];
|
|
46
|
+
e = Tbl[lsrc[1]];
|
|
47
|
+
Tbl = &Tbl_0[e << eshift];
|
|
48
|
+
e = Tbl[lsrc[2]];
|
|
49
|
+
*src += 3;
|
|
50
|
+
*srclen -= 3;
|
|
51
|
+
}else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
|
|
52
|
+
e = Tbl[c];
|
|
53
|
+
Tbl = &Tbl_0[e << eshift];
|
|
54
|
+
e = Tbl[lsrc[1]];
|
|
55
|
+
Tbl = &Tbl_0[e << eshift];
|
|
56
|
+
e = Tbl[lsrc[2]];
|
|
57
|
+
Tbl = &Tbl_0[e << eshift];
|
|
58
|
+
e = Tbl[lsrc[3]];
|
|
59
|
+
*src += 4;
|
|
60
|
+
*srclen -= 4;
|
|
61
|
+
} else { // Ill-formed
|
|
62
|
+
e = 0;
|
|
63
|
+
*src += 1;
|
|
64
|
+
*srclen -= 1;
|
|
65
|
+
}
|
|
66
|
+
return e;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// BigOneByte versions are needed for tables > 240 states, but most
|
|
70
|
+
// won't need the TwoByte versions.
|
|
71
|
+
// Internally, to next-to-last offset is multiplied by 16 and the last
|
|
72
|
+
// offset is relative instead of absolute.
|
|
73
|
+
// Look up property of one UTF-8 character and advance over it
|
|
74
|
+
// Return 0 if input length is zero
|
|
75
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
76
|
+
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
|
77
|
+
const uint8** src,
|
|
78
|
+
int* srclen) {
|
|
79
|
+
if (*srclen <= 0) {
|
|
80
|
+
return 0;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const uint8* lsrc = *src;
|
|
84
|
+
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
85
|
+
const uint8* Tbl = Tbl_0;
|
|
86
|
+
int e;
|
|
87
|
+
int eshift = st->entry_shift;
|
|
88
|
+
|
|
89
|
+
// Short series of tests faster than switch, optimizes 7-bit ASCII
|
|
90
|
+
unsigned char c = lsrc[0];
|
|
91
|
+
if (static_cast<signed char>(c) >= 0) { // one byte
|
|
92
|
+
e = Tbl[c];
|
|
93
|
+
*src += 1;
|
|
94
|
+
*srclen -= 1;
|
|
95
|
+
} else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
|
|
96
|
+
e = Tbl[c];
|
|
97
|
+
Tbl = &Tbl_0[e << eshift];
|
|
98
|
+
e = Tbl[lsrc[1]];
|
|
99
|
+
*src += 2;
|
|
100
|
+
*srclen -= 2;
|
|
101
|
+
} else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
|
|
102
|
+
e = Tbl[c];
|
|
103
|
+
Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
|
|
104
|
+
e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
|
|
105
|
+
Tbl = &Tbl[e << eshift]; // Relative +/-
|
|
106
|
+
e = Tbl[lsrc[2]];
|
|
107
|
+
*src += 3;
|
|
108
|
+
*srclen -= 3;
|
|
109
|
+
}else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
|
|
110
|
+
e = Tbl[c];
|
|
111
|
+
Tbl = &Tbl_0[e << eshift];
|
|
112
|
+
e = Tbl[lsrc[1]];
|
|
113
|
+
Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
|
|
114
|
+
e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
|
|
115
|
+
Tbl = &Tbl[e << eshift]; // Relative +/-
|
|
116
|
+
e = Tbl[lsrc[3]];
|
|
117
|
+
*src += 4;
|
|
118
|
+
*srclen -= 4;
|
|
119
|
+
} else { // Ill-formed
|
|
120
|
+
e = 0;
|
|
121
|
+
*src += 1;
|
|
122
|
+
*srclen -= 1;
|
|
123
|
+
}
|
|
124
|
+
return e;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
|
128
|
+
// Always scan complete UTF-8 characters
|
|
129
|
+
// Set number of bytes scanned. Return reason for exiting
|
|
130
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
|
131
|
+
const uint8* str,
|
|
132
|
+
const int len,
|
|
133
|
+
int* bytes_consumed) {
|
|
134
|
+
int eshift = st->entry_shift; // 6 (space optimized) or 8
|
|
135
|
+
// int nEntries = (1 << eshift); // 64 or 256 entries per state
|
|
136
|
+
|
|
137
|
+
const uint8* isrc = str;
|
|
138
|
+
//reinterpret_cast<const uint8*>(str.data());
|
|
139
|
+
const uint8* src = isrc;
|
|
140
|
+
//const int len = str.length();
|
|
141
|
+
const uint8* srclimit = isrc + len;
|
|
142
|
+
const uint8* srclimit8 = srclimit - 7;
|
|
143
|
+
*bytes_consumed = 0;
|
|
144
|
+
if (len == 0) return kExitOK;
|
|
145
|
+
|
|
146
|
+
const uint8* Tbl_0 = &st->state_table[st->state0];
|
|
147
|
+
|
|
148
|
+
DoAgain:
|
|
149
|
+
// Do state-table scan
|
|
150
|
+
int e = 0;
|
|
151
|
+
uint8 c;
|
|
152
|
+
|
|
153
|
+
// Do fast for groups of 8 identity bytes.
|
|
154
|
+
// This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
|
|
155
|
+
// including slowing slightly on cr/lf/ht
|
|
156
|
+
//----------------------------
|
|
157
|
+
const uint8* Tbl2 = &st->fast_state[0];
|
|
158
|
+
uint32 losub = st->losub;
|
|
159
|
+
uint32 hiadd = st->hiadd;
|
|
160
|
+
while (src < srclimit8) {
|
|
161
|
+
uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
|
|
162
|
+
uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
|
|
163
|
+
src += 8;
|
|
164
|
+
// This is a fast range check for all bytes in [lowsub..0x80-hiadd)
|
|
165
|
+
uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
|
|
166
|
+
(s4567 - losub) | (s4567 + hiadd);
|
|
167
|
+
if ((temp & 0x80808080) != 0) {
|
|
168
|
+
// We typically end up here on cr/lf/ht; src was incremented
|
|
169
|
+
int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
|
|
170
|
+
(Tbl2[src[-6]] | Tbl2[src[-5]]);
|
|
171
|
+
if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
|
|
172
|
+
e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
|
|
173
|
+
(Tbl2[src[-2]] | Tbl2[src[-1]]);
|
|
174
|
+
if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
|
|
175
|
+
// Else OK, go around again
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
//----------------------------
|
|
179
|
+
|
|
180
|
+
// Byte-at-a-time scan
|
|
181
|
+
//----------------------------
|
|
182
|
+
const uint8* Tbl = Tbl_0;
|
|
183
|
+
while (src < srclimit) {
|
|
184
|
+
c = *src;
|
|
185
|
+
e = Tbl[c];
|
|
186
|
+
src++;
|
|
187
|
+
if (e >= kExitIllegalStructure) {break;}
|
|
188
|
+
Tbl = &Tbl_0[e << eshift];
|
|
189
|
+
}
|
|
190
|
+
//----------------------------
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
// Exit posibilities:
|
|
194
|
+
// Some exit code, !state0, back up over last char
|
|
195
|
+
// Some exit code, state0, back up one byte exactly
|
|
196
|
+
// source consumed, !state0, back up over partial char
|
|
197
|
+
// source consumed, state0, exit OK
|
|
198
|
+
// For illegal byte in state0, avoid backup up over PREVIOUS char
|
|
199
|
+
// For truncated last char, back up to beginning of it
|
|
200
|
+
|
|
201
|
+
if (e >= kExitIllegalStructure) {
|
|
202
|
+
// Back up over exactly one byte of rejected/illegal UTF-8 character
|
|
203
|
+
src--;
|
|
204
|
+
// Back up more if needed
|
|
205
|
+
if (!InStateZero(st, Tbl)) {
|
|
206
|
+
do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
207
|
+
}
|
|
208
|
+
} else if (!InStateZero(st, Tbl)) {
|
|
209
|
+
// Back up over truncated UTF-8 character
|
|
210
|
+
e = kExitIllegalStructure;
|
|
211
|
+
do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
|
|
212
|
+
} else {
|
|
213
|
+
// Normal termination, source fully consumed
|
|
214
|
+
e = kExitOK;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
if (e == kExitDoAgain) {
|
|
218
|
+
// Loop back up to the fast scan
|
|
219
|
+
goto DoAgain;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
*bytes_consumed = src - isrc;
|
|
223
|
+
return e;
|
|
224
|
+
}
|