cld 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +27 -0
- data/Manifest +106 -0
- data/README.rdoc +173 -0
- data/Rakefile +15 -0
- data/base/basictypes.h +348 -0
- data/base/build_config.h +115 -0
- data/base/casts.h +156 -0
- data/base/commandlineflags.h +443 -0
- data/base/crash.h +41 -0
- data/base/dynamic_annotations.h +358 -0
- data/base/global_strip_options.h +59 -0
- data/base/log_severity.h +46 -0
- data/base/logging.h +1403 -0
- data/base/macros.h +243 -0
- data/base/port.h +54 -0
- data/base/scoped_ptr.h +428 -0
- data/base/stl_decl.h +0 -0
- data/base/stl_decl_msvc.h +107 -0
- data/base/string_util.h +29 -0
- data/base/strtoint.h +93 -0
- data/base/template_util.h +96 -0
- data/base/type_traits.h +198 -0
- data/base/vlog_is_on.h +143 -0
- data/build.sh +48 -0
- data/build.win.cmd +28 -0
- data/cld.gemspec +30 -0
- data/cld_encodings.h +95 -0
- data/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/encodings/compact_lang_det/#tote.cc# +299 -0
- data/encodings/compact_lang_det/#tote.h# +89 -0
- data/encodings/compact_lang_det/cldutil.cc +905 -0
- data/encodings/compact_lang_det/cldutil.h +1205 -0
- data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/encodings/compact_lang_det/compile.cmd +1 -0
- data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/encodings/compact_lang_det/tote.cc +299 -0
- data/encodings/compact_lang_det/tote.h +89 -0
- data/encodings/compact_lang_det/unittest_data.h +193 -0
- data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/encodings/internal/encodings.cc +12 -0
- data/encodings/lang_enc.h +254 -0
- data/encodings/proto/encodings.pb.h +169 -0
- data/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/languages/internal/#languages.cc# +337 -0
- data/languages/internal/languages.cc +337 -0
- data/languages/proto/languages.pb.h +179 -0
- data/languages/public/languages.h +379 -0
- data/lib/cld.rb +12 -0
- data/test/test.rb +570 -0
- data/thunk.cc +131 -0
- metadata +168 -0
@@ -0,0 +1,29 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
6
|
+
|
7
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
8
|
+
|
9
|
+
namespace cld_UniLib {

// Length in bytes of a UTF-8 character, looked up by the value of its first
// byte: 0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xFF -> 4.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00 - 0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x80 - 0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0 - 0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0 - 0xEF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
  // 0xF0 - 0xFF
  4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};

// Returns the table entry for the byte src points at, i.e. the length of the
// UTF-8 character that starts there. Only that one byte is read.
int OneCharLen(const char* src) {
  const unsigned char lead_byte = static_cast<unsigned char>(*src);
  return kUTF8LenTbl[lead_byte];
}

}  // namespace cld_UniLib
|
@@ -0,0 +1,10 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
|
7
|
+
|
8
|
+
#include "base/basictypes.h"
|
9
|
+
|
10
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
|
@@ -0,0 +1,28 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
|
7
|
+
|
8
|
+
#if !defined(CLD_WINDOWS)
|
9
|
+
|
10
|
+
#include "base/commandlineflags.h"
|
11
|
+
|
12
|
+
#else
|
13
|
+
|
14
|
+
// Stand-ins for the flag macros used when CLD_WINDOWS is defined: every flag
// becomes a constant named FLAGS_<name>. The comment argument is accepted and
// discarded. Each expansion already ends in ';'.
#undef DEFINE_bool
#define DEFINE_bool(name, default_value, comment) \
    const bool FLAGS_##name = default_value;
#undef DEFINE_int32
#define DEFINE_int32(name, default_value, comment) \
    const int32 FLAGS_##name = default_value;

// The declarations carry the same const qualification as the definitions
// above; a non-const extern declaration of a const object is a conflicting
// declaration.
#undef DECLARE_bool
#define DECLARE_bool(name) extern const bool FLAGS_##name;
#undef DECLARE_int32
#define DECLARE_int32(name) extern const int32 FLAGS_##name;
|
25
|
+
|
26
|
+
#endif
|
27
|
+
|
28
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
|
@@ -0,0 +1,18 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
|
7
|
+
|
8
|
+
#if !defined(CLD_WINDOWS)
|
9
|
+
|
10
|
+
#include "base/google.h"
|
11
|
+
|
12
|
+
#else
|
13
|
+
|
14
|
+
// Include nothing
|
15
|
+
|
16
|
+
#endif
|
17
|
+
|
18
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
|
@@ -0,0 +1,13 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
|
7
|
+
|
8
|
+
// Src points to '&'
|
9
|
+
// Writes entity value to dst. Returns take(src), put(dst) byte counts
|
10
|
+
void EntityToBuffer(const char* src, int len, char* dst,
|
11
|
+
int* tlen, int* plen);
|
12
|
+
|
13
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
|
@@ -0,0 +1,32 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
// Author: alekseys@google.com (Aleksey Shlyapnikov)
|
6
|
+
|
7
|
+
// This code is not actually used; it was copied here for reference only.
|
8
|
+
// See cld_htmlutils_windows.cc for Windows version of this code.
|
9
|
+
|
10
|
+
#include "cld/encodings/compact_lang_det/win/cld_htmlutils.h"
|
11
|
+
|
12
|
+
#include "cld/third_party/utf/utf.h" // for runetochar
|
13
|
+
#include "cld/webutil/html/htmlutils.h" // for ReadEntity
|
14
|
+
|
15
|
+
// Copied from getonescriptspan.cc
|
16
|
+
|
17
|
+
// Src points to '&'
|
18
|
+
// Writes entity value to dst. Returns take(src), put(dst) byte counts
|
19
|
+
void EntityToBuffer(const char* src, int len, char* dst,
|
20
|
+
int* tlen, int* plen) {
|
21
|
+
char32 entval = HtmlUtils::ReadEntity(src, len, tlen);
|
22
|
+
// ReadEntity does this already: entval = FixUnicodeValue(entval);
|
23
|
+
|
24
|
+
if (entval > 0) {
|
25
|
+
*plen = runetochar(dst, &entval);
|
26
|
+
} else {
|
27
|
+
// Illegal entity; ignore the '&'
|
28
|
+
*tlen = 1;
|
29
|
+
*plen = 0;
|
30
|
+
}
|
31
|
+
// fprintf(stderr,"t%d p%d]\n", *tlen, *plen);
|
32
|
+
}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/win/cld_htmlutils.h"
|
6
|
+
|
7
|
+
// Src points to '&'
// Skips the entity that starts at src and reports a single space in its place.
//   src  - points at the '&' that starts the entity.
//   len  - number of readable bytes at src; the scan never looks past it.
//   dst  - receives the one-byte replacement (' ').
//   tlen - receives the number of source bytes taken: everything from '&'
//          through the closing ';'. If no ';' is found before a NUL byte or
//          the end of the len bytes, everything scanned so far is taken and
//          the NUL itself is not counted.
//   plen - receives the number of bytes put into dst (always 1).
void EntityToBuffer(const char* src, int len, char* dst,
                    int* tlen, int* plen) {
  // On Windows we do not have to do anything, browser expands HTML entities
  // for us, so text we're retrieving from it is ready for translation as it is.
  // But:

  // This is a temporary solution to let us continue the development without
  // having a real DOM text scraping in place. For now the full HTML is fed
  // to CLD for language detection and just ignoring entities is good enough
  // for testing. Later entities will be expanded by browser itself.

  // Skip entity in the source. The '&' itself is always taken.
  int taken = 1;
  while (taken < len && src[taken] != '\0' && src[taken] != ';') {
    ++taken;
  }
  // Take the terminating ';' too, but only if it lies inside the buffer.
  if (taken < len && src[taken] == ';') {
    ++taken;
  }
  *tlen = taken;

  // Report a bogus entity (space).
  *dst = ' ';
  *plen = 1;
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
|
7
|
+
|
8
|
+
#if !defined(CLD_WINDOWS)
|
9
|
+
|
10
|
+
#include "base/logging.h"
|
11
|
+
|
12
|
+
#else
|
13
|
+
|
14
|
+
// When CLD_WINDOWS is defined, CHECK and DCHECK are redefined to expand to
// nothing. The argument is dropped by the preprocessor, so expr is never
// evaluated and any side effects inside it do not happen.
#undef CHECK
|
15
|
+
#define CHECK(expr)
|
16
|
+
#undef DCHECK
|
17
|
+
#define DCHECK(expr)
|
18
|
+
|
19
|
+
#endif
|
20
|
+
|
21
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
|
@@ -0,0 +1,19 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
|
7
|
+
|
8
|
+
#include "base/macros.h"
|
9
|
+
|
10
|
+
// Checks for Win32 result and if it indicates failure, returns it.
// cmd is evaluated exactly once; a non-zero result is returned from the
// enclosing function. The expansion is a single statement with no trailing
// semicolon, so write `RETURN_IF_ERROR(cmd);` -- this keeps the macro usable
// as the body of an if/else.
#define RETURN_IF_ERROR(cmd) \
  do { \
    DWORD result_ = (cmd); \
    if (0 != result_) \
      return result_; \
  } while (0)
|
18
|
+
|
19
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
|
@@ -0,0 +1,26 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
|
7
|
+
|
8
|
+
#if !defined(CLD_WINDOWS)
|
9
|
+
|
10
|
+
//#include "cld/base/strtoint.h"
|
11
|
+
|
12
|
+
#else
|
13
|
+
|
14
|
+
#include <stdlib.h>
|
15
|
+
|
16
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
17
|
+
|
18
|
+
// This implementation is not as good as the one in base/strtoint.h,
|
19
|
+
// but it's sufficient for our purposes.
|
20
|
+
inline int32 strto32(const char *nptr, char **endptr, int base) {
|
21
|
+
return static_cast<int32>(strtol(nptr, endptr, base));
|
22
|
+
}
|
23
|
+
|
24
|
+
#endif
|
25
|
+
|
26
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
|
@@ -0,0 +1,84 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/win/cld_unicodetext.h"
|
6
|
+
|
7
|
+
#include <string>
|
8
|
+
#include <vector> // to compile bar/common/component.h
|
9
|
+
|
10
|
+
#include "encodings/compact_lang_det/compact_lang_det.h"
|
11
|
+
#include "base/string_util.h"
|
12
|
+
#include "unicode/normlzr.h"
|
13
|
+
#include "unicode/unistr.h"
|
14
|
+
#include "unicode/ustring.h"
|
15
|
+
|
16
|
+
/*
|
17
|
+
std::string NormalizeText(const UChar* text) {
|
18
|
+
// To avoid a copy, use the read-only aliasing ctor.
|
19
|
+
icu::UnicodeString source(1, text, -1);
|
20
|
+
icu::UnicodeString normalized;
|
21
|
+
UErrorCode status = U_ZERO_ERROR;
|
22
|
+
icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
|
23
|
+
if (U_FAILURE(status))
|
24
|
+
return std::string();
|
25
|
+
normalized.toLower();
|
26
|
+
std::string utf8;
|
27
|
+
// Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
|
28
|
+
// for most web pages) and does pre-flighting followed by malloc for larger
|
29
|
+
// strings. We have to switch to obtaining the buffer with the maximum size
|
30
|
+
// (UTF-16 length * 3) without pre-flighting if necessary.
|
31
|
+
return normalized.toUTF8String(utf8);
|
32
|
+
}
|
33
|
+
*/
|
34
|
+
|
35
|
+
|
36
|
+
// Detects a language of the UTF-16 encoded zero-terminated text.
|
37
|
+
// Returns: Language enum.
|
38
|
+
Language DetectLanguageOfUnicodeText(
|
39
|
+
const CompactLangDet::DetectionTables* detection_tables,
|
40
|
+
const UChar* text, bool is_plain_text,
|
41
|
+
bool* is_reliable, int* num_languages,
|
42
|
+
int* error_code, int* text_bytes) {
|
43
|
+
if (!text || !num_languages)
|
44
|
+
return NUM_LANGUAGES;
|
45
|
+
// Normalize text to NFC, lowercase and convert to UTF-8.
|
46
|
+
std::string utf8_encoded = NormalizeText(text);
|
47
|
+
if (utf8_encoded.empty())
|
48
|
+
return NUM_LANGUAGES;
|
49
|
+
|
50
|
+
// Engage core CLD library language detection.
|
51
|
+
Language language3[3] = {
|
52
|
+
UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
|
53
|
+
};
|
54
|
+
int percent3[3] = { 0, 0, 0 };
|
55
|
+
int text_bytes_tmp = 0;
|
56
|
+
// We ignore return value here due to the problem described in bug 1800161.
|
57
|
+
// For example, translate.google.com was detected as Indonesian. It happened
|
58
|
+
// due to the heuristic in CLD, which ignores English as a top language
|
59
|
+
// in the presence of another reliably detected language.
|
60
|
+
// See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
|
61
|
+
// language3 array is always set according to the detection results and
|
62
|
+
// is not affected by this heuristic.
|
63
|
+
CompactLangDet::DetectLanguageSummary(detection_tables,
|
64
|
+
utf8_encoded.c_str(),
|
65
|
+
utf8_encoded.length(),
|
66
|
+
is_plain_text, language3, percent3,
|
67
|
+
&text_bytes_tmp, is_reliable);
|
68
|
+
|
69
|
+
// Calcualte a number of languages detected in more than 20% of the text.
|
70
|
+
const int kMinTextPercentToCountLanguage = 20;
|
71
|
+
*num_languages = 0;
|
72
|
+
if (text_bytes)
|
73
|
+
*text_bytes = text_bytes_tmp;
|
74
|
+
COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
|
75
|
+
language3_and_percent3_should_be_of_the_same_size);
|
76
|
+
for (int i = 0; i < arraysize(language3); ++i) {
|
77
|
+
if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
|
78
|
+
percent3[i] >= kMinTextPercentToCountLanguage) {
|
79
|
+
++*num_languages;
|
80
|
+
}
|
81
|
+
}
|
82
|
+
|
83
|
+
return language3[0];
|
84
|
+
}
|
@@ -0,0 +1,40 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
|
7
|
+
|
8
|
+
#include "languages/public/languages.h"
|
9
|
+
#include "unicode/utypes.h"
|
10
|
+
|
11
|
+
namespace CompactLangDet {
|
12
|
+
struct DetectionTables;
|
13
|
+
} // namespace CompactLangDet
|
14
|
+
|
15
|
+
// Detects a language of the UTF-16 encoded zero-terminated text.
|
16
|
+
// [in] detection_tables - internal CLD data tables (see compact_lang_det.h).
|
17
|
+
// Can be NULL, in this case CLD will fall back to builtin static tables.
|
18
|
+
// [in] text - UTF-16 encoded text to detect a language of.
|
19
|
+
// [in] is_plain_text - true if plain text, false otherwise (e.g. HTML).
|
20
|
+
// [out] is_reliable - true, if returned language was detected reliably.
|
21
|
+
// See compact_lang_det.h for details.
|
22
|
+
// [out] num_languages - set to the number of languages detected on the page.
|
23
|
+
// Language counts only if it's detected in more than 20% of the text.
|
24
|
+
// [out, optional] error_code - set to 0 in case of success, Windows
|
25
|
+
// GetLastError() code otherwise. Pass NULL, if not interested in errors.
|
26
|
+
// See encodings/compact_lang_det/compact_lang_det.h,
|
27
|
+
// CompactLangDet::DetectLanguage() for a description of the remaining
|
28
|
+
// parameters.
|
29
|
+
// Returns: Language enum.
|
30
|
+
// Returns NUM_LANGUAGES in case of any error.
|
31
|
+
// See googleclient/languages/internal/languages.cc
|
32
|
+
// for details.
|
33
|
+
Language DetectLanguageOfUnicodeText(
|
34
|
+
const CompactLangDet::DetectionTables* detection_tables,
|
35
|
+
const UChar* text, bool is_plain_text,
|
36
|
+
bool* is_reliable, int* num_languages,
|
37
|
+
int* error_code, int* text_bytes);
|
38
|
+
|
39
|
+
|
40
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
|
@@ -0,0 +1,15 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
|
7
|
+
|
8
|
+
namespace cld_UniLib {
|
9
|
+
|
10
|
+
// Return length of a single UTF-8 source character
|
11
|
+
int OneCharLen(const char* src);
|
12
|
+
|
13
|
+
} // namespace cld_UniLib
|
14
|
+
|
15
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
|
@@ -0,0 +1,18 @@
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
// This code is not actually used; it was copied here for reference only.
|
6
|
+
// See cld_unilib_windows.cc for Windows version of this code.
|
7
|
+
|
8
|
+
#include "i18n/encodings/compact_lang_det/cld_unilib.h"
|
9
|
+
|
10
|
+
#include "util/utf8/unilib.h"
|
11
|
+
|
12
|
+
namespace cld_UniLib {
|
13
|
+
|
14
|
+
int OneCharLen(const char* src) {
|
15
|
+
return UniLib::OneCharLen(src);
|
16
|
+
}
|
17
|
+
|
18
|
+
} // namespace cld_UniLib
|
@@ -0,0 +1,29 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
6
|
+
|
7
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
8
|
+
|
9
|
+
namespace cld_UniLib {

// Length in bytes of a UTF-8 character, looked up by the value of its first
// byte: 0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xFF -> 4.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00 - 0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x80 - 0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0 - 0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0 - 0xEF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
  // 0xF0 - 0xFF
  4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};

// Returns the table entry for the byte src points at, i.e. the length of the
// UTF-8 character that starts there. Only that one byte is read.
int OneCharLen(const char* src) {
  const unsigned char lead_byte = static_cast<unsigned char>(*src);
  return kUTF8LenTbl[lead_byte];
}

}  // namespace cld_UniLib
|
@@ -0,0 +1,24 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
|
7
|
+
|
8
|
+
#if !defined(CLD_WINDOWS)
|
9
|
+
|
10
|
+
//#include "third_party/utf/utf.h"
|
11
|
+
|
12
|
+
#else
|
13
|
+
|
14
|
+
// UTF/rune constants provided here when CLD_WINDOWS is defined.
enum {
  // maximum bytes per rune
  UTFmax = 4,

  // cannot represent part of a UTF sequence (<)
  Runesync = 0x80,

  // rune and UTF sequences are the same (<)
  Runeself = 0x80,

  // decoding error in UTF
  Runeerror = 0xFFFD,

  // maximum rune value
  Runemax = 0x10FFFF,
};
|
21
|
+
|
22
|
+
#endif
|
23
|
+
|
24
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
|