language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,10 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
7
+
8
+ #include "base/basictypes.h"
9
+
10
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
@@ -0,0 +1,28 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ #include "base/commandlineflags.h"
11
+
12
+ #else
13
+
14
// Minimal stand-ins for the command-line flag macros. A "flag" is simply a
// compile-time constant named FLAGS_<name>; the |comment| argument is
// accepted for source compatibility and discarded.
#undef DEFINE_bool
#define DEFINE_bool(name, default_value, comment) \
    const bool FLAGS_##name = default_value;
#undef DEFINE_int32
#define DEFINE_int32(name, default_value, comment) \
    const int32 FLAGS_##name = default_value;

// Declarations for flags defined in another translation unit. Both are
// const-qualified so that a DECLARE_* and its matching DEFINE_* agree on the
// type of FLAGS_<name>.
#undef DECLARE_bool
#define DECLARE_bool(name) extern const bool FLAGS_##name;
#undef DECLARE_int32
#define DECLARE_int32(name) extern const int32 FLAGS_##name;
25
+
26
+ #endif
27
+
28
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
@@ -0,0 +1,18 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ #include "base/google.h"
11
+
12
+ #else
13
+
14
+ // Include nothing
15
+
16
+ #endif
17
+
18
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
@@ -0,0 +1,13 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
7
+
8
+ // Src points to '&'
9
+ // Writes entity value to dst. Returns take(src), put(dst) byte counts
10
+ void EntityToBuffer(const char* src, int len, char* dst,
11
+ int* tlen, int* plen);
12
+
13
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
@@ -0,0 +1,32 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Author: alekseys@google.com (Aleksey Shlyapnikov)
6
+
7
+ // This code is not actually used, it was copied here for the reference only.
8
+ // See cld_htmlutils_windows.cc for Windows version of this code.
9
+
10
+ #include "cld/encodings/compact_lang_det/win/cld_htmlutils.h"
11
+
12
+ #include "cld/third_party/utf/utf.h" // for runetochar
13
+ #include "cld/webutil/html/htmlutils.h" // for ReadEntity
14
+
15
+ // Copied from getonescriptspan.cc
16
+
17
+ // Src points to '&'
18
+ // Writes entity value to dst. Returns take(src), put(dst) byte counts
19
+ void EntityToBuffer(const char* src, int len, char* dst,
20
+ int* tlen, int* plen) {
21
+ char32 entval = HtmlUtils::ReadEntity(src, len, tlen);
22
+ // ReadEntity does this already: entval = FixUnicodeValue(entval);
23
+
24
+ if (entval > 0) {
25
+ *plen = runetochar(dst, &entval);
26
+ } else {
27
+ // Illegal entity; ignore the '&'
28
+ *tlen = 1;
29
+ *plen = 0;
30
+ }
31
+ // fprintf(stderr,"t%d p%d]\n", *tlen, *plen);
32
+ }
@@ -0,0 +1,29 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_htmlutils.h"
6
+
7
// Src points to '&'.
// Skips the entity in the source and writes a single space to dst in its
// place; the entity itself is not decoded.
//
//   src  - start of the entity; src[0] is expected to be the '&'.
//   len  - number of readable bytes at src. The scan never reads past it.
//   dst  - receives one byte (a space) whenever len > 0.
//   tlen - out: bytes taken from src; in [1, len] whenever len > 0.
//   plen - out: bytes put into dst.
void EntityToBuffer(const char* src, int len, char* dst,
                    int* tlen, int* plen) {
  // On Windows we do not have to do anything, browser expands HTML entities
  // for us, so text we're retrieving from it is ready for translation as it is.
  // But:

  // This is a temporary solution to let us continue the development without
  // having a real DOM text scraping in place. For now the full HTML is fed
  // to CLD for language detection and just ignoring entities is good enough
  // for testing. Later entities will be expanded by browser itself.

  if (len <= 0) {
    // Nothing readable: take nothing and put nothing.
    *tlen = 0;
    *plen = 0;
    return;
  }

  // Skip entity in the source: the '&' itself, then every byte up to the
  // terminator (';' or NUL), without ever looking beyond src[len - 1].
  int taken = 1;
  while (taken < len && src[taken] != '\0' && src[taken] != ';') {
    ++taken;
  }
  // The terminator is consumed too, but only if it lies inside the buffer.
  if (taken < len) {
    ++taken;
  }
  *tlen = taken;

  // Report a bogus entity (space).
  *dst = ' ';
  *plen = 1;
}
@@ -0,0 +1,21 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ #include "base/logging.h"
11
+
12
+ #else
13
+
14
+ #undef CHECK
15
+ #define CHECK(expr)
16
+ #undef DCHECK
17
+ #define DCHECK(expr)
18
+
19
+ #endif
20
+
21
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
@@ -0,0 +1,19 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
7
+
8
+ #include "base/macros.h"
9
+
10
// Evaluates |cmd| exactly once as a Win32 result code and, if it is non-zero
// (failure), returns it from the enclosing function.
// The expansion deliberately carries no trailing semicolon: the caller's own
// ';' completes the do/while, so "RETURN_IF_ERROR(x);" is a single statement
// and can be used as the body of an if that has an else.
#define RETURN_IF_ERROR(cmd) \
  do { \
    DWORD result_ = (cmd); \
    if (0 != result_) \
      return result_; \
  } while (0)
18
+
19
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
@@ -0,0 +1,26 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ //#include "cld/base/strtoint.h"
11
+
12
+ #else
13
+
14
+ #include <stdlib.h>
15
+
16
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
17
+
18
+ // This implementation is not as good as the one in base/strtoint.h,
19
+ // but it's sufficient for our purposes.
20
+ inline int32 strto32(const char *nptr, char **endptr, int base) {
21
+ return static_cast<int32>(strtol(nptr, endptr, base));
22
+ }
23
+
24
+ #endif
25
+
26
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
@@ -0,0 +1,84 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_unicodetext.h"
6
+
7
+ #include <string>
8
+ #include <vector> // to compile bar/common/component.h
9
+
10
+ #include "encodings/compact_lang_det/compact_lang_det.h"
11
+ #include "base/string_util.h"
12
+ #include "unicode/normlzr.h"
13
+ #include "unicode/unistr.h"
14
+ #include "unicode/ustring.h"
15
+
16
+ /*
17
+ std::string NormalizeText(const UChar* text) {
18
+ // To avoid a copy, use the read-only aliasing ctor.
19
+ icu::UnicodeString source(1, text, -1);
20
+ icu::UnicodeString normalized;
21
+ UErrorCode status = U_ZERO_ERROR;
22
+ icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
23
+ if (U_FAILURE(status))
24
+ return std::string();
25
+ normalized.toLower();
26
+ std::string utf8;
27
+ // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
28
+ // for most web pages) and does pre-flighting followed by malloc for larger
29
+ // strings. We have to switch to obtaining the buffer with the maximum size
30
+ // (UTF-16 length * 3) without pre-flighting if necessary.
31
+ return normalized.toUTF8String(utf8);
32
+ }
33
+ */
34
+
35
+
36
+ // Detects a language of the UTF-16 encoded zero-terminated text.
37
+ // Returns: Language enum.
38
+ Language DetectLanguageOfUnicodeText(
39
+ const CompactLangDet::DetectionTables* detection_tables,
40
+ const UChar* text, bool is_plain_text,
41
+ bool* is_reliable, int* num_languages,
42
+ int* error_code, int* text_bytes) {
43
+ if (!text || !num_languages)
44
+ return NUM_LANGUAGES;
45
+ // Normalize text to NFC, lowercase and convert to UTF-8.
46
+ std::string utf8_encoded = NormalizeText(text);
47
+ if (utf8_encoded.empty())
48
+ return NUM_LANGUAGES;
49
+
50
+ // Engage core CLD library language detection.
51
+ Language language3[3] = {
52
+ UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
53
+ };
54
+ int percent3[3] = { 0, 0, 0 };
55
+ int text_bytes_tmp = 0;
56
+ // We ignore return value here due to the problem described in bug 1800161.
57
+ // For example, translate.google.com was detected as Indonesian. It happened
58
+ // due to the heuristic in CLD, which ignores English as a top language
59
+ // in the presence of another reliably detected language.
60
+ // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
61
+ // language3 array is always set according to the detection results and
62
+ // is not affected by this heuristic.
63
+ CompactLangDet::DetectLanguageSummary(detection_tables,
64
+ utf8_encoded.c_str(),
65
+ utf8_encoded.length(),
66
+ is_plain_text, language3, percent3,
67
+ &text_bytes_tmp, is_reliable);
68
+
69
+ // Calcualte a number of languages detected in more than 20% of the text.
70
+ const int kMinTextPercentToCountLanguage = 20;
71
+ *num_languages = 0;
72
+ if (text_bytes)
73
+ *text_bytes = text_bytes_tmp;
74
+ COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
75
+ language3_and_percent3_should_be_of_the_same_size);
76
+ for (int i = 0; i < arraysize(language3); ++i) {
77
+ if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
78
+ percent3[i] >= kMinTextPercentToCountLanguage) {
79
+ ++*num_languages;
80
+ }
81
+ }
82
+
83
+ return language3[0];
84
+ }
@@ -0,0 +1,40 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
7
+
8
+ #include "languages/public/languages.h"
9
+ #include "unicode/utypes.h"
10
+
11
+ namespace CompactLangDet {
12
+ struct DetectionTables;
13
+ } // namespace CompactLangDet
14
+
15
+ // Detects a language of the UTF-16 encoded zero-terminated text.
16
+ // [in] detection_tables - internal CLD data tables (see compact_lang_det.h).
17
+ // Can be NULL, in this case CLD will fall back to builtin static tables.
18
+ // [in] text - UTF-16 encoded text to detect a language of.
19
+ // [in] is_plain_text - true if plain text, false otherwise (e.g. HTML).
20
+ // [out] is_reliable - true, if returned language was detected reliably.
21
+ // See compact_lang_det.h for details.
22
+ // [out] num_languages - set to the number of languages detected on the page.
23
+ // Language counts only if it's detected in more than 20% of the text.
24
+ // [out, optional] error_code - set to 0 in case of success, Windows
25
+ // GetLastError() code otherwise. Pass NULL, if not interested in errors.
26
+ // See encodings/compact_lang_det/compact_lang_det.h,
27
+ // CompactLangDet::DetectLanguage() description for other input parameters
28
+ // description.
29
+ // Returns: Language enum.
30
+ // Returns NUM_LANGUAGES in case of any error.
31
+ // See googleclient/languages/internal/languages.cc
32
+ // for details.
33
+ Language DetectLanguageOfUnicodeText(
34
+ const CompactLangDet::DetectionTables* detection_tables,
35
+ const UChar* text, bool is_plain_text,
36
+ bool* is_reliable, int* num_languages,
37
+ int* error_code, int* text_bytes);
38
+
39
+
40
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
@@ -0,0 +1,15 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
7
+
8
+ namespace cld_UniLib {
9
+
10
+ // Return length of a single UTF-8 source character
11
+ int OneCharLen(const char* src);
12
+
13
+ } // namespace cld_UniLib
14
+
15
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
@@ -0,0 +1,18 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // This code is not actually used, it was copied here for the reference only.
6
+ // See cld_unilib_windows.cc for Windows version of this code.
7
+
8
+ #include "i18n/encodings/compact_lang_det/cld_unilib.h"
9
+
10
+ #include "util/utf8/unilib.h"
11
+
12
+ namespace cld_UniLib {
13
+
14
+ int OneCharLen(const char* src) {
15
+ return UniLib::OneCharLen(src);
16
+ }
17
+
18
+ } // namespace cld_UniLib
@@ -0,0 +1,29 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_unilib.h"
6
+
7
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
8
+
9
+ namespace cld_UniLib {
10
+
11
+ // Table of UTF-8 character lengths, based on first byte
12
+ static const unsigned char kUTF8LenTbl[256] = {
13
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
14
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
15
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
16
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
17
+
18
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
19
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
20
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
21
+ 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
22
+ };
23
+
24
+ // Return length of a single UTF-8 source character
25
+ int OneCharLen(const char* src) {
26
+ return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
27
+ }
28
+
29
+ } // namespace cld_UniLib
@@ -0,0 +1,24 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ //#include "third_party/utf/utf.h"
11
+
12
+ #else
13
+
14
// UTF-8 / rune constants.
enum {
  UTFmax    = 4,         // maximum bytes per rune
  Runesync  = 0x80,      // cannot represent part of a UTF sequence (<)
  Runeself  = 0x80,      // rune and UTF sequences are the same (<)
  Runeerror = 0xFFFD,    // decoding error in UTF
  Runemax   = 0x10FFFF   // maximum rune value
};
21
+
22
+ #endif
23
+
24
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
@@ -0,0 +1,224 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"

#include <string.h>  // memcpy
6
+
7
+ // Return true if current Tbl pointer is within state0 range
8
+ // Note that unsigned compare checks both ends of range simultaneously
9
+ static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
10
+ const uint8* Tbl0 = &st->state_table[st->state0];
11
+ return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
12
+ }
13
+
14
+
15
+ // Look up property of one UTF-8 character and advance over it
16
+ // Return 0 if input length is zero
17
+ // Return 0 and advance one byte if input is ill-formed
18
+ uint8 UTF8GenericProperty(const UTF8PropObj* st,
19
+ const uint8** src,
20
+ int* srclen) {
21
+ if (*srclen <= 0) {
22
+ return 0;
23
+ }
24
+
25
+ const uint8* lsrc = *src;
26
+ const uint8* Tbl_0 = &st->state_table[st->state0];
27
+ const uint8* Tbl = Tbl_0;
28
+ int e;
29
+ int eshift = st->entry_shift;
30
+
31
+ // Short series of tests faster than switch, optimizes 7-bit ASCII
32
+ unsigned char c = lsrc[0];
33
+ if (static_cast<signed char>(c) >= 0) { // one byte
34
+ e = Tbl[c];
35
+ *src += 1;
36
+ *srclen -= 1;
37
+ } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
38
+ e = Tbl[c];
39
+ Tbl = &Tbl_0[e << eshift];
40
+ e = Tbl[lsrc[1]];
41
+ *src += 2;
42
+ *srclen -= 2;
43
+ } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
44
+ e = Tbl[c];
45
+ Tbl = &Tbl_0[e << eshift];
46
+ e = Tbl[lsrc[1]];
47
+ Tbl = &Tbl_0[e << eshift];
48
+ e = Tbl[lsrc[2]];
49
+ *src += 3;
50
+ *srclen -= 3;
51
+ }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
52
+ e = Tbl[c];
53
+ Tbl = &Tbl_0[e << eshift];
54
+ e = Tbl[lsrc[1]];
55
+ Tbl = &Tbl_0[e << eshift];
56
+ e = Tbl[lsrc[2]];
57
+ Tbl = &Tbl_0[e << eshift];
58
+ e = Tbl[lsrc[3]];
59
+ *src += 4;
60
+ *srclen -= 4;
61
+ } else { // Ill-formed
62
+ e = 0;
63
+ *src += 1;
64
+ *srclen -= 1;
65
+ }
66
+ return e;
67
+ }
68
+
69
+ // BigOneByte versions are needed for tables > 240 states, but most
70
+ // won't need the TwoByte versions.
71
+ // Internally, to next-to-last offset is multiplied by 16 and the last
72
+ // offset is relative instead of absolute.
73
+ // Look up property of one UTF-8 character and advance over it
74
+ // Return 0 if input length is zero
75
+ // Return 0 and advance one byte if input is ill-formed
76
+ uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
77
+ const uint8** src,
78
+ int* srclen) {
79
+ if (*srclen <= 0) {
80
+ return 0;
81
+ }
82
+
83
+ const uint8* lsrc = *src;
84
+ const uint8* Tbl_0 = &st->state_table[st->state0];
85
+ const uint8* Tbl = Tbl_0;
86
+ int e;
87
+ int eshift = st->entry_shift;
88
+
89
+ // Short series of tests faster than switch, optimizes 7-bit ASCII
90
+ unsigned char c = lsrc[0];
91
+ if (static_cast<signed char>(c) >= 0) { // one byte
92
+ e = Tbl[c];
93
+ *src += 1;
94
+ *srclen -= 1;
95
+ } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
96
+ e = Tbl[c];
97
+ Tbl = &Tbl_0[e << eshift];
98
+ e = Tbl[lsrc[1]];
99
+ *src += 2;
100
+ *srclen -= 2;
101
+ } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
102
+ e = Tbl[c];
103
+ Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
104
+ e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
105
+ Tbl = &Tbl[e << eshift]; // Relative +/-
106
+ e = Tbl[lsrc[2]];
107
+ *src += 3;
108
+ *srclen -= 3;
109
+ }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
110
+ e = Tbl[c];
111
+ Tbl = &Tbl_0[e << eshift];
112
+ e = Tbl[lsrc[1]];
113
+ Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
114
+ e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
115
+ Tbl = &Tbl[e << eshift]; // Relative +/-
116
+ e = Tbl[lsrc[3]];
117
+ *src += 4;
118
+ *srclen -= 4;
119
+ } else { // Ill-formed
120
+ e = 0;
121
+ *src += 1;
122
+ *srclen -= 1;
123
+ }
124
+ return e;
125
+ }
126
+
127
+ // Scan a UTF-8 stringpiece based on a state table.
128
+ // Always scan complete UTF-8 characters
129
+ // Set number of bytes scanned. Return reason for exiting
130
+ int UTF8GenericScan(const UTF8ScanObj* st,
131
+ const uint8* str,
132
+ const int len,
133
+ int* bytes_consumed) {
134
+ int eshift = st->entry_shift; // 6 (space optimized) or 8
135
+ // int nEntries = (1 << eshift); // 64 or 256 entries per state
136
+
137
+ const uint8* isrc = str;
138
+ //reinterpret_cast<const uint8*>(str.data());
139
+ const uint8* src = isrc;
140
+ //const int len = str.length();
141
+ const uint8* srclimit = isrc + len;
142
+ const uint8* srclimit8 = srclimit - 7;
143
+ *bytes_consumed = 0;
144
+ if (len == 0) return kExitOK;
145
+
146
+ const uint8* Tbl_0 = &st->state_table[st->state0];
147
+
148
+ DoAgain:
149
+ // Do state-table scan
150
+ int e = 0;
151
+ uint8 c;
152
+
153
+ // Do fast for groups of 8 identity bytes.
154
+ // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
155
+ // including slowing slightly on cr/lf/ht
156
+ //----------------------------
157
+ const uint8* Tbl2 = &st->fast_state[0];
158
+ uint32 losub = st->losub;
159
+ uint32 hiadd = st->hiadd;
160
+ while (src < srclimit8) {
161
+ uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
162
+ uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
163
+ src += 8;
164
+ // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
165
+ uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
166
+ (s4567 - losub) | (s4567 + hiadd);
167
+ if ((temp & 0x80808080) != 0) {
168
+ // We typically end up here on cr/lf/ht; src was incremented
169
+ int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
170
+ (Tbl2[src[-6]] | Tbl2[src[-5]]);
171
+ if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
172
+ e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
173
+ (Tbl2[src[-2]] | Tbl2[src[-1]]);
174
+ if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
175
+ // Else OK, go around again
176
+ }
177
+ }
178
+ //----------------------------
179
+
180
+ // Byte-at-a-time scan
181
+ //----------------------------
182
+ const uint8* Tbl = Tbl_0;
183
+ while (src < srclimit) {
184
+ c = *src;
185
+ e = Tbl[c];
186
+ src++;
187
+ if (e >= kExitIllegalStructure) {break;}
188
+ Tbl = &Tbl_0[e << eshift];
189
+ }
190
+ //----------------------------
191
+
192
+
193
+ // Exit posibilities:
194
+ // Some exit code, !state0, back up over last char
195
+ // Some exit code, state0, back up one byte exactly
196
+ // source consumed, !state0, back up over partial char
197
+ // source consumed, state0, exit OK
198
+ // For illegal byte in state0, avoid backup up over PREVIOUS char
199
+ // For truncated last char, back up to beginning of it
200
+
201
+ if (e >= kExitIllegalStructure) {
202
+ // Back up over exactly one byte of rejected/illegal UTF-8 character
203
+ src--;
204
+ // Back up more if needed
205
+ if (!InStateZero(st, Tbl)) {
206
+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
207
+ }
208
+ } else if (!InStateZero(st, Tbl)) {
209
+ // Back up over truncated UTF-8 character
210
+ e = kExitIllegalStructure;
211
+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
212
+ } else {
213
+ // Normal termination, source fully consumed
214
+ e = kExitOK;
215
+ }
216
+
217
+ if (e == kExitDoAgain) {
218
+ // Loop back up to the fast scan
219
+ goto DoAgain;
220
+ }
221
+
222
+ *bytes_consumed = src - isrc;
223
+ return e;
224
+ }