language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,10 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
7
+
8
+ #include "base/basictypes.h"
9
+
10
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_BASICTYPES_H_
@@ -0,0 +1,28 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ #include "base/commandlineflags.h"
11
+
12
+ #else
13
+
14
// Minimal stand-ins for the command-line flag macros: every flag becomes a
// constant named FLAGS_<name> initialised to |default_value|. The |comment|
// argument is accepted for source compatibility and ignored.
#undef DEFINE_bool
#define DEFINE_bool(name, default_value, comment) \
    const bool FLAGS_##name = default_value;
#undef DEFINE_int32
#define DEFINE_int32(name, default_value, comment) \
    const int32 FLAGS_##name = default_value;

// Declarations carry the same const qualification as the definitions above;
// a non-const extern declaration followed by a const definition of the same
// name in one translation unit is a conflicting declaration.
#undef DECLARE_bool
#define DECLARE_bool(name) extern const bool FLAGS_##name;
#undef DECLARE_int32
#define DECLARE_int32(name) extern const int32 FLAGS_##name;
25
+
26
+ #endif
27
+
28
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_COMMANDLINEFLAGS_H_
@@ -0,0 +1,18 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ #include "base/google.h"
11
+
12
+ #else
13
+
14
+ // Include nothing
15
+
16
+ #endif
17
+
18
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_GOOGLE_H_
@@ -0,0 +1,13 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
7
+
8
+ // Src points to '&'
9
+ // Writes entity value to dst. Returns take(src), put(dst) byte counts
10
+ void EntityToBuffer(const char* src, int len, char* dst,
11
+ int* tlen, int* plen);
12
+
13
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_HTMLUTILS_H_
@@ -0,0 +1,32 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Author: alekseys@google.com (Aleksey Shlyapnikov)
6
+
7
+ // This code is not actually used; it was copied here for reference only.
8
+ // See cld_htmlutils_windows.cc for Windows version of this code.
9
+
10
+ #include "cld/encodings/compact_lang_det/win/cld_htmlutils.h"
11
+
12
+ #include "cld/third_party/utf/utf.h" // for runetochar
13
+ #include "cld/webutil/html/htmlutils.h" // for ReadEntity
14
+
15
+ // Copied from getonescriptspan.cc
16
+
17
+ // Src points to '&'
18
+ // Writes entity value to dst. Returns take(src), put(dst) byte counts
19
+ void EntityToBuffer(const char* src, int len, char* dst,
20
+ int* tlen, int* plen) {
21
+ char32 entval = HtmlUtils::ReadEntity(src, len, tlen);
22
+ // ReadEntity does this already: entval = FixUnicodeValue(entval);
23
+
24
+ if (entval > 0) {
25
+ *plen = runetochar(dst, &entval);
26
+ } else {
27
+ // Illegal entity; ignore the '&'
28
+ *tlen = 1;
29
+ *plen = 0;
30
+ }
31
+ // fprintf(stderr,"t%d p%d]\n", *tlen, *plen);
32
+ }
@@ -0,0 +1,29 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_htmlutils.h"
6
+
7
// Src points to '&'
// Skips over one HTML entity without decoding it and writes a single space
// to dst in its place. Returns take(src), put(dst) byte counts:
//   *tlen - bytes consumed from src: the '&', everything up to and including
//           the next ';', never more than len and never past a NUL byte.
//           Always at least 1 so that callers make forward progress.
//   *plen - always 1 (the space written to dst).
void EntityToBuffer(const char* src, int len, char* dst,
                    int* tlen, int* plen) {
  // On Windows we do not have to do anything, browser expands HTML entities
  // for us, so text we're retrieving from it is ready for translation as it is.
  // But:

  // This is a temporary solution to let us continue the development without
  // having a real DOM text scraping in place. For now the full HTML is fed
  // to CLD for language detection and just ignoring entities is good enough
  // for testing. Later entities will be expanded by browser itself.

  // Skip entity in the source, staying inside the len bytes the caller
  // supplied.
  int taken = 1;  // The leading '&'.
  while (taken < len) {
    const char c = src[taken];
    if (c == '\0') {
      break;      // Do not count (or step over) the terminator.
    }
    ++taken;
    if (c == ';') {
      break;      // The ';' is part of the entity and is consumed.
    }
  }
  *tlen = taken;

  // Report a bogus entity (space).
  *dst = ' ';
  *plen = 1;
}
@@ -0,0 +1,21 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ #include "base/logging.h"
11
+
12
+ #else
13
+
14
+ #undef CHECK
15
+ #define CHECK(expr)
16
+ #undef DCHECK
17
+ #define DCHECK(expr)
18
+
19
+ #endif
20
+
21
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_LOGGING_H_
@@ -0,0 +1,19 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
7
+
8
+ #include "base/macros.h"
9
+
10
// Checks for Win32 result and if it indicates failure, returns it.
// |cmd| is evaluated exactly once; a non-zero DWORD result is returned from
// the enclosing function, a zero result falls through.
// The expansion deliberately has no trailing semicolon, so an invocation
// written as a statement, RETURN_IF_ERROR(x);, can be used as the body of an
// if/else without breaking the else branch.
#define RETURN_IF_ERROR(cmd) \
  do { \
    DWORD result_ = (cmd); \
    if (0 != result_) \
      return result_; \
  } while (0)
18
+
19
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_MACROS_H_
@@ -0,0 +1,26 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ //#include "cld/base/strtoint.h"
11
+
12
+ #else
13
+
14
+ #include <stdlib.h>
15
+
16
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
17
+
18
+ // This implementation is not as good as the one in base/strtoint.h,
19
+ // but it's sufficient for our purposes.
20
+ inline int32 strto32(const char *nptr, char **endptr, int base) {
21
+ return static_cast<int32>(strtol(nptr, endptr, base));
22
+ }
23
+
24
+ #endif
25
+
26
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_STRTOINT_H_
@@ -0,0 +1,84 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_unicodetext.h"
6
+
7
+ #include <string>
8
+ #include <vector> // to compile bar/common/component.h
9
+
10
+ #include "encodings/compact_lang_det/compact_lang_det.h"
11
+ #include "base/string_util.h"
12
+ #include "unicode/normlzr.h"
13
+ #include "unicode/unistr.h"
14
+ #include "unicode/ustring.h"
15
+
16
+ /*
17
+ std::string NormalizeText(const UChar* text) {
18
+ // To avoid a copy, use the read-only aliasing ctor.
19
+ icu::UnicodeString source(1, text, -1);
20
+ icu::UnicodeString normalized;
21
+ UErrorCode status = U_ZERO_ERROR;
22
+ icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
23
+ if (U_FAILURE(status))
24
+ return std::string();
25
+ normalized.toLower();
26
+ std::string utf8;
27
+ // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
28
+ // for most web pages) and does pre-flighting followed by malloc for larger
29
+ // strings. We have to switch to obtaining the buffer with the maximum size
30
+ // (UTF-16 length * 3) without pre-flighting if necessary.
31
+ return normalized.toUTF8String(utf8);
32
+ }
33
+ */
34
+
35
+
36
+ // Detects a language of the UTF-16 encoded zero-terminated text.
37
+ // Returns: Language enum.
38
+ Language DetectLanguageOfUnicodeText(
39
+ const CompactLangDet::DetectionTables* detection_tables,
40
+ const UChar* text, bool is_plain_text,
41
+ bool* is_reliable, int* num_languages,
42
+ int* error_code, int* text_bytes) {
43
+ if (!text || !num_languages)
44
+ return NUM_LANGUAGES;
45
+ // Normalize text to NFC, lowercase and convert to UTF-8.
46
+ std::string utf8_encoded = NormalizeText(text);
47
+ if (utf8_encoded.empty())
48
+ return NUM_LANGUAGES;
49
+
50
+ // Engage core CLD library language detection.
51
+ Language language3[3] = {
52
+ UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
53
+ };
54
+ int percent3[3] = { 0, 0, 0 };
55
+ int text_bytes_tmp = 0;
56
+ // We ignore return value here due to the problem described in bug 1800161.
57
+ // For example, translate.google.com was detected as Indonesian. It happened
58
+ // due to the heuristic in CLD, which ignores English as a top language
59
+ // in the presence of another reliably detected language.
60
+ // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
61
+ // language3 array is always set according to the detection results and
62
+ // is not affected by this heuristic.
63
+ CompactLangDet::DetectLanguageSummary(detection_tables,
64
+ utf8_encoded.c_str(),
65
+ utf8_encoded.length(),
66
+ is_plain_text, language3, percent3,
67
+ &text_bytes_tmp, is_reliable);
68
+
69
+ // Calcualte a number of languages detected in more than 20% of the text.
70
+ const int kMinTextPercentToCountLanguage = 20;
71
+ *num_languages = 0;
72
+ if (text_bytes)
73
+ *text_bytes = text_bytes_tmp;
74
+ COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
75
+ language3_and_percent3_should_be_of_the_same_size);
76
+ for (int i = 0; i < arraysize(language3); ++i) {
77
+ if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
78
+ percent3[i] >= kMinTextPercentToCountLanguage) {
79
+ ++*num_languages;
80
+ }
81
+ }
82
+
83
+ return language3[0];
84
+ }
@@ -0,0 +1,40 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
7
+
8
+ #include "languages/public/languages.h"
9
+ #include "unicode/utypes.h"
10
+
11
+ namespace CompactLangDet {
12
+ struct DetectionTables;
13
+ } // namespace CompactLangDet
14
+
15
+ // Detects a language of the UTF-16 encoded zero-terminated text.
16
+ // [in] detection_tables - internal CLD data tables (see compact_lang_det.h).
17
+ // Can be NULL, in this case CLD will fall back to builtin static tables.
18
+ // [in] text - UTF-16 encoded text to detect a language of.
19
+ // [in] is_plain_text - true if plain text, false otherwise (e.g. HTML).
20
+ // [out] is_reliable - true, if returned language was detected reliably.
21
+ // See compact_lang_det.h for details.
22
+ // [out] num_languages - set to the number of languages detected on the page.
23
+ // Language counts only if it's detected in more than 20% of the text.
24
+ // [out, optional] error_code - set to 0 in case of success, Windows
25
+ // GetLastError() code otherwise. Pass NULL, if not interested in errors.
26
+ // See encodings/compact_lang_det/compact_lang_det.h,
27
+ // CompactLangDet::DetectLanguage() description for other input parameters
28
+ // description.
29
+ // Returns: Language enum.
30
+ // Returns NUM_LANGUAGES in case of any error.
31
+ // See googleclient/languages/internal/languages.cc
32
+ // for details.
33
+ Language DetectLanguageOfUnicodeText(
34
+ const CompactLangDet::DetectionTables* detection_tables,
35
+ const UChar* text, bool is_plain_text,
36
+ bool* is_reliable, int* num_languages,
37
+ int* error_code, int* text_bytes);
38
+
39
+
40
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
@@ -0,0 +1,15 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
7
+
8
+ namespace cld_UniLib {
9
+
10
+ // Return length of a single UTF-8 source character
11
+ int OneCharLen(const char* src);
12
+
13
+ } // namespace cld_UniLib
14
+
15
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNILIB_H_
@@ -0,0 +1,18 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // This code is not actually used; it was copied here for reference only.
6
+ // See cld_unilib_windows.cc for the Windows version of this code.
7
+
8
+ #include "i18n/encodings/compact_lang_det/cld_unilib.h"
9
+
10
+ #include "util/utf8/unilib.h"
11
+
12
+ namespace cld_UniLib {
13
+
14
+ int OneCharLen(const char* src) {
15
+ return UniLib::OneCharLen(src);
16
+ }
17
+
18
+ } // namespace cld_UniLib
@@ -0,0 +1,29 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_unilib.h"
6
+
7
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
8
+
9
namespace cld_UniLib {

// Table of UTF-8 character lengths, based on first byte.
// Bytes 0x00-0xBF map to 1, 0xC0-0xDF to 2, 0xE0-0xEF to 3 and 0xF0-0xFF
// to 4.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00 - 0x1F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x20 - 0x3F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x40 - 0x5F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x60 - 0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

  // 0x80 - 0x9F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xA0 - 0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0 - 0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0 - 0xEF, then 0xF0 - 0xFF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};

// Return length of a single UTF-8 source character.
// Only the first byte at |src| is read; it is treated as unsigned and used
// as the index into kUTF8LenTbl.
int OneCharLen(const char* src) {
  const unsigned char lead = static_cast<unsigned char>(*src);
  return kUTF8LenTbl[lead];
}

}  // namespace cld_UniLib
@@ -0,0 +1,24 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ //#include "third_party/utf/utf.h"
11
+
12
+ #else
13
+
14
+ enum {
15
+ UTFmax = 4, // maximum bytes per rune
16
+ Runesync = 0x80, // cannot represent part of a UTF sequence (<)
17
+ Runeself = 0x80, // rune and UTF sequences are the same (<)
18
+ Runeerror = 0xFFFD, // decoding error in UTF
19
+ Runemax = 0x10FFFF, // maximum rune value
20
+ };
21
+
22
+ #endif
23
+
24
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF_H_
@@ -0,0 +1,224 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
6
+
7
+ // Return true if current Tbl pointer is within state0 range
8
+ // Note that unsigned compare checks both ends of range simultaneously
9
+ static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
10
+ const uint8* Tbl0 = &st->state_table[st->state0];
11
+ return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
12
+ }
13
+
14
+
15
+ // Look up property of one UTF-8 character and advance over it
16
+ // Return 0 if input length is zero
17
+ // Return 0 and advance one byte if input is ill-formed
18
+ uint8 UTF8GenericProperty(const UTF8PropObj* st,
19
+ const uint8** src,
20
+ int* srclen) {
21
+ if (*srclen <= 0) {
22
+ return 0;
23
+ }
24
+
25
+ const uint8* lsrc = *src;
26
+ const uint8* Tbl_0 = &st->state_table[st->state0];
27
+ const uint8* Tbl = Tbl_0;
28
+ int e;
29
+ int eshift = st->entry_shift;
30
+
31
+ // Short series of tests faster than switch, optimizes 7-bit ASCII
32
+ unsigned char c = lsrc[0];
33
+ if (static_cast<signed char>(c) >= 0) { // one byte
34
+ e = Tbl[c];
35
+ *src += 1;
36
+ *srclen -= 1;
37
+ } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
38
+ e = Tbl[c];
39
+ Tbl = &Tbl_0[e << eshift];
40
+ e = Tbl[lsrc[1]];
41
+ *src += 2;
42
+ *srclen -= 2;
43
+ } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
44
+ e = Tbl[c];
45
+ Tbl = &Tbl_0[e << eshift];
46
+ e = Tbl[lsrc[1]];
47
+ Tbl = &Tbl_0[e << eshift];
48
+ e = Tbl[lsrc[2]];
49
+ *src += 3;
50
+ *srclen -= 3;
51
+ }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
52
+ e = Tbl[c];
53
+ Tbl = &Tbl_0[e << eshift];
54
+ e = Tbl[lsrc[1]];
55
+ Tbl = &Tbl_0[e << eshift];
56
+ e = Tbl[lsrc[2]];
57
+ Tbl = &Tbl_0[e << eshift];
58
+ e = Tbl[lsrc[3]];
59
+ *src += 4;
60
+ *srclen -= 4;
61
+ } else { // Ill-formed
62
+ e = 0;
63
+ *src += 1;
64
+ *srclen -= 1;
65
+ }
66
+ return e;
67
+ }
68
+
69
+ // BigOneByte versions are needed for tables > 240 states, but most
70
+ // won't need the TwoByte versions.
71
+ // Internally, to next-to-last offset is multiplied by 16 and the last
72
+ // offset is relative instead of absolute.
73
+ // Look up property of one UTF-8 character and advance over it
74
+ // Return 0 if input length is zero
75
+ // Return 0 and advance one byte if input is ill-formed
76
+ uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
77
+ const uint8** src,
78
+ int* srclen) {
79
+ if (*srclen <= 0) {
80
+ return 0;
81
+ }
82
+
83
+ const uint8* lsrc = *src;
84
+ const uint8* Tbl_0 = &st->state_table[st->state0];
85
+ const uint8* Tbl = Tbl_0;
86
+ int e;
87
+ int eshift = st->entry_shift;
88
+
89
+ // Short series of tests faster than switch, optimizes 7-bit ASCII
90
+ unsigned char c = lsrc[0];
91
+ if (static_cast<signed char>(c) >= 0) { // one byte
92
+ e = Tbl[c];
93
+ *src += 1;
94
+ *srclen -= 1;
95
+ } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
96
+ e = Tbl[c];
97
+ Tbl = &Tbl_0[e << eshift];
98
+ e = Tbl[lsrc[1]];
99
+ *src += 2;
100
+ *srclen -= 2;
101
+ } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
102
+ e = Tbl[c];
103
+ Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
104
+ e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
105
+ Tbl = &Tbl[e << eshift]; // Relative +/-
106
+ e = Tbl[lsrc[2]];
107
+ *src += 3;
108
+ *srclen -= 3;
109
+ }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
110
+ e = Tbl[c];
111
+ Tbl = &Tbl_0[e << eshift];
112
+ e = Tbl[lsrc[1]];
113
+ Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
114
+ e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
115
+ Tbl = &Tbl[e << eshift]; // Relative +/-
116
+ e = Tbl[lsrc[3]];
117
+ *src += 4;
118
+ *srclen -= 4;
119
+ } else { // Ill-formed
120
+ e = 0;
121
+ *src += 1;
122
+ *srclen -= 1;
123
+ }
124
+ return e;
125
+ }
126
+
127
+ // Scan a UTF-8 stringpiece based on a state table.
128
+ // Always scan complete UTF-8 characters
129
+ // Set number of bytes scanned. Return reason for exiting
130
+ int UTF8GenericScan(const UTF8ScanObj* st,
131
+ const uint8* str,
132
+ const int len,
133
+ int* bytes_consumed) {
134
+ int eshift = st->entry_shift; // 6 (space optimized) or 8
135
+ // int nEntries = (1 << eshift); // 64 or 256 entries per state
136
+
137
+ const uint8* isrc = str;
138
+ //reinterpret_cast<const uint8*>(str.data());
139
+ const uint8* src = isrc;
140
+ //const int len = str.length();
141
+ const uint8* srclimit = isrc + len;
142
+ const uint8* srclimit8 = srclimit - 7;
143
+ *bytes_consumed = 0;
144
+ if (len == 0) return kExitOK;
145
+
146
+ const uint8* Tbl_0 = &st->state_table[st->state0];
147
+
148
+ DoAgain:
149
+ // Do state-table scan
150
+ int e = 0;
151
+ uint8 c;
152
+
153
+ // Do fast for groups of 8 identity bytes.
154
+ // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
155
+ // including slowing slightly on cr/lf/ht
156
+ //----------------------------
157
+ const uint8* Tbl2 = &st->fast_state[0];
158
+ uint32 losub = st->losub;
159
+ uint32 hiadd = st->hiadd;
160
+ while (src < srclimit8) {
161
+ uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
162
+ uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
163
+ src += 8;
164
+ // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
165
+ uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
166
+ (s4567 - losub) | (s4567 + hiadd);
167
+ if ((temp & 0x80808080) != 0) {
168
+ // We typically end up here on cr/lf/ht; src was incremented
169
+ int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
170
+ (Tbl2[src[-6]] | Tbl2[src[-5]]);
171
+ if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
172
+ e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
173
+ (Tbl2[src[-2]] | Tbl2[src[-1]]);
174
+ if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
175
+ // Else OK, go around again
176
+ }
177
+ }
178
+ //----------------------------
179
+
180
+ // Byte-at-a-time scan
181
+ //----------------------------
182
+ const uint8* Tbl = Tbl_0;
183
+ while (src < srclimit) {
184
+ c = *src;
185
+ e = Tbl[c];
186
+ src++;
187
+ if (e >= kExitIllegalStructure) {break;}
188
+ Tbl = &Tbl_0[e << eshift];
189
+ }
190
+ //----------------------------
191
+
192
+
193
+ // Exit posibilities:
194
+ // Some exit code, !state0, back up over last char
195
+ // Some exit code, state0, back up one byte exactly
196
+ // source consumed, !state0, back up over partial char
197
+ // source consumed, state0, exit OK
198
+ // For illegal byte in state0, avoid backup up over PREVIOUS char
199
+ // For truncated last char, back up to beginning of it
200
+
201
+ if (e >= kExitIllegalStructure) {
202
+ // Back up over exactly one byte of rejected/illegal UTF-8 character
203
+ src--;
204
+ // Back up more if needed
205
+ if (!InStateZero(st, Tbl)) {
206
+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
207
+ }
208
+ } else if (!InStateZero(st, Tbl)) {
209
+ // Back up over truncated UTF-8 character
210
+ e = kExitIllegalStructure;
211
+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
212
+ } else {
213
+ // Normal termination, source fully consumed
214
+ e = kExitOK;
215
+ }
216
+
217
+ if (e == kExitDoAgain) {
218
+ // Loop back up to the fast scan
219
+ goto DoAgain;
220
+ }
221
+
222
+ *bytes_consumed = src - isrc;
223
+ return e;
224
+ }