language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,141 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ #include "util/utf8/utf8statetable.h"
11
+
12
+ #else
13
+
14
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
15
+
16
+ // These four-byte entries compactly encode how many bytes 0..255 to delete
17
+ // in making a string replacement, how many bytes to add 0..255, and the offset
18
+ // 0..64k-1 of the replacement string in remap_string.
19
+ struct RemapEntry {
20
+ uint8 delete_bytes;
21
+ uint8 add_bytes;
22
+ uint16 bytes_offset;
23
+ };
24
+
25
+ // Exit type codes for state tables. All but the first get stuffed into
26
+ // signed one-byte entries. The first is only generated by executable code.
27
+ // To distinguish from next-state entries, these must be contiguous and
28
+ // all <= kExitNone
29
+ typedef enum {
30
+ kExitDstSpaceFull = 239,
31
+ kExitIllegalStructure, // 240
32
+ kExitOK, // 241
33
+ kExitReject, // ...
34
+ kExitReplace1,
35
+ kExitReplace2,
36
+ kExitReplace3,
37
+ kExitReplace21,
38
+ kExitReplace31,
39
+ kExitReplace32,
40
+ kExitReplaceOffset1,
41
+ kExitReplaceOffset2,
42
+ kExitReplace1S0,
43
+ kExitSpecial,
44
+ kExitDoAgain,
45
+ kExitRejectAlt,
46
+ kExitNone // 255
47
+ } ExitReason;
48
+
49
+ typedef enum {
50
+ kExitDstSpaceFull_2 = -32769,
51
+ kExitIllegalStructure_2, // -32768
52
+ kExitOK_2, // -32767
53
+ kExitReject_2, // ...
54
+ kExitReplace1_2,
55
+ kExitReplace2_2,
56
+ kExitReplace3_2,
57
+ kExitReplace21_2,
58
+ kExitReplace31_2,
59
+ kExitReplace32_2,
60
+ kExitReplaceOffset1_2,
61
+ kExitReplaceOffset2_2,
62
+ kExitReplace1S0_2,
63
+ kExitSpecial_2,
64
+ kExitDoAgain_2,
65
+ kExitRejectAlt_2,
66
+ kExitNone_2 // -32753
67
+ } ExitReason_2;
68
+
69
+ // This struct represents one entire state table. The three initialized byte
70
+ // areas are state_table, remap_base, and remap_string. state0 and state0_size
71
+ // give the byte offset and length within state_table of the initial state --
72
+ // table lookups are expected to start and end in this state, but for
73
+ // truncated UTF-8 strings, may end in a different state. These allow a quick
74
+ // test for that condition. entry_shift is 8 for tables subscripted by a full
75
+ // byte value and 6 for space-optimized tables subscripted by only six
76
+ // significant bits in UTF-8 continuation bytes.
77
+ typedef struct {
78
+ const uint32 state0;
79
+ const uint32 state0_size;
80
+ const uint32 total_size;
81
+ const int max_expand;
82
+ const int entry_shift;
83
+ const int bytes_per_entry;
84
+ const uint32 losub;
85
+ const uint32 hiadd;
86
+ const uint8* state_table;
87
+ const RemapEntry* remap_base;
88
+ const uint8* remap_string;
89
+ const uint8* fast_state;
90
+ } UTF8StateMachineObj;
91
+
92
+ // Near-duplicate declaration for tables with two-byte entries
93
+ typedef struct {
94
+ const uint32 state0;
95
+ const uint32 state0_size;
96
+ const uint32 total_size;
97
+ const int max_expand;
98
+ const int entry_shift;
99
+ const int bytes_per_entry;
100
+ const uint32 losub;
101
+ const uint32 hiadd;
102
+ const signed short* state_table;
103
+ const RemapEntry* remap_base;
104
+ const uint8* remap_string;
105
+ const uint8* fast_state;
106
+ } UTF8StateMachineObj_2;
107
+
108
+
109
+ typedef UTF8StateMachineObj UTF8PropObj;
110
+ typedef UTF8StateMachineObj UTF8ScanObj;
111
+ typedef UTF8StateMachineObj_2 UTF8PropObj_2;
112
+
113
+
114
+ // Look up property of one UTF-8 character and advance over it
115
+ // Return 0 if input length is zero
116
+ // Return 0 and advance one byte if input is ill-formed
117
+ uint8 UTF8GenericProperty(const UTF8PropObj* st,
118
+ const uint8** src,
119
+ int* srclen);
120
+
121
+ // BigOneByte versions are needed for tables > 240 states, but most
122
+ // won't need the TwoByte versions.
123
+
124
+ // Look up property of one UTF-8 character and advance over it
125
+ // Return 0 if input length is zero
126
+ // Return 0 and advance one byte if input is ill-formed
127
+ uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
128
+ const uint8** src,
129
+ int* srclen);
130
+
131
+ // Scan a UTF-8 stringpiece based on a state table.
132
+ // Always scan complete UTF-8 characters
133
+ // Set number of bytes scanned. Return reason for exiting
134
+ int UTF8GenericScan(const UTF8ScanObj* st,
135
+ const uint8* str,
136
+ const int len,
137
+ int* bytes_consumed);
138
+
139
+ #endif
140
+
141
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
@@ -0,0 +1,22 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
7
+
8
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
9
+
10
+ namespace cld {
11
+
12
+ // Scan a UTF-8 stringpiece based on a state table.
13
+ // Always scan complete UTF-8 characters
14
+ // Set number of bytes scanned. Return reason for exiting
15
+ int UTF8GenericScan(const UTF8ScanObj* st,
16
+ const char* src,
17
+ int len,
18
+ int* bytes_consumed);
19
+
20
+ } // namespace cld
21
+
22
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
@@ -0,0 +1,18 @@
1
+ // Copyright 2009 Google Inc. All Rights Reserved.
2
+ // Author: alekseys@google.com (Aleksey Shlyapnikov)
3
+
4
+ // This code is not actually used; it was copied here for reference only.
5
+ // See cld_utf8utils_windows.cc for the Windows version of this code.
6
+
7
+ #include "cld/encodings/compact_lang_det/win/cld_utf8utils.h"
8
+
9
+ #include "cld/util/utf8/utf8statetable.h"
10
+
11
+ namespace cld {
12
+
13
+ int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
14
+ int* bytes_consumed) {
15
+ return ::UTF8GenericScan(st, StringPiece(src, len), bytes_consumed);
16
+ }
17
+
18
+ } // namespace cld
@@ -0,0 +1,17 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_utf8utils.h"
6
+
7
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
8
+
9
+ namespace cld {
10
+
11
+ int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
12
+ int* bytes_consumed) {
13
+ return ::UTF8GenericScan(st, reinterpret_cast<const uint8*>(src), len,
14
+ bytes_consumed);
15
+ }
16
+
17
+ } // namespace cld
@@ -0,0 +1,172 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/normalizedunicodetext.h"
6
+
7
+ #include <tchar.h>
8
+ #include <windows.h>
9
+ #include <winnls.h>
10
+
11
+ #include "encodings/compact_lang_det/win/cld_scopedptr.h"
12
+
13
+
14
+ namespace {
15
+
16
+ // Function prototypes copied from MSDN.
17
+ typedef BOOL (WINAPI *IsNormalizedStringFunction)(NORM_FORM NormForm,
18
+ LPCWSTR lpSrcString,
19
+ int cwSrcLength);
20
+ typedef int (WINAPI *NormalizeStringFunction)(NORM_FORM NormForm,
21
+ LPCWSTR lpSrcString,
22
+ int cwSrcLength,
23
+ LPWSTR lpDstString,
24
+ int cwDstLength);
25
+
26
+ // A class that provides access to Normaliz.dll functions.
27
+ // The new normalization API implemented in Normaliz.dll is available only
28
+ // from Windows XP SP2 onward, which is why we have to bind to it dynamically.
29
+ class NormalizationAPI {
30
+ public:
31
+ // Creates a fully initialized NormalizationAPI object.
32
+ // Loads DLL and binds all referenced functions.
33
+ NormalizationAPI()
34
+ : library_(_T("Normaliz.dll")) {
35
+ if (library_.IsValid()) {
36
+ is_normalized_string_.Bind(library_.handle(), "IsNormalizedString");
37
+ normalize_string_.Bind(library_.handle(), "NormalizeString");
38
+ }
39
+ }
40
+
41
+ // Proxy functions for the ones loaded from DLL.
42
+ BOOL IsNormalizedString(NORM_FORM NormForm, LPCWSTR lpSrcString,
43
+ int cwSrcLength) {
44
+ if (!is_normalized_string_.IsValid())
45
+ return FALSE;
46
+ return is_normalized_string_.function()(NormForm, lpSrcString, cwSrcLength);
47
+ }
48
+ int NormalizeString(NORM_FORM NormForm, LPCWSTR lpSrcString, int cwSrcLength,
49
+ LPWSTR lpDstString, int cwDstLength) {
50
+ if (!normalize_string_.IsValid()) {
51
+ ::SetLastError(ERROR_INVALID_FUNCTION);
52
+ return 0;
53
+ }
54
+ return normalize_string_.function()(NormForm, lpSrcString, cwSrcLength,
55
+ lpDstString, cwDstLength);
56
+ }
57
+
58
+ // Returns true if all functions were bound successfully.
59
+ // This implies that library_ itself was loaded successfully.
60
+ bool IsValid() const {
61
+ return is_normalized_string_.IsValid() && normalize_string_.IsValid();
62
+ }
63
+
64
+ private:
65
+ // Holds a handle to loaded Normaliz.dll.
66
+ ScopedLibrary library_;
67
+ // Pointers to the functions loaded from Normaliz.dll.
68
+ FunctionFromDll<IsNormalizedStringFunction> is_normalized_string_;
69
+ FunctionFromDll<NormalizeStringFunction> normalize_string_;
70
+
71
+ DISALLOW_COPY_AND_ASSIGN(NormalizationAPI);
72
+ };
73
+
74
+ static NormalizationAPI normalization_api;
75
+
76
+ } // namespace
77
+
78
+
79
+ // NormalizedUnicodeText
80
+
81
+ NormalizedUnicodeText::NormalizedUnicodeText()
82
+ : normalized_text_(NULL) {
83
+ }
84
+
85
+
86
+ DWORD NormalizedUnicodeText::Normalize(NORM_FORM normalization_form,
87
+ const WCHAR* text) {
88
+ DWORD result = 0;
89
+ normalized_text_ = TryToNormalizeText(normalization_form, text, &result);
90
+ return result;
91
+ }
92
+
93
+
94
+ const WCHAR* NormalizedUnicodeText::TryToNormalizeText(
95
+ NORM_FORM normalization_form, const WCHAR* text, DWORD *error_code) {
96
+ if (!text) {
97
+ text_.reset();
98
+ return text;
99
+ }
100
+ _ASSERT(NULL != error_code);
101
+ if (!error_code)
102
+ return text;
103
+
104
+ if (!normalization_api.IsValid()) {
105
+ // Fall back to the previous version of normalization API.
106
+ int folded_text_size = ::FoldStringW(MAP_PRECOMPOSED, text, -1, NULL, 0);
107
+ if (!folded_text_size) {
108
+ *error_code = ::GetLastError();
109
+ return text;
110
+ }
111
+
112
+ text_.reset(new WCHAR[folded_text_size]);
113
+ if (!text_.get()) {
114
+ *error_code = ERROR_OUTOFMEMORY;
115
+ return text;
116
+ }
117
+
118
+ int folding_result =
119
+ ::FoldStringW(MAP_PRECOMPOSED, text, -1, text_.get(), folded_text_size);
120
+ if (!folding_result) {
121
+ *error_code = ::GetLastError();
122
+ text_.reset();
123
+ return text;
124
+ }
125
+
126
+ return text_.get();
127
+ }
128
+
129
+ // No need to allocate anything when text is already normalized.
130
+ if (normalization_api.IsNormalizedString(normalization_form, text, -1))
131
+ return text;
132
+
133
+ // Get the first approximation for the buffer size required to store
134
+ // normalized text.
135
+ int normalized_text_size_guess =
136
+ normalization_api.NormalizeString(normalization_form, text, -1, NULL, 0);
137
+
138
+ while (normalized_text_size_guess > 0) {
139
+ text_.reset(new WCHAR[normalized_text_size_guess]);
140
+ if (!text_.get()) {
141
+ *error_code = ERROR_OUTOFMEMORY;
142
+ break;
143
+ }
144
+
145
+ int normalized_text_size =
146
+ normalization_api.NormalizeString(normalization_form, text, -1,
147
+ text_.get(),
148
+ normalized_text_size_guess);
149
+
150
+ if (normalized_text_size > 0) {
151
+ // Text was successfully converted.
152
+ return text_.get();
153
+ }
154
+
155
+ if (ERROR_INSUFFICIENT_BUFFER != ::GetLastError()) {
156
+ *error_code = ::GetLastError();
157
+ // Text cannot be normalized, use the original.
158
+ // By the way, ERROR_SUCCESS is a puzzling case.
159
+ // MSDN says 'The action completed successfully but yielded no results'.
160
+ // Does this mean that output buffer was not changed?
161
+ // Anyway, just in case, also return the original text.
162
+ break;
163
+ }
164
+
165
+ // Try again with the corrected buffer size.
166
+ normalized_text_size_guess = -normalized_text_size;
167
+ }
168
+
169
+ // Use the original text in case of any problem with normalization.
170
+ text_.reset();
171
+ return text;
172
+ }
@@ -0,0 +1,67 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
7
+
8
+ #include <tchar.h>
9
+ #include <windows.h>
10
+
11
+ #include "encodings/compact_lang_det/win/cld_scopedptr.h"
12
+
13
+
14
+ #if (WINVER < 0x0600)
15
+ // Copied from winnls.h, we're not using the latest SDK yet.
16
+ typedef enum _NORM_FORM {
17
+ NormalizationOther = 0,
18
+ NormalizationC = 0x1,
19
+ NormalizationD = 0x2,
20
+ NormalizationKC = 0x5,
21
+ NormalizationKD = 0x6
22
+ } NORM_FORM;
23
+ #endif
24
+
25
+
26
+ // Gives you back a normalized version of the input text. Normalization is
27
+ // performed to the specified form.
28
+ // Instance lifetime should be within the lifetime span of the 'text'.
29
+ class NormalizedUnicodeText {
30
+ public:
31
+ // Creates an empty instance of NormalizedUnicodeText.
32
+ NormalizedUnicodeText();
33
+
34
+ // Creates a fully initialized instance of NormalizedUnicodeText.
35
+ // [in] normalization_form - normalization rule set (see MSDN for details).
36
+ // [in] text - zero-terminated UTF-16 encoded string.
37
+ // Returns 0 in case of success, Win32 error code in case of failure.
38
+ // In case of failure, get() returns the original text.
39
+ DWORD Normalize(NORM_FORM normalization_form, const WCHAR* text);
40
+
41
+ // Returns pointer to the normalized text.
42
+ const WCHAR* get() const { return normalized_text_; }
43
+
44
+ private:
45
+ // Normalizes 'text' by the 'normalization_form' rules.
46
+ // [in] normalization_form - normalization rule set (see MSDN for details).
47
+ // [in] text - zero-terminated UTF-16 encoded string.
48
+ // [out] error_code - Win32 error code.
49
+ const WCHAR* TryToNormalizeText(NORM_FORM normalization_form,
50
+ const WCHAR* text, DWORD *error_code);
51
+
52
+ // Pointer to the normalized text.
53
+ const WCHAR* normalized_text_;
54
+ // When the source text is already normalized by the requested normalization
55
+ // form, text_ is not used and normalized_text_ just points to the source
56
+ // text. When the source text requires normalization, text_ contains normalized
57
+ // version of the source text and normalized_text_ points to this buffer.
58
+ // Since CLD requires NormalizationC form and the overwhelming majority of all
59
+ // text on the Internet is already normalized to this form, it's expected
60
+ // that this class will not introduce any runtime memory overhead.
61
+ scoped_array<WCHAR> text_;
62
+
63
+ DISALLOW_COPY_AND_ASSIGN(NormalizedUnicodeText);
64
+ };
65
+
66
+
67
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
@@ -0,0 +1,12 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/public/encodings.h"
6
+
7
+
8
+ // This function is not used; it exists only to satisfy the compiler and to
9
+ // minimize changes to the ported code.
10
+ const char * EncodingName(const Encoding enc) {
11
+ return "";
12
+ }