language_detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
@@ -0,0 +1,141 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
7
|
+
|
8
|
+
#if !defined(CLD_WINDOWS)
|
9
|
+
|
10
|
+
#include "util/utf8/utf8statetable.h"
|
11
|
+
|
12
|
+
#else
|
13
|
+
|
14
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
15
|
+
|
16
|
+
// These four-byte entries compactly encode how many bytes 0..255 to delete
|
17
|
+
// in making a string replacement, how many bytes to add 0..255, and the offset
|
18
|
+
// 0..64k-1 of the replacement string in remap_string.
|
19
|
+
struct RemapEntry {
|
20
|
+
uint8 delete_bytes;
|
21
|
+
uint8 add_bytes;
|
22
|
+
uint16 bytes_offset;
|
23
|
+
};
|
24
|
+
|
25
|
+
// Exit codes used by the one-byte state tables. Every code except the first
// is stored directly in a one-byte table entry; kExitDstSpaceFull is produced
// only by executable code. So that they can be told apart from next-state
// entries, the codes must form one contiguous run ending at kExitNone.
// The values are spelled out explicitly because table data depends on them.
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure = 240,
  kExitOK = 241,
  kExitReject = 242,
  kExitReplace1 = 243,
  kExitReplace2 = 244,
  kExitReplace3 = 245,
  kExitReplace21 = 246,
  kExitReplace31 = 247,
  kExitReplace32 = 248,
  kExitReplaceOffset1 = 249,
  kExitReplaceOffset2 = 250,
  kExitReplace1S0 = 251,
  kExitSpecial = 252,
  kExitDoAgain = 253,
  kExitRejectAlt = 254,
  kExitNone = 255
} ExitReason;
|
48
|
+
|
49
|
+
// Exit codes for the tables with two-byte entries. They mirror ExitReason
// one-for-one but sit at the bottom of the 16-bit signed range; the first
// code (-32769) lies outside that range.
// NOTE(review): presumably, like kExitDstSpaceFull, the first code is only
// ever produced by executable code and never stored in a table -- confirm.
typedef enum {
  kExitDstSpaceFull_2 = -32769,
  kExitIllegalStructure_2 = -32768,
  kExitOK_2 = -32767,
  kExitReject_2 = -32766,
  kExitReplace1_2 = -32765,
  kExitReplace2_2 = -32764,
  kExitReplace3_2 = -32763,
  kExitReplace21_2 = -32762,
  kExitReplace31_2 = -32761,
  kExitReplace32_2 = -32760,
  kExitReplaceOffset1_2 = -32759,
  kExitReplaceOffset2_2 = -32758,
  kExitReplace1S0_2 = -32757,
  kExitSpecial_2 = -32756,
  kExitDoAgain_2 = -32755,
  kExitRejectAlt_2 = -32754,
  kExitNone_2 = -32753
} ExitReason_2;
|
68
|
+
|
69
|
+
// This struct represents one entire state table. The three initialized byte
|
70
|
+
// areas are state_table, remap_base, and remap_string. state0 and state0_size
|
71
|
+
// give the byte offset and length within state_table of the initial state --
|
72
|
+
// table lookups are expected to start and end in this state, but for
|
73
|
+
// truncated UTF-8 strings, may end in a different state. These allow a quick
|
74
|
+
// test for that condition. entry_shift is 8 for tables subscripted by a full
|
75
|
+
// byte value and 6 for space-optimized tables subscripted by only six
|
76
|
+
// significant bits in UTF-8 continuation bytes.
|
77
|
+
typedef struct {
|
78
|
+
const uint32 state0;
|
79
|
+
const uint32 state0_size;
|
80
|
+
const uint32 total_size;
|
81
|
+
const int max_expand;
|
82
|
+
const int entry_shift;
|
83
|
+
const int bytes_per_entry;
|
84
|
+
const uint32 losub;
|
85
|
+
const uint32 hiadd;
|
86
|
+
const uint8* state_table;
|
87
|
+
const RemapEntry* remap_base;
|
88
|
+
const uint8* remap_string;
|
89
|
+
const uint8* fast_state;
|
90
|
+
} UTF8StateMachineObj;
|
91
|
+
|
92
|
+
// Near-duplicate declaration for tables with two-byte entries
|
93
|
+
typedef struct {
|
94
|
+
const uint32 state0;
|
95
|
+
const uint32 state0_size;
|
96
|
+
const uint32 total_size;
|
97
|
+
const int max_expand;
|
98
|
+
const int entry_shift;
|
99
|
+
const int bytes_per_entry;
|
100
|
+
const uint32 losub;
|
101
|
+
const uint32 hiadd;
|
102
|
+
const signed short* state_table;
|
103
|
+
const RemapEntry* remap_base;
|
104
|
+
const uint8* remap_string;
|
105
|
+
const uint8* fast_state;
|
106
|
+
} UTF8StateMachineObj_2;
|
107
|
+
|
108
|
+
|
109
|
+
typedef UTF8StateMachineObj UTF8PropObj;
|
110
|
+
typedef UTF8StateMachineObj UTF8ScanObj;
|
111
|
+
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
|
112
|
+
|
113
|
+
|
114
|
+
// Look up property of one UTF-8 character and advance over it
|
115
|
+
// Return 0 if input length is zero
|
116
|
+
// Return 0 and advance one byte if input is ill-formed
|
117
|
+
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
118
|
+
const uint8** src,
|
119
|
+
int* srclen);
|
120
|
+
|
121
|
+
// BigOneByte versions are needed for tables > 240 states, but most
|
122
|
+
// won't need the TwoByte versions.
|
123
|
+
|
124
|
+
// Look up property of one UTF-8 character and advance over it
|
125
|
+
// Return 0 if input length is zero
|
126
|
+
// Return 0 and advance one byte if input is ill-formed
|
127
|
+
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
128
|
+
const uint8** src,
|
129
|
+
int* srclen);
|
130
|
+
|
131
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
132
|
+
// Always scan complete UTF-8 characters
|
133
|
+
// Set number of bytes scanned. Return reason for exiting
|
134
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
135
|
+
const uint8* str,
|
136
|
+
const int len,
|
137
|
+
int* bytes_consumed);
|
138
|
+
|
139
|
+
#endif
|
140
|
+
|
141
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
@@ -0,0 +1,22 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
7
|
+
|
8
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
9
|
+
|
10
|
+
namespace cld {
|
11
|
+
|
12
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
13
|
+
// Always scan complete UTF-8 characters
|
14
|
+
// Set number of bytes scanned. Return reason for exiting
|
15
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
16
|
+
const char* src,
|
17
|
+
int len,
|
18
|
+
int* bytes_consumed);
|
19
|
+
|
20
|
+
} // namespace cld
|
21
|
+
|
22
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
@@ -0,0 +1,18 @@
|
|
1
|
+
// Copyright 2009 Google Inc. All Rights Reserved.
|
2
|
+
// Author: alekseys@google.com (Aleksey Shlyapnikov)
|
3
|
+
|
4
|
+
// This code is not actually used; it was copied here for reference only.
|
5
|
+
// See cld_utf8utils_windows.cc for the Windows version of this code.
|
6
|
+
|
7
|
+
#include "cld/encodings/compact_lang_det/win/cld_utf8utils.h"
|
8
|
+
|
9
|
+
#include "cld/util/utf8/utf8statetable.h"
|
10
|
+
|
11
|
+
namespace cld {
|
12
|
+
|
13
|
+
int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
|
14
|
+
int* bytes_consumed) {
|
15
|
+
return ::UTF8GenericScan(st, StringPiece(src, len), bytes_consumed);
|
16
|
+
}
|
17
|
+
|
18
|
+
} // namespace cld
|
@@ -0,0 +1,17 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/win/cld_utf8utils.h"
|
6
|
+
|
7
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
8
|
+
|
9
|
+
namespace cld {
|
10
|
+
|
11
|
+
int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
|
12
|
+
int* bytes_consumed) {
|
13
|
+
return ::UTF8GenericScan(st, reinterpret_cast<const uint8*>(src), len,
|
14
|
+
bytes_consumed);
|
15
|
+
}
|
16
|
+
|
17
|
+
} // namespace cld
|
@@ -0,0 +1,172 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/compact_lang_det/win/normalizedunicodetext.h"
|
6
|
+
|
7
|
+
#include <tchar.h>
|
8
|
+
#include <windows.h>
|
9
|
+
#include <winnls.h>
|
10
|
+
|
11
|
+
#include "encodings/compact_lang_det/win/cld_scopedptr.h"
|
12
|
+
|
13
|
+
|
14
|
+
namespace {
|
15
|
+
|
16
|
+
// Function prototypes copied from MSDN.
|
17
|
+
typedef BOOL (WINAPI *IsNormalizedStringFunction)(NORM_FORM NormForm,
|
18
|
+
LPCWSTR lpSrcString,
|
19
|
+
int cwSrcLength);
|
20
|
+
typedef int (WINAPI *NormalizeStringFunction)(NORM_FORM NormForm,
|
21
|
+
LPCWSTR lpSrcString,
|
22
|
+
int cwSrcLength,
|
23
|
+
LPWSTR lpDstString,
|
24
|
+
int cwDstLength);
|
25
|
+
|
26
|
+
// A class to provide an access to Normaliz.dll functions.
|
27
|
+
// New normalization API implemented in Normaliz.dll is available starting
|
28
|
+
// from Windows XP SP2, that's why we have to bind to it dynamically.
|
29
|
+
class NormalizationAPI {
|
30
|
+
public:
|
31
|
+
// Creates fully initialized NormalizationAPI object.
|
32
|
+
// Loads DLL and binds all referenced functions.
|
33
|
+
NormalizationAPI()
|
34
|
+
: library_(_T("Normaliz.dll")) {
|
35
|
+
if (library_.IsValid()) {
|
36
|
+
is_normalized_string_.Bind(library_.handle(), "IsNormalizedString");
|
37
|
+
normalize_string_.Bind(library_.handle(), "NormalizeString");
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
// Proxy functions for the ones loaded from DLL.
|
42
|
+
BOOL IsNormalizedString(NORM_FORM NormForm, LPCWSTR lpSrcString,
|
43
|
+
int cwSrcLength) {
|
44
|
+
if (!is_normalized_string_.IsValid())
|
45
|
+
return FALSE;
|
46
|
+
return is_normalized_string_.function()(NormForm, lpSrcString, cwSrcLength);
|
47
|
+
}
|
48
|
+
int NormalizeString(NORM_FORM NormForm, LPCWSTR lpSrcString, int cwSrcLength,
|
49
|
+
LPWSTR lpDstString, int cwDstLength) {
|
50
|
+
if (!normalize_string_.IsValid()) {
|
51
|
+
::SetLastError(ERROR_INVALID_FUNCTION);
|
52
|
+
return 0;
|
53
|
+
}
|
54
|
+
return normalize_string_.function()(NormForm, lpSrcString, cwSrcLength,
|
55
|
+
lpDstString, cwDstLength);
|
56
|
+
}
|
57
|
+
|
58
|
+
// Returns true if all functions were bound successfully.
|
59
|
+
// This implies that library_ itself was loaded successfully.
|
60
|
+
bool IsValid() const {
|
61
|
+
return is_normalized_string_.IsValid() && normalize_string_.IsValid();
|
62
|
+
}
|
63
|
+
|
64
|
+
private:
|
65
|
+
// Holds a handle to loaded Normaliz.dll.
|
66
|
+
ScopedLibrary library_;
|
67
|
+
// Pointers to the functions loaded from Normaliz.dll.
|
68
|
+
FunctionFromDll<IsNormalizedStringFunction> is_normalized_string_;
|
69
|
+
FunctionFromDll<NormalizeStringFunction> normalize_string_;
|
70
|
+
|
71
|
+
DISALLOW_COPY_AND_ASSIGN(NormalizationAPI);
|
72
|
+
};
|
73
|
+
|
74
|
+
static NormalizationAPI normalization_api;
|
75
|
+
|
76
|
+
} // namespace
|
77
|
+
|
78
|
+
|
79
|
+
// NormalizedUnicodeText
|
80
|
+
|
81
|
+
// Builds an empty object: no normalized text and no owned buffer.
NormalizedUnicodeText::NormalizedUnicodeText()
    : normalized_text_(NULL),
      text_() {}
|
84
|
+
|
85
|
+
|
86
|
+
// Normalizes `text` to `normalization_form` and remembers the result so that
// get() can return it. Returns 0, or the Win32 error code reported by
// TryToNormalizeText.
DWORD NormalizedUnicodeText::Normalize(NORM_FORM normalization_form,
                                       const WCHAR* text) {
  DWORD win32_error = 0;
  const WCHAR* const normalized =
      TryToNormalizeText(normalization_form, text, &win32_error);
  normalized_text_ = normalized;
  return win32_error;
}
|
92
|
+
|
93
|
+
|
94
|
+
// Produces a normalized version of `text`.
//
//   normalization_form - rule set to normalize to. Ignored by the fallback
//                        path, which always folds with MAP_PRECOMPOSED.
//   text               - zero-terminated UTF-16 string; may be NULL.
//   error_code         - [out] receives a Win32 error code when normalization
//                        fails. It is left untouched on success, so the
//                        caller must pre-initialize it (Normalize() passes 0).
//
// Returns `text` itself when it is NULL, already normalized, or cannot be
// normalized; otherwise returns a pointer into the buffer owned by text_.
const WCHAR* NormalizedUnicodeText::TryToNormalizeText(
    NORM_FORM normalization_form, const WCHAR* text, DWORD *error_code) {
  if (!text) {
    text_.reset();
    return text;
  }
  _ASSERT(NULL != error_code);
  if (!error_code)
    return text;

  if (!normalization_api.IsValid()) {
    // Normaliz.dll is unavailable: fall back to the older folding API.
    // The first call only asks for the required buffer size.
    int folded_text_size = ::FoldStringW(MAP_PRECOMPOSED, text, -1, NULL, 0);
    if (!folded_text_size) {
      *error_code = ::GetLastError();
      return text;
    }

    text_.reset(new WCHAR[folded_text_size]);
    if (!text_.get()) {
      *error_code = ERROR_OUTOFMEMORY;
      return text;
    }

    int folding_result =
        ::FoldStringW(MAP_PRECOMPOSED, text, -1, text_.get(), folded_text_size);
    if (!folding_result) {
      *error_code = ::GetLastError();
      text_.reset();
      return text;
    }

    return text_.get();
  }

  // No need to allocate anything when text is already normalized.
  if (normalization_api.IsNormalizedString(normalization_form, text, -1))
    return text;

  // Get the first approximation for the buffer size required to store
  // normalized text.
  int normalized_text_size_guess =
      normalization_api.NormalizeString(normalization_form, text, -1, NULL, 0);
  if (normalized_text_size_guess <= 0) {
    // The size query itself failed. Report why, otherwise the caller would
    // see "success" while receiving the non-normalized original text.
    *error_code = ::GetLastError();
  }

  while (normalized_text_size_guess > 0) {
    text_.reset(new WCHAR[normalized_text_size_guess]);
    if (!text_.get()) {
      *error_code = ERROR_OUTOFMEMORY;
      break;
    }

    int normalized_text_size =
        normalization_api.NormalizeString(normalization_form, text, -1,
                                          text_.get(),
                                          normalized_text_size_guess);

    if (normalized_text_size > 0) {
      // Text was successfully converted.
      return text_.get();
    }

    // Capture the error once, before any other call can overwrite it.
    const DWORD last_error = ::GetLastError();
    if (ERROR_INSUFFICIENT_BUFFER != last_error) {
      *error_code = last_error;
      // Text cannot be normalized, use the original.
      // ERROR_SUCCESS is a puzzling case: MSDN says 'The action completed
      // successfully but yielded no results'. It is unclear whether the
      // output buffer was written, so the original text is returned then too.
      break;
    }

    // The buffer was too small: the negated return value is the corrected
    // size. Try again with it.
    normalized_text_size_guess = -normalized_text_size;
    if (normalized_text_size_guess <= 0) {
      // No usable size was suggested; give up and report the failure instead
      // of silently falling out of the loop.
      *error_code = last_error;
    }
  }

  // Use the original text in case of any problem with normalization.
  text_.reset();
  return text;
}
|
@@ -0,0 +1,67 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
|
7
|
+
|
8
|
+
#include <tchar.h>
|
9
|
+
#include <windows.h>
|
10
|
+
|
11
|
+
#include "encodings/compact_lang_det/win/cld_scopedptr.h"
|
12
|
+
|
13
|
+
|
14
|
+
#if WINVER < 0x0600
// Older SDKs do not declare NORM_FORM, so the declaration from winnls.h is
// reproduced here. The numeric values must stay identical to the SDK's.
typedef enum _NORM_FORM {
  NormalizationOther = 0x0,
  NormalizationC     = 0x1,
  NormalizationD     = 0x2,
  NormalizationKC    = 0x5,
  NormalizationKD    = 0x6
} NORM_FORM;
#endif  // WINVER < 0x0600
|
24
|
+
|
25
|
+
|
26
|
+
// Gives you back a normalized version of the input text. Normalization is
|
27
|
+
// performed to the specified form.
|
28
|
+
// Instance lifetime should be within the lifetime span of the 'text'.
|
29
|
+
class NormalizedUnicodeText {
|
30
|
+
public:
|
31
|
+
// Creates an empty instance of NormalizedUnicodeText.
|
32
|
+
NormalizedUnicodeText();
|
33
|
+
|
34
|
+
// Creates a fully initialized instance of NormalizedUnicodeText.
|
35
|
+
// [in] normalization_form - normalization rule set (see MSDN for details).
|
36
|
+
// [in] text - zero-terminated UTF-16 encoded string.
|
37
|
+
// Returns 0 in case of success, Win32 error code in case of failure.
|
38
|
+
// In case of failure, get() returns the original text.
|
39
|
+
DWORD Normalize(NORM_FORM normalization_form, const WCHAR* text);
|
40
|
+
|
41
|
+
// Returns pointer to the normalized text.
|
42
|
+
const WCHAR* get() const { return normalized_text_; }
|
43
|
+
|
44
|
+
private:
|
45
|
+
// Normalizes 'text' by the 'normalization_form' rules.
|
46
|
+
// [in] normalization_form - normalization rule set (see MSDN for details).
|
47
|
+
// [in] text - zero-terminated UTF-16 encoded string.
|
48
|
+
// [out] error_code - Win32 error code.
|
49
|
+
const WCHAR* TryToNormalizeText(NORM_FORM normalization_form,
|
50
|
+
const WCHAR* text, DWORD *error_code);
|
51
|
+
|
52
|
+
// Pointer to the normalized text.
|
53
|
+
const WCHAR* normalized_text_;
|
54
|
+
// When the source text is already normalized by the requested normalization
|
55
|
+
// form, text_ is not used and normalized_text_ just points to the source
|
56
|
+
// text. When the source text requres normalization, text_ contains normalized
|
57
|
+
// version of the source text and normalized_text_ points to this buffer.
|
58
|
+
// Since CLD requires NormalizationC form and the overwhelming majority of all
|
59
|
+
// texts in the Internet is already normalized to this form, it's expected
|
60
|
+
// that this class will not introduce any runtime memory overhead.
|
61
|
+
scoped_array<WCHAR> text_;
|
62
|
+
|
63
|
+
DISALLOW_COPY_AND_ASSIGN(NormalizedUnicodeText);
|
64
|
+
};
|
65
|
+
|
66
|
+
|
67
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
|
@@ -0,0 +1,12 @@
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
3
|
+
// found in the LICENSE file.
|
4
|
+
|
5
|
+
#include "encodings/public/encodings.h"
|
6
|
+
|
7
|
+
|
8
|
+
// We do not use it, just to please a compiler and minimize ported
|
9
|
+
// code changes.
|
10
|
+
const char * EncodingName(const Encoding enc) {
|
11
|
+
return "";
|
12
|
+
}
|