language_detection 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +11 -0
- data/ext/cld/Makefile +34 -0
- data/ext/cld/base/basictypes.h +348 -0
- data/ext/cld/base/build_config.h +124 -0
- data/ext/cld/base/casts.h +156 -0
- data/ext/cld/base/commandlineflags.h +443 -0
- data/ext/cld/base/crash.h +41 -0
- data/ext/cld/base/dynamic_annotations.h +358 -0
- data/ext/cld/base/global_strip_options.h +59 -0
- data/ext/cld/base/log_severity.h +46 -0
- data/ext/cld/base/logging.h +1403 -0
- data/ext/cld/base/macros.h +243 -0
- data/ext/cld/base/port.h +54 -0
- data/ext/cld/base/scoped_ptr.h +428 -0
- data/ext/cld/base/stl_decl.h +0 -0
- data/ext/cld/base/stl_decl_msvc.h +107 -0
- data/ext/cld/base/string_util.h +29 -0
- data/ext/cld/base/strtoint.h +93 -0
- data/ext/cld/base/template_util.h +96 -0
- data/ext/cld/base/type_traits.h +198 -0
- data/ext/cld/base/vlog_is_on.h +143 -0
- data/ext/cld/cld.so +0 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
- data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
- data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
- data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
- data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/ext/cld/encodings/internal/encodings.cc +12 -0
- data/ext/cld/encodings/lang_enc.h +254 -0
- data/ext/cld/encodings/proto/encodings.pb.h +169 -0
- data/ext/cld/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +1 -0
- data/ext/cld/language_detection.cc +88 -0
- data/ext/cld/languages/internal/languages.cc +337 -0
- data/ext/cld/languages/proto/languages.pb.h +179 -0
- data/ext/cld/languages/public/languages.h +379 -0
- data/language_detection.gemspec +28 -0
- data/lib/language_detection/string.rb +1 -0
- data/lib/language_detection/version.rb +3 -0
- data/lib/language_detection.rb +54 -0
- data/test/_helper.rb +15 -0
- data/test/fixtures/languages.csv +80 -0
- data/test/language_detection_test.rb +88 -0
- metadata +250 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
|
7
|
+
|
|
8
|
+
#if !defined(CLD_WINDOWS)
|
|
9
|
+
|
|
10
|
+
#include "util/utf8/utf8statetable.h"
|
|
11
|
+
|
|
12
|
+
#else
|
|
13
|
+
|
|
14
|
+
#include "encodings/compact_lang_det/win/cld_basictypes.h"
|
|
15
|
+
|
|
16
|
+
// These four-byte entries compactly encode how many bytes 0..255 to delete
|
|
17
|
+
// in making a string replacement, how many bytes to add 0..255, and the offset
|
|
18
|
+
// 0..64k-1 of the replacement string in remap_string.
|
|
19
|
+
struct RemapEntry {
|
|
20
|
+
uint8 delete_bytes;
|
|
21
|
+
uint8 add_bytes;
|
|
22
|
+
uint16 bytes_offset;
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
// Exit type codes for state tables. Every code except the first is stored in
// a one-byte table entry; kExitDstSpaceFull is only generated by executable
// code. To distinguish them from next-state entries, the codes must be
// contiguous and all <= kExitNone.
typedef enum {
  kExitDstSpaceFull     = 239,
  kExitIllegalStructure = 240,
  kExitOK               = 241,
  kExitReject           = 242,
  kExitReplace1         = 243,
  kExitReplace2         = 244,
  kExitReplace3         = 245,
  kExitReplace21        = 246,
  kExitReplace31        = 247,
  kExitReplace32        = 248,
  kExitReplaceOffset1   = 249,
  kExitReplaceOffset2   = 250,
  kExitReplace1S0       = 251,
  kExitSpecial          = 252,
  kExitDoAgain          = 253,
  kExitRejectAlt        = 254,
  kExitNone             = 255
} ExitReason;
|
|
48
|
+
|
|
49
|
+
// The same set of exit codes as ExitReason, suffixed with _2 and numbered
// contiguously from -32769 up to -32753.
// NOTE(review): presumably these pair with the two-byte state tables
// (UTF8StateMachineObj_2) -- confirm against the table-driving code.
typedef enum {
  kExitDstSpaceFull_2     = -32769,
  kExitIllegalStructure_2 = -32768,
  kExitOK_2               = -32767,
  kExitReject_2           = -32766,
  kExitReplace1_2         = -32765,
  kExitReplace2_2         = -32764,
  kExitReplace3_2         = -32763,
  kExitReplace21_2        = -32762,
  kExitReplace31_2        = -32761,
  kExitReplace32_2        = -32760,
  kExitReplaceOffset1_2   = -32759,
  kExitReplaceOffset2_2   = -32758,
  kExitReplace1S0_2       = -32757,
  kExitSpecial_2          = -32756,
  kExitDoAgain_2          = -32755,
  kExitRejectAlt_2        = -32754,
  kExitNone_2             = -32753
} ExitReason_2;
|
|
68
|
+
|
|
69
|
+
// This struct represents one entire state table. The three initialized byte
|
|
70
|
+
// areas are state_table, remap_base, and remap_string. state0 and state0_size
|
|
71
|
+
// give the byte offset and length within state_table of the initial state --
|
|
72
|
+
// table lookups are expected to start and end in this state, but for
|
|
73
|
+
// truncated UTF-8 strings, may end in a different state. These allow a quick
|
|
74
|
+
// test for that condition. entry_shift is 8 for tables subscripted by a full
|
|
75
|
+
// byte value and 6 for space-optimized tables subscripted by only six
|
|
76
|
+
// significant bits in UTF-8 continuation bytes.
|
|
77
|
+
typedef struct {
|
|
78
|
+
const uint32 state0;
|
|
79
|
+
const uint32 state0_size;
|
|
80
|
+
const uint32 total_size;
|
|
81
|
+
const int max_expand;
|
|
82
|
+
const int entry_shift;
|
|
83
|
+
const int bytes_per_entry;
|
|
84
|
+
const uint32 losub;
|
|
85
|
+
const uint32 hiadd;
|
|
86
|
+
const uint8* state_table;
|
|
87
|
+
const RemapEntry* remap_base;
|
|
88
|
+
const uint8* remap_string;
|
|
89
|
+
const uint8* fast_state;
|
|
90
|
+
} UTF8StateMachineObj;
|
|
91
|
+
|
|
92
|
+
// Near-duplicate declaration for tables with two-byte entries
|
|
93
|
+
typedef struct {
|
|
94
|
+
const uint32 state0;
|
|
95
|
+
const uint32 state0_size;
|
|
96
|
+
const uint32 total_size;
|
|
97
|
+
const int max_expand;
|
|
98
|
+
const int entry_shift;
|
|
99
|
+
const int bytes_per_entry;
|
|
100
|
+
const uint32 losub;
|
|
101
|
+
const uint32 hiadd;
|
|
102
|
+
const signed short* state_table;
|
|
103
|
+
const RemapEntry* remap_base;
|
|
104
|
+
const uint8* remap_string;
|
|
105
|
+
const uint8* fast_state;
|
|
106
|
+
} UTF8StateMachineObj_2;
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
typedef UTF8StateMachineObj UTF8PropObj;
|
|
110
|
+
typedef UTF8StateMachineObj UTF8ScanObj;
|
|
111
|
+
typedef UTF8StateMachineObj_2 UTF8PropObj_2;
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
// Look up property of one UTF-8 character and advance over it
|
|
115
|
+
// Return 0 if input length is zero
|
|
116
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
117
|
+
uint8 UTF8GenericProperty(const UTF8PropObj* st,
|
|
118
|
+
const uint8** src,
|
|
119
|
+
int* srclen);
|
|
120
|
+
|
|
121
|
+
// BigOneByte versions are needed for tables > 240 states, but most
|
|
122
|
+
// won't need the TwoByte versions.
|
|
123
|
+
|
|
124
|
+
// Look up property of one UTF-8 character and advance over it
|
|
125
|
+
// Return 0 if input length is zero
|
|
126
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
127
|
+
uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
|
|
128
|
+
const uint8** src,
|
|
129
|
+
int* srclen);
|
|
130
|
+
|
|
131
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
|
132
|
+
// Always scan complete UTF-8 characters
|
|
133
|
+
// Set number of bytes scanned. Return reason for exiting
|
|
134
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
|
135
|
+
const uint8* str,
|
|
136
|
+
const int len,
|
|
137
|
+
int* bytes_consumed);
|
|
138
|
+
|
|
139
|
+
#endif
|
|
140
|
+
|
|
141
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
|
7
|
+
|
|
8
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
9
|
+
|
|
10
|
+
namespace cld {
|
|
11
|
+
|
|
12
|
+
// Scan a UTF-8 stringpiece based on a state table.
|
|
13
|
+
// Always scan complete UTF-8 characters
|
|
14
|
+
// Set number of bytes scanned. Return reason for exiting
|
|
15
|
+
int UTF8GenericScan(const UTF8ScanObj* st,
|
|
16
|
+
const char* src,
|
|
17
|
+
int len,
|
|
18
|
+
int* bytes_consumed);
|
|
19
|
+
|
|
20
|
+
} // namespace cld
|
|
21
|
+
|
|
22
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Copyright 2009 Google Inc. All Rights Reserved.
|
|
2
|
+
// Author: alekseys@google.com (Aleksey Shlyapnikov)
|
|
3
|
+
|
|
4
|
+
// This code is not actually used; it was copied here for reference only.
|
|
5
|
+
// See cld_htmlutils_windows.cc for Windows version of this code.
|
|
6
|
+
|
|
7
|
+
#include "cld/encodings/compact_lang_det/win/cld_utf8utils.h"
|
|
8
|
+
|
|
9
|
+
#include "cld/util/utf8/utf8statetable.h"
|
|
10
|
+
|
|
11
|
+
namespace cld {
|
|
12
|
+
|
|
13
|
+
int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
|
|
14
|
+
int* bytes_consumed) {
|
|
15
|
+
return ::UTF8GenericScan(st, StringPiece(src, len), bytes_consumed);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
} // namespace cld
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/cld_utf8utils.h"
|
|
6
|
+
|
|
7
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
8
|
+
|
|
9
|
+
namespace cld {
|
|
10
|
+
|
|
11
|
+
int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
|
|
12
|
+
int* bytes_consumed) {
|
|
13
|
+
return ::UTF8GenericScan(st, reinterpret_cast<const uint8*>(src), len,
|
|
14
|
+
bytes_consumed);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
} // namespace cld
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/compact_lang_det/win/normalizedunicodetext.h"
|
|
6
|
+
|
|
7
|
+
#include <tchar.h>
|
|
8
|
+
#include <windows.h>
|
|
9
|
+
#include <winnls.h>
|
|
10
|
+
|
|
11
|
+
#include "encodings/compact_lang_det/win/cld_scopedptr.h"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
namespace {
|
|
15
|
+
|
|
16
|
+
// Function prototypes copied from MSDN.
|
|
17
|
+
typedef BOOL (WINAPI *IsNormalizedStringFunction)(NORM_FORM NormForm,
|
|
18
|
+
LPCWSTR lpSrcString,
|
|
19
|
+
int cwSrcLength);
|
|
20
|
+
typedef int (WINAPI *NormalizeStringFunction)(NORM_FORM NormForm,
|
|
21
|
+
LPCWSTR lpSrcString,
|
|
22
|
+
int cwSrcLength,
|
|
23
|
+
LPWSTR lpDstString,
|
|
24
|
+
int cwDstLength);
|
|
25
|
+
|
|
26
|
+
// A class to provide an access to Normaliz.dll functions.
|
|
27
|
+
// New normalization API implemented in Normaliz.dll is available starting
|
|
28
|
+
// from Windows XP SP2, that's why we have to bind to it dynamically.
|
|
29
|
+
class NormalizationAPI {
|
|
30
|
+
public:
|
|
31
|
+
// Creates fully initialized NormalizationAPI object.
|
|
32
|
+
// Loads DLL and binds all referenced functions.
|
|
33
|
+
NormalizationAPI()
|
|
34
|
+
: library_(_T("Normaliz.dll")) {
|
|
35
|
+
if (library_.IsValid()) {
|
|
36
|
+
is_normalized_string_.Bind(library_.handle(), "IsNormalizedString");
|
|
37
|
+
normalize_string_.Bind(library_.handle(), "NormalizeString");
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Proxy functions for the ones loaded from DLL.
|
|
42
|
+
BOOL IsNormalizedString(NORM_FORM NormForm, LPCWSTR lpSrcString,
|
|
43
|
+
int cwSrcLength) {
|
|
44
|
+
if (!is_normalized_string_.IsValid())
|
|
45
|
+
return FALSE;
|
|
46
|
+
return is_normalized_string_.function()(NormForm, lpSrcString, cwSrcLength);
|
|
47
|
+
}
|
|
48
|
+
int NormalizeString(NORM_FORM NormForm, LPCWSTR lpSrcString, int cwSrcLength,
|
|
49
|
+
LPWSTR lpDstString, int cwDstLength) {
|
|
50
|
+
if (!normalize_string_.IsValid()) {
|
|
51
|
+
::SetLastError(ERROR_INVALID_FUNCTION);
|
|
52
|
+
return 0;
|
|
53
|
+
}
|
|
54
|
+
return normalize_string_.function()(NormForm, lpSrcString, cwSrcLength,
|
|
55
|
+
lpDstString, cwDstLength);
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// Returns true if all functions were bound successfully.
|
|
59
|
+
// This implies that library_ itself was loaded successfully.
|
|
60
|
+
bool IsValid() const {
|
|
61
|
+
return is_normalized_string_.IsValid() && normalize_string_.IsValid();
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
private:
|
|
65
|
+
// Holds a handle to loaded Normaliz.dll.
|
|
66
|
+
ScopedLibrary library_;
|
|
67
|
+
// Pointers to the functions loaded from Normaliz.dll.
|
|
68
|
+
FunctionFromDll<IsNormalizedStringFunction> is_normalized_string_;
|
|
69
|
+
FunctionFromDll<NormalizeStringFunction> normalize_string_;
|
|
70
|
+
|
|
71
|
+
DISALLOW_COPY_AND_ASSIGN(NormalizationAPI);
|
|
72
|
+
};
|
|
73
|
+
|
|
74
|
+
static NormalizationAPI normalization_api;
|
|
75
|
+
|
|
76
|
+
} // namespace
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
// NormalizedUnicodeText
|
|
80
|
+
|
|
81
|
+
// Creates an empty instance: get() returns NULL until Normalize() is called.
NormalizedUnicodeText::NormalizedUnicodeText() : normalized_text_(NULL) {}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
// Normalizes 'text' to 'normalization_form' and remembers the resulting
// pointer for get(). Returns the error code reported by TryToNormalizeText,
// which stays 0 when no error was reported.
DWORD NormalizedUnicodeText::Normalize(NORM_FORM normalization_form,
                                       const WCHAR* text) {
  DWORD error_code = 0;
  normalized_text_ =
      TryToNormalizeText(normalization_form, text, &error_code);
  return error_code;
}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
// Normalizes 'text' by the 'normalization_form' rules.
//
// [in]  normalization_form - normalization rule set (see MSDN for details).
// [in]  text - zero-terminated UTF-16 encoded string.
// [out] error_code - receives a Win32 error code when normalization fails;
//       it is left untouched on success.
//
// Returns a pointer to the normalized text. This is either 'text' itself
// (when it is NULL, already normalized, or normalization failed) or the
// buffer owned by text_.
const WCHAR* NormalizedUnicodeText::TryToNormalizeText(
    NORM_FORM normalization_form, const WCHAR* text, DWORD *error_code) {
  if (!text) {
    text_.reset();
    return text;
  }
  _ASSERT(NULL != error_code);
  if (!error_code)
    return text;

  if (!normalization_api.IsValid()) {
    // Fall back to the previous version of normalization API.
    // Note: this path always folds with MAP_PRECOMPOSED and does not consult
    // normalization_form.
    int folded_text_size = ::FoldStringW(MAP_PRECOMPOSED, text, -1, NULL, 0);
    if (!folded_text_size) {
      *error_code = ::GetLastError();
      return text;
    }

    text_.reset(new WCHAR[folded_text_size]);
    if (!text_.get()) {
      *error_code = ERROR_OUTOFMEMORY;
      return text;
    }

    int folding_result =
        ::FoldStringW(MAP_PRECOMPOSED, text, -1, text_.get(), folded_text_size);
    if (!folding_result) {
      // Read the error before releasing the buffer.
      *error_code = ::GetLastError();
      text_.reset();
      return text;
    }

    return text_.get();
  }

  // No need to allocate anything when text is already normalized.
  if (normalization_api.IsNormalizedString(normalization_form, text, -1))
    return text;

  // Get the first approximation for the buffer size required to store
  // normalized text.
  int normalized_text_size_guess =
      normalization_api.NormalizeString(normalization_form, text, -1, NULL, 0);
  if (normalized_text_size_guess <= 0) {
    // The size query itself failed. Report why, so that the caller does not
    // see a success code for text that was never normalized.
    *error_code = ::GetLastError();
  }

  while (normalized_text_size_guess > 0) {
    text_.reset(new WCHAR[normalized_text_size_guess]);
    if (!text_.get()) {
      *error_code = ERROR_OUTOFMEMORY;
      break;
    }

    int normalized_text_size =
        normalization_api.NormalizeString(normalization_form, text, -1,
                                          text_.get(),
                                          normalized_text_size_guess);

    if (normalized_text_size > 0) {
      // Text was successfully converted.
      return text_.get();
    }

    const DWORD last_error = ::GetLastError();
    if (ERROR_INSUFFICIENT_BUFFER != last_error) {
      *error_code = last_error;
      // Text cannot be normalized, use the original.
      // By the way, ERROR_SUCCESS is a puzzling case.
      // MSDN says 'The action completed successfully but yielded no results'.
      // Does this mean that output buffer was not changed?
      // Anyway, just in case, also return the original text.
      break;
    }

    // Try again with the corrected buffer size (the API returns the negated
    // size estimate when the buffer was too small).
    normalized_text_size_guess = -normalized_text_size;
    if (normalized_text_size_guess <= 0) {
      // No usable size estimate came back; the loop ends here, so report the
      // failure instead of returning silently.
      *error_code = last_error;
    }
  }

  // Use the original text in case of any problem with normalization.
  text_.reset();
  return text;
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#ifndef ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
|
|
6
|
+
#define ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
|
|
7
|
+
|
|
8
|
+
#include <tchar.h>
|
|
9
|
+
#include <windows.h>
|
|
10
|
+
|
|
11
|
+
#include "encodings/compact_lang_det/win/cld_scopedptr.h"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
#if (WINVER < 0x0600)
// Declaration copied from winnls.h, because this build is not using the
// latest SDK yet.
typedef enum _NORM_FORM {
  NormalizationOther = 0,
  NormalizationC     = 0x1,
  NormalizationD     = 0x2,
  NormalizationKC    = 0x5,
  NormalizationKD    = 0x6
} NORM_FORM;
#endif
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
// Gives you back a normalized version of the input text. Normalization is
|
|
27
|
+
// performed to the specified form.
|
|
28
|
+
// Instance lifetime should be within the lifetime span of the 'text'.
|
|
29
|
+
class NormalizedUnicodeText {
|
|
30
|
+
public:
|
|
31
|
+
// Creates an empty instance of NormalizedUnicodeText.
|
|
32
|
+
NormalizedUnicodeText();
|
|
33
|
+
|
|
34
|
+
// Creates a fully initialized instance of NormalizedUnicodeText.
|
|
35
|
+
// [in] normalization_form - normalization rule set (see MSDN for details).
|
|
36
|
+
// [in] text - zero-terminated UTF-16 encoded string.
|
|
37
|
+
// Returns 0 in case of success, Win32 error code in case of failure.
|
|
38
|
+
// In case of failure, get() returns the original text.
|
|
39
|
+
DWORD Normalize(NORM_FORM normalization_form, const WCHAR* text);
|
|
40
|
+
|
|
41
|
+
// Returns pointer to the normalized text.
|
|
42
|
+
const WCHAR* get() const { return normalized_text_; }
|
|
43
|
+
|
|
44
|
+
private:
|
|
45
|
+
// Normalizes 'text' by the 'normalization_form' rules.
|
|
46
|
+
// [in] normalization_form - normalization rule set (see MSDN for details).
|
|
47
|
+
// [in] text - zero-terminated UTF-16 encoded string.
|
|
48
|
+
// [out] error_code - Win32 error code.
|
|
49
|
+
const WCHAR* TryToNormalizeText(NORM_FORM normalization_form,
|
|
50
|
+
const WCHAR* text, DWORD *error_code);
|
|
51
|
+
|
|
52
|
+
// Pointer to the normalized text.
|
|
53
|
+
const WCHAR* normalized_text_;
|
|
54
|
+
// When the source text is already normalized by the requested normalization
|
|
55
|
+
// form, text_ is not used and normalized_text_ just points to the source
|
|
56
|
+
// text. When the source text requres normalization, text_ contains normalized
|
|
57
|
+
// version of the source text and normalized_text_ points to this buffer.
|
|
58
|
+
// Since CLD requires NormalizationC form and the overwhelming majority of all
|
|
59
|
+
// texts in the Internet is already normalized to this form, it's expected
|
|
60
|
+
// that this class will not introduce any runtime memory overhead.
|
|
61
|
+
scoped_array<WCHAR> text_;
|
|
62
|
+
|
|
63
|
+
DISALLOW_COPY_AND_ASSIGN(NormalizedUnicodeText);
|
|
64
|
+
};
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_NORMALIZEDUNICODETEXT_H_
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include "encodings/public/encodings.h"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
// We do not use it, just to please a compiler and minimize ported
|
|
9
|
+
// code changes.
|
|
10
|
+
const char * EncodingName(const Encoding enc) {
|
|
11
|
+
return "";
|
|
12
|
+
}
|