cld-fixed 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +27 -0
  6. data/README.md +34 -0
  7. data/Rakefile +5 -0
  8. data/cld.gemspec +22 -0
  9. data/ext/cld/Makefile.am +28 -0
  10. data/ext/cld/Makefile.in +790 -0
  11. data/ext/cld/aclocal.m4 +8895 -0
  12. data/ext/cld/base/basictypes.h +348 -0
  13. data/ext/cld/base/build_config.h +115 -0
  14. data/ext/cld/base/casts.h +156 -0
  15. data/ext/cld/base/commandlineflags.h +443 -0
  16. data/ext/cld/base/crash.h +41 -0
  17. data/ext/cld/base/dynamic_annotations.h +358 -0
  18. data/ext/cld/base/global_strip_options.h +59 -0
  19. data/ext/cld/base/log_severity.h +46 -0
  20. data/ext/cld/base/logging.h +1403 -0
  21. data/ext/cld/base/macros.h +243 -0
  22. data/ext/cld/base/port.h +54 -0
  23. data/ext/cld/base/scoped_ptr.h +428 -0
  24. data/ext/cld/base/stl_decl.h +0 -0
  25. data/ext/cld/base/stl_decl_msvc.h +107 -0
  26. data/ext/cld/base/string_util.h +29 -0
  27. data/ext/cld/base/strtoint.h +93 -0
  28. data/ext/cld/base/template_util.h +96 -0
  29. data/ext/cld/base/type_traits.h +198 -0
  30. data/ext/cld/base/vlog_is_on.h +143 -0
  31. data/ext/cld/build_aux/config.guess +1500 -0
  32. data/ext/cld/build_aux/config.sub +1616 -0
  33. data/ext/cld/build_aux/depcomp +584 -0
  34. data/ext/cld/build_aux/install-sh +507 -0
  35. data/ext/cld/build_aux/ltmain.sh +8745 -0
  36. data/ext/cld/build_aux/missing +367 -0
  37. data/ext/cld/cld_encodings.h +95 -0
  38. data/ext/cld/configure +17362 -0
  39. data/ext/cld/configure.ac +14 -0
  40. data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
  41. data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
  42. data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  43. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  44. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  45. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  46. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  47. data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
  48. data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
  49. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  50. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  51. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  52. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  53. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  54. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  55. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  56. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  57. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  58. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  59. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  60. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  61. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  62. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  63. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  64. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  65. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  66. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  67. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  68. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  69. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  70. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  71. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  72. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  73. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  74. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  75. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  76. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  77. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  78. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  79. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  80. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  81. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  82. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  83. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  84. data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  85. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  86. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  87. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  88. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  89. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  90. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  91. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  92. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  93. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  94. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  95. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  96. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  97. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  98. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  99. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  100. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  101. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  102. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  103. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  104. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  105. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  106. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  107. data/ext/cld/encodings/internal/encodings.cc +12 -0
  108. data/ext/cld/encodings/lang_enc.h +254 -0
  109. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  110. data/ext/cld/encodings/public/encodings.h +301 -0
  111. data/ext/cld/extconf.rb +7 -0
  112. data/ext/cld/languages/internal/#languages.cc# +337 -0
  113. data/ext/cld/languages/internal/languages.cc +336 -0
  114. data/ext/cld/languages/proto/languages.pb.h +179 -0
  115. data/ext/cld/languages/public/languages.h +379 -0
  116. data/ext/cld/thunk.cc +55 -0
  117. data/lib/cld.rb +21 -0
  118. data/lib/cld/version.rb +3 -0
  119. data/spec/cld_spec.rb +67 -0
  120. data/spec/spec_helper.rb +6 -0
  121. metadata +193 -0
@@ -0,0 +1,224 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
6
+
7
+ // Return true if current Tbl pointer is within state0 range
8
+ // Note that unsigned compare checks both ends of range simultaneously
9
+ static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
10
+ const uint8* Tbl0 = &st->state_table[st->state0];
11
+ return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
12
+ }
13
+
14
+
15
+ // Look up property of one UTF-8 character and advance over it
16
+ // Return 0 if input length is zero
17
+ // Return 0 and advance one byte if input is ill-formed
18
+ uint8 UTF8GenericProperty(const UTF8PropObj* st,
19
+ const uint8** src,
20
+ int* srclen) {
21
+ if (*srclen <= 0) {
22
+ return 0;
23
+ }
24
+
25
+ const uint8* lsrc = *src;
26
+ const uint8* Tbl_0 = &st->state_table[st->state0];
27
+ const uint8* Tbl = Tbl_0;
28
+ int e;
29
+ int eshift = st->entry_shift;
30
+
31
+ // Short series of tests faster than switch, optimizes 7-bit ASCII
32
+ unsigned char c = lsrc[0];
33
+ if (static_cast<signed char>(c) >= 0) { // one byte
34
+ e = Tbl[c];
35
+ *src += 1;
36
+ *srclen -= 1;
37
+ } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
38
+ e = Tbl[c];
39
+ Tbl = &Tbl_0[e << eshift];
40
+ e = Tbl[lsrc[1]];
41
+ *src += 2;
42
+ *srclen -= 2;
43
+ } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
44
+ e = Tbl[c];
45
+ Tbl = &Tbl_0[e << eshift];
46
+ e = Tbl[lsrc[1]];
47
+ Tbl = &Tbl_0[e << eshift];
48
+ e = Tbl[lsrc[2]];
49
+ *src += 3;
50
+ *srclen -= 3;
51
+ }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
52
+ e = Tbl[c];
53
+ Tbl = &Tbl_0[e << eshift];
54
+ e = Tbl[lsrc[1]];
55
+ Tbl = &Tbl_0[e << eshift];
56
+ e = Tbl[lsrc[2]];
57
+ Tbl = &Tbl_0[e << eshift];
58
+ e = Tbl[lsrc[3]];
59
+ *src += 4;
60
+ *srclen -= 4;
61
+ } else { // Ill-formed
62
+ e = 0;
63
+ *src += 1;
64
+ *srclen -= 1;
65
+ }
66
+ return e;
67
+ }
68
+
69
+ // BigOneByte versions are needed for tables > 240 states, but most
70
+ // won't need the TwoByte versions.
71
+ // Internally, to next-to-last offset is multiplied by 16 and the last
72
+ // offset is relative instead of absolute.
73
+ // Look up property of one UTF-8 character and advance over it
74
+ // Return 0 if input length is zero
75
+ // Return 0 and advance one byte if input is ill-formed
76
+ uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
77
+ const uint8** src,
78
+ int* srclen) {
79
+ if (*srclen <= 0) {
80
+ return 0;
81
+ }
82
+
83
+ const uint8* lsrc = *src;
84
+ const uint8* Tbl_0 = &st->state_table[st->state0];
85
+ const uint8* Tbl = Tbl_0;
86
+ int e;
87
+ int eshift = st->entry_shift;
88
+
89
+ // Short series of tests faster than switch, optimizes 7-bit ASCII
90
+ unsigned char c = lsrc[0];
91
+ if (static_cast<signed char>(c) >= 0) { // one byte
92
+ e = Tbl[c];
93
+ *src += 1;
94
+ *srclen -= 1;
95
+ } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes
96
+ e = Tbl[c];
97
+ Tbl = &Tbl_0[e << eshift];
98
+ e = Tbl[lsrc[1]];
99
+ *src += 2;
100
+ *srclen -= 2;
101
+ } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes
102
+ e = Tbl[c];
103
+ Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
104
+ e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];
105
+ Tbl = &Tbl[e << eshift]; // Relative +/-
106
+ e = Tbl[lsrc[2]];
107
+ *src += 3;
108
+ *srclen -= 3;
109
+ }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes
110
+ e = Tbl[c];
111
+ Tbl = &Tbl_0[e << eshift];
112
+ e = Tbl[lsrc[1]];
113
+ Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range
114
+ e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];
115
+ Tbl = &Tbl[e << eshift]; // Relative +/-
116
+ e = Tbl[lsrc[3]];
117
+ *src += 4;
118
+ *srclen -= 4;
119
+ } else { // Ill-formed
120
+ e = 0;
121
+ *src += 1;
122
+ *srclen -= 1;
123
+ }
124
+ return e;
125
+ }
126
+
127
+ // Scan a UTF-8 stringpiece based on a state table.
128
+ // Always scan complete UTF-8 characters
129
+ // Set number of bytes scanned. Return reason for exiting
130
+ int UTF8GenericScan(const UTF8ScanObj* st,
131
+ const uint8* str,
132
+ const int len,
133
+ int* bytes_consumed) {
134
+ int eshift = st->entry_shift; // 6 (space optimized) or 8
135
+ // int nEntries = (1 << eshift); // 64 or 256 entries per state
136
+
137
+ const uint8* isrc = str;
138
+ //reinterpret_cast<const uint8*>(str.data());
139
+ const uint8* src = isrc;
140
+ //const int len = str.length();
141
+ const uint8* srclimit = isrc + len;
142
+ const uint8* srclimit8 = srclimit - 7;
143
+ *bytes_consumed = 0;
144
+ if (len == 0) return kExitOK;
145
+
146
+ const uint8* Tbl_0 = &st->state_table[st->state0];
147
+
148
+ DoAgain:
149
+ // Do state-table scan
150
+ int e = 0;
151
+ uint8 c;
152
+
153
+ // Do fast for groups of 8 identity bytes.
154
+ // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
155
+ // including slowing slightly on cr/lf/ht
156
+ //----------------------------
157
+ const uint8* Tbl2 = &st->fast_state[0];
158
+ uint32 losub = st->losub;
159
+ uint32 hiadd = st->hiadd;
160
+ while (src < srclimit8) {
161
+ uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
162
+ uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
163
+ src += 8;
164
+ // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
165
+ uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
166
+ (s4567 - losub) | (s4567 + hiadd);
167
+ if ((temp & 0x80808080) != 0) {
168
+ // We typically end up here on cr/lf/ht; src was incremented
169
+ int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
170
+ (Tbl2[src[-6]] | Tbl2[src[-5]]);
171
+ if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange
172
+ e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
173
+ (Tbl2[src[-2]] | Tbl2[src[-1]]);
174
+ if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange
175
+ // Else OK, go around again
176
+ }
177
+ }
178
+ //----------------------------
179
+
180
+ // Byte-at-a-time scan
181
+ //----------------------------
182
+ const uint8* Tbl = Tbl_0;
183
+ while (src < srclimit) {
184
+ c = *src;
185
+ e = Tbl[c];
186
+ src++;
187
+ if (e >= kExitIllegalStructure) {break;}
188
+ Tbl = &Tbl_0[e << eshift];
189
+ }
190
+ //----------------------------
191
+
192
+
193
+ // Exit posibilities:
194
+ // Some exit code, !state0, back up over last char
195
+ // Some exit code, state0, back up one byte exactly
196
+ // source consumed, !state0, back up over partial char
197
+ // source consumed, state0, exit OK
198
+ // For illegal byte in state0, avoid backup up over PREVIOUS char
199
+ // For truncated last char, back up to beginning of it
200
+
201
+ if (e >= kExitIllegalStructure) {
202
+ // Back up over exactly one byte of rejected/illegal UTF-8 character
203
+ src--;
204
+ // Back up more if needed
205
+ if (!InStateZero(st, Tbl)) {
206
+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
207
+ }
208
+ } else if (!InStateZero(st, Tbl)) {
209
+ // Back up over truncated UTF-8 character
210
+ e = kExitIllegalStructure;
211
+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
212
+ } else {
213
+ // Normal termination, source fully consumed
214
+ e = kExitOK;
215
+ }
216
+
217
+ if (e == kExitDoAgain) {
218
+ // Loop back up to the fast scan
219
+ goto DoAgain;
220
+ }
221
+
222
+ *bytes_consumed = src - isrc;
223
+ return e;
224
+ }
@@ -0,0 +1,141 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
7
+
8
+ #if !defined(CLD_WINDOWS)
9
+
10
+ #include "util/utf8/utf8statetable.h"
11
+
12
+ #else
13
+
14
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
15
+
16
+ // These four-byte entries compactly encode how many bytes 0..255 to delete
17
+ // in making a string replacement, how many bytes to add 0..255, and the offset
18
+ // 0..64k-1 of the replacement string in remap_string.
19
+ struct RemapEntry {
20
+ uint8 delete_bytes;
21
+ uint8 add_bytes;
22
+ uint16 bytes_offset;
23
+ };
24
+
25
// Exit codes for state tables with one-byte entries.
// kExitDstSpaceFull is only produced by executable code; every other code is
// stored directly in table entries. So that they can be told apart from
// next-state entries, the codes are contiguous and none exceeds kExitNone.
typedef enum {
  kExitDstSpaceFull     = 239,
  kExitIllegalStructure = 240,
  kExitOK               = 241,
  kExitReject           = 242,
  kExitReplace1         = 243,
  kExitReplace2         = 244,
  kExitReplace3         = 245,
  kExitReplace21        = 246,
  kExitReplace31        = 247,
  kExitReplace32        = 248,
  kExitReplaceOffset1   = 249,
  kExitReplaceOffset2   = 250,
  kExitReplace1S0       = 251,
  kExitSpecial          = 252,
  kExitDoAgain          = 253,
  kExitRejectAlt        = 254,
  kExitNone             = 255
} ExitReason;
48
+
49
// Exit codes for the two-byte-entry tables: the same sequence of reasons as
// ExitReason, numbered consecutively from -32769 up to -32753.
typedef enum {
  kExitDstSpaceFull_2     = -32769,
  kExitIllegalStructure_2 = -32768,
  kExitOK_2               = -32767,
  kExitReject_2           = -32766,
  kExitReplace1_2         = -32765,
  kExitReplace2_2         = -32764,
  kExitReplace3_2         = -32763,
  kExitReplace21_2        = -32762,
  kExitReplace31_2        = -32761,
  kExitReplace32_2        = -32760,
  kExitReplaceOffset1_2   = -32759,
  kExitReplaceOffset2_2   = -32758,
  kExitReplace1S0_2       = -32757,
  kExitSpecial_2          = -32756,
  kExitDoAgain_2          = -32755,
  kExitRejectAlt_2        = -32754,
  kExitNone_2             = -32753
} ExitReason_2;
68
+
69
+ // This struct represents one entire state table. The three initialized byte
70
+ // areas are state_table, remap_base, and remap_string. state0 and state0_size
71
+ // give the byte offset and length within state_table of the initial state --
72
+ // table lookups are expected to start and end in this state, but for
73
+ // truncated UTF-8 strings, may end in a different state. These allow a quick
74
+ // test for that condition. entry_shift is 8 for tables subscripted by a full
75
+ // byte value and 6 for space-optimized tables subscripted by only six
76
+ // significant bits in UTF-8 continuation bytes.
77
+ typedef struct {
78
+ const uint32 state0;
79
+ const uint32 state0_size;
80
+ const uint32 total_size;
81
+ const int max_expand;
82
+ const int entry_shift;
83
+ const int bytes_per_entry;
84
+ const uint32 losub;
85
+ const uint32 hiadd;
86
+ const uint8* state_table;
87
+ const RemapEntry* remap_base;
88
+ const uint8* remap_string;
89
+ const uint8* fast_state;
90
+ } UTF8StateMachineObj;
91
+
92
+ // Near-duplicate declaration for tables with two-byte entries
93
+ typedef struct {
94
+ const uint32 state0;
95
+ const uint32 state0_size;
96
+ const uint32 total_size;
97
+ const int max_expand;
98
+ const int entry_shift;
99
+ const int bytes_per_entry;
100
+ const uint32 losub;
101
+ const uint32 hiadd;
102
+ const signed short* state_table;
103
+ const RemapEntry* remap_base;
104
+ const uint8* remap_string;
105
+ const uint8* fast_state;
106
+ } UTF8StateMachineObj_2;
107
+
108
+
109
+ typedef UTF8StateMachineObj UTF8PropObj;
110
+ typedef UTF8StateMachineObj UTF8ScanObj;
111
+ typedef UTF8StateMachineObj_2 UTF8PropObj_2;
112
+
113
+
114
+ // Look up property of one UTF-8 character and advance over it
115
+ // Return 0 if input length is zero
116
+ // Return 0 and advance one byte if input is ill-formed
117
+ uint8 UTF8GenericProperty(const UTF8PropObj* st,
118
+ const uint8** src,
119
+ int* srclen);
120
+
121
+ // BigOneByte versions are needed for tables > 240 states, but most
122
+ // won't need the TwoByte versions.
123
+
124
+ // Look up property of one UTF-8 character and advance over it
125
+ // Return 0 if input length is zero
126
+ // Return 0 and advance one byte if input is ill-formed
127
+ uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
128
+ const uint8** src,
129
+ int* srclen);
130
+
131
+ // Scan a UTF-8 stringpiece based on a state table.
132
+ // Always scan complete UTF-8 characters
133
+ // Set number of bytes scanned. Return reason for exiting
134
+ int UTF8GenericScan(const UTF8ScanObj* st,
135
+ const uint8* str,
136
+ const int len,
137
+ int* bytes_consumed);
138
+
139
+ #endif
140
+
141
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
@@ -0,0 +1,22 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
7
+
8
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
9
+
10
+ namespace cld {
11
+
12
+ // Scan a UTF-8 stringpiece based on a state table.
13
+ // Always scan complete UTF-8 characters
14
+ // Set number of bytes scanned. Return reason for exiting
15
+ int UTF8GenericScan(const UTF8ScanObj* st,
16
+ const char* src,
17
+ int len,
18
+ int* bytes_consumed);
19
+
20
+ } // namespace cld
21
+
22
+ #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8UTILS_H_
@@ -0,0 +1,18 @@
1
+ // Copyright 2009 Google Inc. All Rights Reserved.
2
+ // Author: alekseys@google.com (Aleksey Shlyapnikov)
3
+
4
+ // This code is not actually used; it was copied here for reference only.
5
+ // See cld_utf8utils_windows.cc for the Windows version of this code.
6
+
7
+ #include "cld/encodings/compact_lang_det/win/cld_utf8utils.h"
8
+
9
+ #include "cld/util/utf8/utf8statetable.h"
10
+
11
+ namespace cld {
12
+
13
+ int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
14
+ int* bytes_consumed) {
15
+ return ::UTF8GenericScan(st, StringPiece(src, len), bytes_consumed);
16
+ }
17
+
18
+ } // namespace cld
@@ -0,0 +1,17 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/cld_utf8utils.h"
6
+
7
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
8
+
9
+ namespace cld {
10
+
11
+ int UTF8GenericScan(const UTF8ScanObj* st, const char* src, int len,
12
+ int* bytes_consumed) {
13
+ return ::UTF8GenericScan(st, reinterpret_cast<const uint8*>(src), len,
14
+ bytes_consumed);
15
+ }
16
+
17
+ } // namespace cld
@@ -0,0 +1,172 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/win/normalizedunicodetext.h"
6
+
7
+ #include <tchar.h>
8
+ #include <windows.h>
9
+ #include <winnls.h>
10
+
11
+ #include "encodings/compact_lang_det/win/cld_scopedptr.h"
12
+
13
+
14
+ namespace {
15
+
16
+ // Function prototypes copied from MSDN.
17
+ typedef BOOL (WINAPI *IsNormalizedStringFunction)(NORM_FORM NormForm,
18
+ LPCWSTR lpSrcString,
19
+ int cwSrcLength);
20
+ typedef int (WINAPI *NormalizeStringFunction)(NORM_FORM NormForm,
21
+ LPCWSTR lpSrcString,
22
+ int cwSrcLength,
23
+ LPWSTR lpDstString,
24
+ int cwDstLength);
25
+
26
+ // A class to provide an access to Normaliz.dll functions.
27
+ // New normalization API implemented in Normaliz.dll is available starting
28
+ // from Windows XP SP2, that's why we have to bind to it dynamically.
29
+ class NormalizationAPI {
30
+ public:
31
+ // Creates fully initialized NormalizationAPI object.
32
+ // Loads DLL and binds all referenced functions.
33
+ NormalizationAPI()
34
+ : library_(_T("Normaliz.dll")) {
35
+ if (library_.IsValid()) {
36
+ is_normalized_string_.Bind(library_.handle(), "IsNormalizedString");
37
+ normalize_string_.Bind(library_.handle(), "NormalizeString");
38
+ }
39
+ }
40
+
41
+ // Proxy functions for the ones loaded from DLL.
42
+ BOOL IsNormalizedString(NORM_FORM NormForm, LPCWSTR lpSrcString,
43
+ int cwSrcLength) {
44
+ if (!is_normalized_string_.IsValid())
45
+ return FALSE;
46
+ return is_normalized_string_.function()(NormForm, lpSrcString, cwSrcLength);
47
+ }
48
+ int NormalizeString(NORM_FORM NormForm, LPCWSTR lpSrcString, int cwSrcLength,
49
+ LPWSTR lpDstString, int cwDstLength) {
50
+ if (!normalize_string_.IsValid()) {
51
+ ::SetLastError(ERROR_INVALID_FUNCTION);
52
+ return 0;
53
+ }
54
+ return normalize_string_.function()(NormForm, lpSrcString, cwSrcLength,
55
+ lpDstString, cwDstLength);
56
+ }
57
+
58
+ // Returns true if all functions were bound successfully.
59
+ // This implies that library_ itself was loaded successfully.
60
+ bool IsValid() const {
61
+ return is_normalized_string_.IsValid() && normalize_string_.IsValid();
62
+ }
63
+
64
+ private:
65
+ // Holds a handle to loaded Normaliz.dll.
66
+ ScopedLibrary library_;
67
+ // Pointers to the functions loaded from Normaliz.dll.
68
+ FunctionFromDll<IsNormalizedStringFunction> is_normalized_string_;
69
+ FunctionFromDll<NormalizeStringFunction> normalize_string_;
70
+
71
+ DISALLOW_COPY_AND_ASSIGN(NormalizationAPI);
72
+ };
73
+
74
+ static NormalizationAPI normalization_api;
75
+
76
+ } // namespace
77
+
78
+
79
+ // NormalizedUnicodeText
80
+
81
+ NormalizedUnicodeText::NormalizedUnicodeText()
82
+ : normalized_text_(NULL) {
83
+ }
84
+
85
+
86
+ DWORD NormalizedUnicodeText::Normalize(NORM_FORM normalization_form,
87
+ const WCHAR* text) {
88
+ DWORD result = 0;
89
+ normalized_text_ = TryToNormalizeText(normalization_form, text, &result);
90
+ return result;
91
+ }
92
+
93
+
94
+ const WCHAR* NormalizedUnicodeText::TryToNormalizeText(
95
+ NORM_FORM normalization_form, const WCHAR* text, DWORD *error_code) {
96
+ if (!text) {
97
+ text_.reset();
98
+ return text;
99
+ }
100
+ _ASSERT(NULL != error_code);
101
+ if (!error_code)
102
+ return text;
103
+
104
+ if (!normalization_api.IsValid()) {
105
+ // Fall back to the previous version of normalization API.
106
+ int folded_text_size = ::FoldStringW(MAP_PRECOMPOSED, text, -1, NULL, 0);
107
+ if (!folded_text_size) {
108
+ *error_code = ::GetLastError();
109
+ return text;
110
+ }
111
+
112
+ text_.reset(new WCHAR[folded_text_size]);
113
+ if (!text_.get()) {
114
+ *error_code = ERROR_OUTOFMEMORY;
115
+ return text;
116
+ }
117
+
118
+ int folding_result =
119
+ ::FoldStringW(MAP_PRECOMPOSED, text, -1, text_.get(), folded_text_size);
120
+ if (!folding_result) {
121
+ *error_code = ::GetLastError();
122
+ text_.reset();
123
+ return text;
124
+ }
125
+
126
+ return text_.get();
127
+ }
128
+
129
+ // No need to allocate anything when text is already normalized.
130
+ if (normalization_api.IsNormalizedString(normalization_form, text, -1))
131
+ return text;
132
+
133
+ // Get the first approximation for the buffer size required to store
134
+ // normalized text.
135
+ int normalized_text_size_guess =
136
+ normalization_api.NormalizeString(normalization_form, text, -1, NULL, 0);
137
+
138
+ while (normalized_text_size_guess > 0) {
139
+ text_.reset(new WCHAR[normalized_text_size_guess]);
140
+ if (!text_.get()) {
141
+ *error_code = ERROR_OUTOFMEMORY;
142
+ break;
143
+ }
144
+
145
+ int normalized_text_size =
146
+ normalization_api.NormalizeString(normalization_form, text, -1,
147
+ text_.get(),
148
+ normalized_text_size_guess);
149
+
150
+ if (normalized_text_size > 0) {
151
+ // Text was successfully converted.
152
+ return text_.get();
153
+ }
154
+
155
+ if (ERROR_INSUFFICIENT_BUFFER != ::GetLastError()) {
156
+ *error_code = ::GetLastError();
157
+ // Text cannot be normalized, use the original.
158
+ // By the way, ERROR_SUCCESS is a puzzling case.
159
+ // MSDN says 'The action completed successfully but yielded no results'.
160
+ // Does this mean that output buffer was not changed?
161
+ // Anyway, just in case, also return the original text.
162
+ break;
163
+ }
164
+
165
+ // Try again with the corrected buffer size.
166
+ normalized_text_size_guess = -normalized_text_size;
167
+ }
168
+
169
+ // Use the original text in case of any problem with normalization.
170
+ text_.reset();
171
+ return text;
172
+ }