tidy-ext 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/LICENSE +50 -0
- data/README +12 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/ext/tidy/access.c +3310 -0
- data/ext/tidy/access.h +279 -0
- data/ext/tidy/alloc.c +107 -0
- data/ext/tidy/attrask.c +209 -0
- data/ext/tidy/attrdict.c +2398 -0
- data/ext/tidy/attrdict.h +122 -0
- data/ext/tidy/attrget.c +213 -0
- data/ext/tidy/attrs.c +1911 -0
- data/ext/tidy/attrs.h +374 -0
- data/ext/tidy/buffio.c +232 -0
- data/ext/tidy/buffio.h +118 -0
- data/ext/tidy/charsets.c +1032 -0
- data/ext/tidy/charsets.h +14 -0
- data/ext/tidy/clean.c +2674 -0
- data/ext/tidy/clean.h +87 -0
- data/ext/tidy/config.c +1746 -0
- data/ext/tidy/config.h +153 -0
- data/ext/tidy/entities.c +419 -0
- data/ext/tidy/entities.h +24 -0
- data/ext/tidy/extconf.rb +5 -0
- data/ext/tidy/fileio.c +106 -0
- data/ext/tidy/fileio.h +46 -0
- data/ext/tidy/forward.h +69 -0
- data/ext/tidy/iconvtc.c +105 -0
- data/ext/tidy/iconvtc.h +15 -0
- data/ext/tidy/istack.c +373 -0
- data/ext/tidy/lexer.c +3825 -0
- data/ext/tidy/lexer.h +617 -0
- data/ext/tidy/localize.c +1882 -0
- data/ext/tidy/mappedio.c +329 -0
- data/ext/tidy/mappedio.h +16 -0
- data/ext/tidy/message.h +207 -0
- data/ext/tidy/parser.c +4408 -0
- data/ext/tidy/parser.h +76 -0
- data/ext/tidy/platform.h +636 -0
- data/ext/tidy/pprint.c +2276 -0
- data/ext/tidy/pprint.h +93 -0
- data/ext/tidy/ruby-tidy.c +195 -0
- data/ext/tidy/streamio.c +1407 -0
- data/ext/tidy/streamio.h +222 -0
- data/ext/tidy/tagask.c +286 -0
- data/ext/tidy/tags.c +955 -0
- data/ext/tidy/tags.h +235 -0
- data/ext/tidy/tidy-int.h +129 -0
- data/ext/tidy/tidy.h +1097 -0
- data/ext/tidy/tidyenum.h +622 -0
- data/ext/tidy/tidylib.c +1751 -0
- data/ext/tidy/tmbstr.c +306 -0
- data/ext/tidy/tmbstr.h +92 -0
- data/ext/tidy/utf8.c +539 -0
- data/ext/tidy/utf8.h +52 -0
- data/ext/tidy/version.h +14 -0
- data/ext/tidy/win32tc.c +795 -0
- data/ext/tidy/win32tc.h +19 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/tidy/compat_spec.rb +44 -0
- data/spec/tidy/remote_uri_spec.rb +14 -0
- data/spec/tidy/test1.html +5 -0
- data/spec/tidy/tidy_spec.rb +34 -0
- metadata +125 -0
data/ext/tidy/win32tc.c
ADDED
@@ -0,0 +1,795 @@
|
|
1
|
+
/* win32tc.c -- Interface to Win32 transcoding routines
|
2
|
+
|
3
|
+
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
|
4
|
+
See tidy.h for the copyright notice.
|
5
|
+
|
6
|
+
$Id: win32tc.c,v 1.12 2008/08/09 11:55:27 hoehrmann Exp $
|
7
|
+
*/
|
8
|
+
|
9
|
+
/* keep these here to keep file non-empty */
|
10
|
+
#include "tidy.h"
|
11
|
+
#include "forward.h"
|
12
|
+
#include "streamio.h"
|
13
|
+
#include "tmbstr.h"
|
14
|
+
#include "utf8.h"
|
15
|
+
|
16
|
+
#ifdef TIDY_WIN32_MLANG_SUPPORT
|
17
|
+
|
18
|
+
#define VC_EXTRALEAN
|
19
|
+
#define CINTERFACE
|
20
|
+
#define COBJMACROS
|
21
|
+
|
22
|
+
#include <windows.h>
|
23
|
+
#include <mlang.h>
|
24
|
+
|
25
|
+
#undef COBJMACROS
|
26
|
+
#undef CINTERFACE
|
27
|
+
#undef VC_EXTRALEAN
|
28
|
+
|
29
|
+
/* maximum number of bytes for a single character */
|
30
|
+
#define TC_INBUFSIZE 16
|
31
|
+
|
32
|
+
/* maximum number of characters per byte sequence */
|
33
|
+
#define TC_OUTBUFSIZE 16
|
34
|
+
|
35
|
+
/* Creates an IMLangConvertCharset COM object via CoCreateInstance and
 * stores the interface pointer in `p`.  The macro evaluates to the
 * HRESULT returned by CoCreateInstance.
 *
 * The expansion is a single expression with no trailing semicolon, so
 * the caller supplies the `;` and the macro can be used safely in an
 * unbraced if/else.  The argument is parenthesised before its address
 * is taken. */
#define CreateMLangObject(p) \
  CoCreateInstance( \
        &CLSID_CMLangConvertCharset, \
        NULL, \
        CLSCTX_ALL, \
        &IID_IMLangConvertCharset, \
        (VOID **)&(p))
|
42
|
+
|
43
|
+
|
44
|
+
/* Character Set to Microsoft Windows Codepage Identifier map, */
|
45
|
+
/* from <rotor/sscli/clr/src/classlibnative/nls/encodingdata.cpp>. */
|
46
|
+
|
47
|
+
/* note: the 'safe' field indicates whether this encoding can be */
|
48
|
+
/* read/written character-by-character; this does not apply to */
|
49
|
+
/* various stateful encodings such as ISO-2022 or UTF-7, these */
|
50
|
+
/* must be read/written as a complete stream. It is possible that */
|
51
|
+
/* some 'unsafe' encodings are marked as 'safe'. */
|
52
|
+
|
53
|
+
/* todo: cleanup; Tidy should use only a single mapping table to */
|
54
|
+
/* circumvent unsupported aliases in other transcoding libraries, */
|
55
|
+
/* enable reverse lookup of encoding names and ease maintenance. */
|
56
|
+
|
57
|
+
static struct _nameWinCPMap
|
58
|
+
{
|
59
|
+
tmbstr name;
|
60
|
+
uint wincp;
|
61
|
+
Bool safe;
|
62
|
+
} const NameWinCPMap[] = {
|
63
|
+
{ "cp037", 37, yes },
|
64
|
+
{ "csibm037", 37, yes },
|
65
|
+
{ "ebcdic-cp-ca", 37, yes },
|
66
|
+
{ "ebcdic-cp-nl", 37, yes },
|
67
|
+
{ "ebcdic-cp-us", 37, yes },
|
68
|
+
{ "ebcdic-cp-wt", 37, yes },
|
69
|
+
{ "ibm037", 37, yes },
|
70
|
+
{ "cp437", 437, yes },
|
71
|
+
{ "cspc8codepage437", 437, yes },
|
72
|
+
{ "ibm437", 437, yes },
|
73
|
+
{ "cp500", 500, yes },
|
74
|
+
{ "csibm500", 500, yes },
|
75
|
+
{ "ebcdic-cp-be", 500, yes },
|
76
|
+
{ "ebcdic-cp-ch", 500, yes },
|
77
|
+
{ "ibm500", 500, yes },
|
78
|
+
{ "asmo-708", 708, yes },
|
79
|
+
{ "dos-720", 720, yes },
|
80
|
+
{ "ibm737", 737, yes },
|
81
|
+
{ "ibm775", 775, yes },
|
82
|
+
{ "cp850", 850, yes },
|
83
|
+
{ "ibm850", 850, yes },
|
84
|
+
{ "cp852", 852, yes },
|
85
|
+
{ "ibm852", 852, yes },
|
86
|
+
{ "cp855", 855, yes },
|
87
|
+
{ "ibm855", 855, yes },
|
88
|
+
{ "cp857", 857, yes },
|
89
|
+
{ "ibm857", 857, yes },
|
90
|
+
{ "ccsid00858", 858, yes },
|
91
|
+
{ "cp00858", 858, yes },
|
92
|
+
{ "cp858", 858, yes },
|
93
|
+
{ "ibm00858", 858, yes },
|
94
|
+
{ "pc-multilingual-850+euro", 858, yes },
|
95
|
+
{ "cp860", 860, yes },
|
96
|
+
{ "ibm860", 860, yes },
|
97
|
+
{ "cp861", 861, yes },
|
98
|
+
{ "ibm861", 861, yes },
|
99
|
+
{ "cp862", 862, yes },
|
100
|
+
{ "dos-862", 862, yes },
|
101
|
+
{ "ibm862", 862, yes },
|
102
|
+
{ "cp863", 863, yes },
|
103
|
+
{ "ibm863", 863, yes },
|
104
|
+
{ "cp864", 864, yes },
|
105
|
+
{ "ibm864", 864, yes },
|
106
|
+
{ "cp865", 865, yes },
|
107
|
+
{ "ibm865", 865, yes },
|
108
|
+
{ "cp866", 866, yes },
|
109
|
+
{ "ibm866", 866, yes },
|
110
|
+
{ "cp869", 869, yes },
|
111
|
+
{ "ibm869", 869, yes },
|
112
|
+
{ "cp870", 870, yes },
|
113
|
+
{ "csibm870", 870, yes },
|
114
|
+
{ "ebcdic-cp-roece", 870, yes },
|
115
|
+
{ "ebcdic-cp-yu", 870, yes },
|
116
|
+
{ "ibm870", 870, yes },
|
117
|
+
{ "dos-874", 874, yes },
|
118
|
+
{ "iso-8859-11", 874, yes },
|
119
|
+
{ "tis-620", 874, yes },
|
120
|
+
{ "windows-874", 874, yes },
|
121
|
+
{ "cp875", 875, yes },
|
122
|
+
{ "csshiftjis", 932, yes },
|
123
|
+
{ "cswindows31j", 932, yes },
|
124
|
+
{ "ms_kanji", 932, yes },
|
125
|
+
{ "shift-jis", 932, yes },
|
126
|
+
{ "shift_jis", 932, yes },
|
127
|
+
{ "sjis", 932, yes },
|
128
|
+
{ "x-ms-cp932", 932, yes },
|
129
|
+
{ "x-sjis", 932, yes },
|
130
|
+
{ "chinese", 936, yes },
|
131
|
+
{ "cn-gb", 936, yes },
|
132
|
+
{ "csgb2312", 936, yes },
|
133
|
+
{ "csgb231280", 936, yes },
|
134
|
+
{ "csiso58gb231280", 936, yes },
|
135
|
+
{ "gb2312", 936, yes },
|
136
|
+
{ "gb2312-80", 936, yes },
|
137
|
+
{ "gb231280", 936, yes },
|
138
|
+
{ "gb_2312-80", 936, yes },
|
139
|
+
{ "gbk", 936, yes },
|
140
|
+
{ "iso-ir-58", 936, yes },
|
141
|
+
{ "csksc56011987", 949, yes },
|
142
|
+
{ "iso-ir-149", 949, yes },
|
143
|
+
{ "korean", 949, yes },
|
144
|
+
{ "ks-c-5601", 949, yes },
|
145
|
+
{ "ks-c5601", 949, yes },
|
146
|
+
{ "ks_c_5601", 949, yes },
|
147
|
+
{ "ks_c_5601-1987", 949, yes },
|
148
|
+
{ "ks_c_5601-1989", 949, yes },
|
149
|
+
{ "ks_c_5601_1987", 949, yes },
|
150
|
+
{ "ksc5601", 949, yes },
|
151
|
+
{ "ksc_5601", 949, yes },
|
152
|
+
{ "big5", 950, yes },
|
153
|
+
{ "big5-hkscs", 950, yes },
|
154
|
+
{ "cn-big5", 950, yes },
|
155
|
+
{ "csbig5", 950, yes },
|
156
|
+
{ "x-x-big5", 950, yes },
|
157
|
+
{ "cp1026", 1026, yes },
|
158
|
+
{ "csibm1026", 1026, yes },
|
159
|
+
{ "ibm1026", 1026, yes },
|
160
|
+
{ "ibm01047", 1047, yes },
|
161
|
+
{ "ccsid01140", 1140, yes },
|
162
|
+
{ "cp01140", 1140, yes },
|
163
|
+
{ "ebcdic-us-37+euro", 1140, yes },
|
164
|
+
{ "ibm01140", 1140, yes },
|
165
|
+
{ "ccsid01141", 1141, yes },
|
166
|
+
{ "cp01141", 1141, yes },
|
167
|
+
{ "ebcdic-de-273+euro", 1141, yes },
|
168
|
+
{ "ibm01141", 1141, yes },
|
169
|
+
{ "ccsid01142", 1142, yes },
|
170
|
+
{ "cp01142", 1142, yes },
|
171
|
+
{ "ebcdic-dk-277+euro", 1142, yes },
|
172
|
+
{ "ebcdic-no-277+euro", 1142, yes },
|
173
|
+
{ "ibm01142", 1142, yes },
|
174
|
+
{ "ccsid01143", 1143, yes },
|
175
|
+
{ "cp01143", 1143, yes },
|
176
|
+
{ "ebcdic-fi-278+euro", 1143, yes },
|
177
|
+
{ "ebcdic-se-278+euro", 1143, yes },
|
178
|
+
{ "ibm01143", 1143, yes },
|
179
|
+
{ "ccsid01144", 1144, yes },
|
180
|
+
{ "cp01144", 1144, yes },
|
181
|
+
{ "ebcdic-it-280+euro", 1144, yes },
|
182
|
+
{ "ibm01144", 1144, yes },
|
183
|
+
{ "ccsid01145", 1145, yes },
|
184
|
+
{ "cp01145", 1145, yes },
|
185
|
+
{ "ebcdic-es-284+euro", 1145, yes },
|
186
|
+
{ "ibm01145", 1145, yes },
|
187
|
+
{ "ccsid01146", 1146, yes },
|
188
|
+
{ "cp01146", 1146, yes },
|
189
|
+
{ "ebcdic-gb-285+euro", 1146, yes },
|
190
|
+
{ "ibm01146", 1146, yes },
|
191
|
+
{ "ccsid01147", 1147, yes },
|
192
|
+
{ "cp01147", 1147, yes },
|
193
|
+
{ "ebcdic-fr-297+euro", 1147, yes },
|
194
|
+
{ "ibm01147", 1147, yes },
|
195
|
+
{ "ccsid01148", 1148, yes },
|
196
|
+
{ "cp01148", 1148, yes },
|
197
|
+
{ "ebcdic-international-500+euro", 1148, yes },
|
198
|
+
{ "ibm01148", 1148, yes },
|
199
|
+
{ "ccsid01149", 1149, yes },
|
200
|
+
{ "cp01149", 1149, yes },
|
201
|
+
{ "ebcdic-is-871+euro", 1149, yes },
|
202
|
+
{ "ibm01149", 1149, yes },
|
203
|
+
{ "iso-10646-ucs-2", 1200, yes },
|
204
|
+
{ "ucs-2", 1200, yes },
|
205
|
+
{ "unicode", 1200, yes },
|
206
|
+
{ "utf-16", 1200, yes },
|
207
|
+
{ "utf-16le", 1200, yes },
|
208
|
+
{ "unicodefffe", 1201, yes },
|
209
|
+
{ "utf-16be", 1201, yes },
|
210
|
+
{ "windows-1250", 1250, yes },
|
211
|
+
{ "x-cp1250", 1250, yes },
|
212
|
+
{ "windows-1251", 1251, yes },
|
213
|
+
{ "x-cp1251", 1251, yes },
|
214
|
+
{ "windows-1252", 1252, yes },
|
215
|
+
{ "x-ansi", 1252, yes },
|
216
|
+
{ "windows-1253", 1253, yes },
|
217
|
+
{ "windows-1254", 1254, yes },
|
218
|
+
{ "windows-1255", 1255, yes },
|
219
|
+
{ "cp1256", 1256, yes },
|
220
|
+
{ "windows-1256", 1256, yes },
|
221
|
+
{ "windows-1257", 1257, yes },
|
222
|
+
{ "windows-1258", 1258, yes },
|
223
|
+
{ "johab", 1361, yes },
|
224
|
+
{ "macintosh", 10000, yes },
|
225
|
+
{ "x-mac-japanese", 10001, yes },
|
226
|
+
{ "x-mac-chinesetrad", 10002, yes },
|
227
|
+
{ "x-mac-korean", 10003, yes },
|
228
|
+
{ "x-mac-arabic", 10004, yes },
|
229
|
+
{ "x-mac-hebrew", 10005, yes },
|
230
|
+
{ "x-mac-greek", 10006, yes },
|
231
|
+
{ "x-mac-cyrillic", 10007, yes },
|
232
|
+
{ "x-mac-chinesesimp", 10008, yes },
|
233
|
+
{ "x-mac-romanian", 10010, yes },
|
234
|
+
{ "x-mac-ukrainian", 10017, yes },
|
235
|
+
{ "x-mac-thai", 10021, yes },
|
236
|
+
{ "x-mac-ce", 10029, yes },
|
237
|
+
{ "x-mac-icelandic", 10079, yes },
|
238
|
+
{ "x-mac-turkish", 10081, yes },
|
239
|
+
{ "x-mac-croatian", 10082, yes },
|
240
|
+
{ "x-chinese-cns", 20000, yes },
|
241
|
+
{ "x-cp20001", 20001, yes },
|
242
|
+
{ "x-chinese-eten", 20002, yes },
|
243
|
+
{ "x-cp20003", 20003, yes },
|
244
|
+
{ "x-cp20004", 20004, yes },
|
245
|
+
{ "x-cp20005", 20005, yes },
|
246
|
+
{ "irv", 20105, yes },
|
247
|
+
{ "x-ia5", 20105, yes },
|
248
|
+
{ "din_66003", 20106, yes },
|
249
|
+
{ "german", 20106, yes },
|
250
|
+
{ "x-ia5-german", 20106, yes },
|
251
|
+
{ "sen_850200_b", 20107, yes },
|
252
|
+
{ "swedish", 20107, yes },
|
253
|
+
{ "x-ia5-swedish", 20107, yes },
|
254
|
+
{ "norwegian", 20108, yes },
|
255
|
+
{ "ns_4551-1", 20108, yes },
|
256
|
+
{ "x-ia5-norwegian", 20108, yes },
|
257
|
+
{ "ansi_x3.4-1968", 20127, yes },
|
258
|
+
{ "ansi_x3.4-1986", 20127, yes },
|
259
|
+
{ "ascii", 20127, yes },
|
260
|
+
{ "cp367", 20127, yes },
|
261
|
+
{ "csascii", 20127, yes },
|
262
|
+
{ "ibm367", 20127, yes },
|
263
|
+
{ "iso-ir-6", 20127, yes },
|
264
|
+
{ "iso646-us", 20127, yes },
|
265
|
+
{ "iso_646.irv:1991", 20127, yes },
|
266
|
+
{ "us", 20127, yes },
|
267
|
+
{ "us-ascii", 20127, yes },
|
268
|
+
{ "x-cp20261", 20261, yes },
|
269
|
+
{ "x-cp20269", 20269, yes },
|
270
|
+
{ "cp273", 20273, yes },
|
271
|
+
{ "csibm273", 20273, yes },
|
272
|
+
{ "ibm273", 20273, yes },
|
273
|
+
{ "csibm277", 20277, yes },
|
274
|
+
{ "ebcdic-cp-dk", 20277, yes },
|
275
|
+
{ "ebcdic-cp-no", 20277, yes },
|
276
|
+
{ "ibm277", 20277, yes },
|
277
|
+
{ "cp278", 20278, yes },
|
278
|
+
{ "csibm278", 20278, yes },
|
279
|
+
{ "ebcdic-cp-fi", 20278, yes },
|
280
|
+
{ "ebcdic-cp-se", 20278, yes },
|
281
|
+
{ "ibm278", 20278, yes },
|
282
|
+
{ "cp280", 20280, yes },
|
283
|
+
{ "csibm280", 20280, yes },
|
284
|
+
{ "ebcdic-cp-it", 20280, yes },
|
285
|
+
{ "ibm280", 20280, yes },
|
286
|
+
{ "cp284", 20284, yes },
|
287
|
+
{ "csibm284", 20284, yes },
|
288
|
+
{ "ebcdic-cp-es", 20284, yes },
|
289
|
+
{ "ibm284", 20284, yes },
|
290
|
+
{ "cp285", 20285, yes },
|
291
|
+
{ "csibm285", 20285, yes },
|
292
|
+
{ "ebcdic-cp-gb", 20285, yes },
|
293
|
+
{ "ibm285", 20285, yes },
|
294
|
+
{ "cp290", 20290, yes },
|
295
|
+
{ "csibm290", 20290, yes },
|
296
|
+
{ "ebcdic-jp-kana", 20290, yes },
|
297
|
+
{ "ibm290", 20290, yes },
|
298
|
+
{ "cp297", 20297, yes },
|
299
|
+
{ "csibm297", 20297, yes },
|
300
|
+
{ "ebcdic-cp-fr", 20297, yes },
|
301
|
+
{ "ibm297", 20297, yes },
|
302
|
+
{ "cp420", 20420, yes },
|
303
|
+
{ "csibm420", 20420, yes },
|
304
|
+
{ "ebcdic-cp-ar1", 20420, yes },
|
305
|
+
{ "ibm420", 20420, yes },
|
306
|
+
{ "cp423", 20423, yes },
|
307
|
+
{ "csibm423", 20423, yes },
|
308
|
+
{ "ebcdic-cp-gr", 20423, yes },
|
309
|
+
{ "ibm423", 20423, yes },
|
310
|
+
{ "cp424", 20424, yes },
|
311
|
+
{ "csibm424", 20424, yes },
|
312
|
+
{ "ebcdic-cp-he", 20424, yes },
|
313
|
+
{ "ibm424", 20424, yes },
|
314
|
+
{ "x-ebcdic-koreanextended", 20833, yes },
|
315
|
+
{ "csibmthai", 20838, yes },
|
316
|
+
{ "ibm-thai", 20838, yes },
|
317
|
+
{ "cskoi8r", 20866, yes },
|
318
|
+
{ "koi", 20866, yes },
|
319
|
+
{ "koi8", 20866, yes },
|
320
|
+
{ "koi8-r", 20866, yes },
|
321
|
+
{ "koi8r", 20866, yes },
|
322
|
+
{ "cp871", 20871, yes },
|
323
|
+
{ "csibm871", 20871, yes },
|
324
|
+
{ "ebcdic-cp-is", 20871, yes },
|
325
|
+
{ "ibm871", 20871, yes },
|
326
|
+
{ "cp880", 20880, yes },
|
327
|
+
{ "csibm880", 20880, yes },
|
328
|
+
{ "ebcdic-cyrillic", 20880, yes },
|
329
|
+
{ "ibm880", 20880, yes },
|
330
|
+
{ "cp905", 20905, yes },
|
331
|
+
{ "csibm905", 20905, yes },
|
332
|
+
{ "ebcdic-cp-tr", 20905, yes },
|
333
|
+
{ "ibm905", 20905, yes },
|
334
|
+
{ "ccsid00924", 20924, yes },
|
335
|
+
{ "cp00924", 20924, yes },
|
336
|
+
{ "ebcdic-latin9--euro", 20924, yes },
|
337
|
+
{ "ibm00924", 20924, yes },
|
338
|
+
{ "x-cp20936", 20936, yes },
|
339
|
+
{ "x-cp20949", 20949, yes },
|
340
|
+
{ "cp1025", 21025, yes },
|
341
|
+
{ "x-cp21027", 21027, yes },
|
342
|
+
{ "koi8-ru", 21866, yes },
|
343
|
+
{ "koi8-u", 21866, yes },
|
344
|
+
{ "cp819", 28591, yes },
|
345
|
+
{ "csisolatin1", 28591, yes },
|
346
|
+
{ "ibm819", 28591, yes },
|
347
|
+
{ "iso-8859-1", 28591, yes },
|
348
|
+
{ "iso-ir-100", 28591, yes },
|
349
|
+
{ "iso8859-1", 28591, yes },
|
350
|
+
{ "iso_8859-1", 28591, yes },
|
351
|
+
{ "iso_8859-1:1987", 28591, yes },
|
352
|
+
{ "l1", 28591, yes },
|
353
|
+
{ "latin1", 28591, yes },
|
354
|
+
{ "csisolatin2", 28592, yes },
|
355
|
+
{ "iso-8859-2", 28592, yes },
|
356
|
+
{ "iso-ir-101", 28592, yes },
|
357
|
+
{ "iso8859-2", 28592, yes },
|
358
|
+
{ "iso_8859-2", 28592, yes },
|
359
|
+
{ "iso_8859-2:1987", 28592, yes },
|
360
|
+
{ "l2", 28592, yes },
|
361
|
+
{ "latin2", 28592, yes },
|
362
|
+
{ "csisolatin3", 28593, yes },
|
363
|
+
{ "iso-8859-3", 28593, yes },
|
364
|
+
{ "iso-ir-109", 28593, yes },
|
365
|
+
{ "iso_8859-3", 28593, yes },
|
366
|
+
{ "iso_8859-3:1988", 28593, yes },
|
367
|
+
{ "l3", 28593, yes },
|
368
|
+
{ "latin3", 28593, yes },
|
369
|
+
{ "csisolatin4", 28594, yes },
|
370
|
+
{ "iso-8859-4", 28594, yes },
|
371
|
+
{ "iso-ir-110", 28594, yes },
|
372
|
+
{ "iso_8859-4", 28594, yes },
|
373
|
+
{ "iso_8859-4:1988", 28594, yes },
|
374
|
+
{ "l4", 28594, yes },
|
375
|
+
{ "latin4", 28594, yes },
|
376
|
+
{ "csisolatincyrillic", 28595, yes },
|
377
|
+
{ "cyrillic", 28595, yes },
|
378
|
+
{ "iso-8859-5", 28595, yes },
|
379
|
+
{ "iso-ir-144", 28595, yes },
|
380
|
+
{ "iso_8859-5", 28595, yes },
|
381
|
+
{ "iso_8859-5:1988", 28595, yes },
|
382
|
+
{ "arabic", 28596, yes },
|
383
|
+
{ "csisolatinarabic", 28596, yes },
|
384
|
+
{ "ecma-114", 28596, yes },
|
385
|
+
{ "iso-8859-6", 28596, yes },
|
386
|
+
{ "iso-ir-127", 28596, yes },
|
387
|
+
{ "iso_8859-6", 28596, yes },
|
388
|
+
{ "iso_8859-6:1987", 28596, yes },
|
389
|
+
{ "csisolatingreek", 28597, yes },
|
390
|
+
{ "ecma-118", 28597, yes },
|
391
|
+
{ "elot_928", 28597, yes },
|
392
|
+
{ "greek", 28597, yes },
|
393
|
+
{ "greek8", 28597, yes },
|
394
|
+
{ "iso-8859-7", 28597, yes },
|
395
|
+
{ "iso-ir-126", 28597, yes },
|
396
|
+
{ "iso_8859-7", 28597, yes },
|
397
|
+
{ "iso_8859-7:1987", 28597, yes },
|
398
|
+
{ "csisolatinhebrew", 28598, yes },
|
399
|
+
{ "hebrew", 28598, yes },
|
400
|
+
{ "iso-8859-8", 28598, yes },
|
401
|
+
{ "iso-ir-138", 28598, yes },
|
402
|
+
{ "iso_8859-8", 28598, yes },
|
403
|
+
{ "iso_8859-8:1988", 28598, yes },
|
404
|
+
{ "logical", 28598, yes },
|
405
|
+
{ "visual", 28598, yes },
|
406
|
+
{ "csisolatin5", 28599, yes },
|
407
|
+
{ "iso-8859-9", 28599, yes },
|
408
|
+
{ "iso-ir-148", 28599, yes },
|
409
|
+
{ "iso_8859-9", 28599, yes },
|
410
|
+
{ "iso_8859-9:1989", 28599, yes },
|
411
|
+
{ "l5", 28599, yes },
|
412
|
+
{ "latin5", 28599, yes },
|
413
|
+
{ "iso-8859-13", 28603, yes },
|
414
|
+
{ "csisolatin9", 28605, yes },
|
415
|
+
{ "iso-8859-15", 28605, yes },
|
416
|
+
{ "iso_8859-15", 28605, yes },
|
417
|
+
{ "l9", 28605, yes },
|
418
|
+
{ "latin9", 28605, yes },
|
419
|
+
{ "x-europa", 29001, yes },
|
420
|
+
{ "iso-8859-8-i", 38598, yes },
|
421
|
+
{ "iso-2022-jp", 50220, no },
|
422
|
+
{ "csiso2022jp", 50221, no },
|
423
|
+
{ "csiso2022kr", 50225, no },
|
424
|
+
{ "iso-2022-kr", 50225, no },
|
425
|
+
{ "iso-2022-kr-7", 50225, no },
|
426
|
+
{ "iso-2022-kr-7bit", 50225, no },
|
427
|
+
{ "cp50227", 50227, no },
|
428
|
+
{ "x-cp50227", 50227, no },
|
429
|
+
{ "cp930", 50930, yes },
|
430
|
+
{ "x-ebcdic-japaneseanduscanada", 50931, yes },
|
431
|
+
{ "cp933", 50933, yes },
|
432
|
+
{ "cp935", 50935, yes },
|
433
|
+
{ "cp937", 50937, yes },
|
434
|
+
{ "cp939", 50939, yes },
|
435
|
+
{ "cseucpkdfmtjapanese", 51932, yes },
|
436
|
+
{ "euc-jp", 51932, yes },
|
437
|
+
{ "extended_unix_code_packed_format_for_japanese", 51932, yes },
|
438
|
+
{ "iso-2022-jpeuc", 51932, yes },
|
439
|
+
{ "x-euc", 51932, yes },
|
440
|
+
{ "x-euc-jp", 51932, yes },
|
441
|
+
{ "euc-cn", 51936, yes },
|
442
|
+
{ "x-euc-cn", 51936, yes },
|
443
|
+
{ "cseuckr", 51949, yes },
|
444
|
+
{ "euc-kr", 51949, yes },
|
445
|
+
{ "iso-2022-kr-8", 51949, yes },
|
446
|
+
{ "iso-2022-kr-8bit", 51949, yes },
|
447
|
+
{ "hz-gb-2312", 52936, no },
|
448
|
+
{ "gb18030", 54936, yes },
|
449
|
+
{ "x-iscii-de", 57002, yes },
|
450
|
+
{ "x-iscii-be", 57003, yes },
|
451
|
+
{ "x-iscii-ta", 57004, yes },
|
452
|
+
{ "x-iscii-te", 57005, yes },
|
453
|
+
{ "x-iscii-as", 57006, yes },
|
454
|
+
{ "x-iscii-or", 57007, yes },
|
455
|
+
{ "x-iscii-ka", 57008, yes },
|
456
|
+
{ "x-iscii-ma", 57009, yes },
|
457
|
+
{ "x-iscii-gu", 57010, yes },
|
458
|
+
{ "x-iscii-pa", 57011, yes },
|
459
|
+
{ "csunicode11utf7", 65000, no },
|
460
|
+
{ "unicode-1-1-utf-7", 65000, no },
|
461
|
+
{ "unicode-2-0-utf-7", 65000, no },
|
462
|
+
{ "utf-7", 65000, no },
|
463
|
+
{ "x-unicode-1-1-utf-7", 65000, no },
|
464
|
+
{ "x-unicode-2-0-utf-7", 65000, no },
|
465
|
+
{ "unicode-1-1-utf-8", 65001, yes },
|
466
|
+
{ "unicode-2-0-utf-8", 65001, yes },
|
467
|
+
{ "utf-8", 65001, yes },
|
468
|
+
{ "x-unicode-1-1-utf-8", 65001, yes },
|
469
|
+
{ "x-unicode-2-0-utf-8", 65001, yes },
|
470
|
+
|
471
|
+
/* final entry */
|
472
|
+
{ NULL, 0, no }
|
473
|
+
};
|
474
|
+
|
475
|
+
/* Looks up the Windows code page identifier for a character set name.
 *
 * allocator - used for a temporary lower-cased copy of `encoding`.
 * encoding  - character set name; it is lower-cased before being
 *             compared with the entries of NameWinCPMap.
 *
 * Returns the code page identifier, or 0 when
 *   - the name is not in NameWinCPMap,
 *   - the entry is not marked `safe`, or
 *   - an MLang converter from that code page to code page 1200 cannot
 *     be created or initialised.
 */
uint TY_(Win32MLangGetCPFromName)(TidyAllocator *allocator, ctmbstr encoding)
{
    uint i;
    tmbstr enc;

    /* ensure name is in lower case */
    enc = TY_(tmbstrdup)(allocator,encoding);
    enc = TY_(tmbstrtolower)(enc);

    for (i = 0; NameWinCPMap[i].name; ++i)
    {
        if (TY_(tmbstrcmp)(NameWinCPMap[i].name, enc) == 0)
        {
            IMLangConvertCharset * p = NULL;
            uint wincp = NameWinCPMap[i].wincp;
            HRESULT hrInit;
            HRESULT hr;

            /* the copy is no longer needed once a match is found */
            TidyFree(allocator, enc);

            /* currently no support for unsafe encodings */
            if (!NameWinCPMap[i].safe)
                return 0;

            /* hack for config.c */
            hrInit = CoInitialize(NULL);
            hr = CreateMLangObject(p);

            if (hr != S_OK || !p)
            {
                wincp = 0;
            }
            else
            {
                /* probe only: check that MLang accepts this code page */
                hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);

                if (hr != S_OK)
                    wincp = 0;

                IMLangConvertCharset_Release(p);
                p = NULL;
            }

            /* CoUninitialize must only balance a CoInitialize call that
               succeeded; a failed call must not be balanced. */
            if (SUCCEEDED(hrInit))
                CoUninitialize();

            return wincp;
        }
    }

    TidyFree(allocator, enc);
    return 0;
}
|
526
|
+
|
527
|
+
/* Attaches an MLang converter (from code page `wincp` to code page 1200)
 * to an input stream.
 *
 * in    - input stream; on success its `mlang` member receives the
 *         converter.  Must not be NULL.
 * wincp - Windows code page identifier; 0 means "no code page found".
 *
 * Returns `yes` on success and `no` otherwise.  On failure `in->mlang`
 * is left untouched.
 *
 * NOTE(review): CoInitialize() is called before any check and is not
 * balanced on the failure paths below.  Whether that is a leak depends
 * on whether callers invoke Win32MLangUninitInputTranscoder() (which
 * calls CoUninitialize() unconditionally) after a failed init - confirm
 * against the callers before changing it.
 */
Bool TY_(Win32MLangInitInputTranscoder)(StreamIn * in, uint wincp)
{
    IMLangConvertCharset * p = NULL;
    HRESULT hr;

    assert( in != NULL );

    CoInitialize(NULL);

    if (wincp == 0)
    {
        /* no codepage found for this encoding */
        return no;
    }

    hr = CreateMLangObject(p);

    if (hr != S_OK || !p)
    {
        /* MLang not supported */
        return no;
    }

    hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);

    if (hr != S_OK)
    {
        /* encoding not supported, insufficient memory, etc. */
        /* the converter is not stored anywhere, so release it here
           instead of leaking the COM object */
        IMLangConvertCharset_Release(p);
        return no;
    }

    in->mlang = p;

    return yes;
}
|
562
|
+
|
563
|
+
/* Detaches and releases the MLang converter of an input stream, if it
 * has one, then calls CoUninitialize().
 *
 * in - input stream; must not be NULL.  Its `mlang` member is NULL
 *      afterwards.
 */
void TY_(Win32MLangUninitInputTranscoder)(StreamIn * in)
{
    assert( in != NULL );

    if (in->mlang != NULL)
    {
        IMLangConvertCharset * converter = (IMLangConvertCharset *)in->mlang;

        IMLangConvertCharset_Release(converter);
        in->mlang = NULL;
    }

    /* called whether or not a converter was attached */
    CoUninitialize();
}
|
579
|
+
|
580
|
+
#if 0
|
581
|
+
/* Attaches an MLang converter (from code page 1200 to the code page of
 * `encoding`) to an output stream.
 *
 * allocator - passed on to Win32MLangGetCPFromName().
 * out       - output stream; on success its `mlang` member receives the
 *             converter.  Must not be NULL.
 * encoding  - character set name of the output encoding.
 *
 * Returns `yes` on success and `no` otherwise.
 *
 * NOTE(review): as in the input variant, CoInitialize() is not balanced
 * on the failure paths; confirm the intended pairing with
 * Win32MLangUninitOutputTranscoder() before enabling this code.
 */
Bool Win32MLangInitOutputTranscoder(TidyAllocator *allocator, StreamOut * out, tmbstr encoding)
{
    IMLangConvertCharset * p = NULL;
    HRESULT hr;
    uint wincp;

    assert( out != NULL );

    CoInitialize(NULL);

    wincp = TY_(Win32MLangGetCPFromName)(allocator, encoding);
    if (wincp == 0)
    {
        /* no codepage found for this encoding */
        return no;
    }

    hr = CreateMLangObject(p);

    if (hr != S_OK || !p)
    {
        /* MLang not supported */
        return no;
    }

    /* keep the result: without the assignment the check below would
       test the stale CoCreateInstance result and never fail */
    hr = IMLangConvertCharset_Initialize(p, 1200, wincp, MLCONVCHARF_NOBESTFITCHARS);

    if (hr != S_OK)
    {
        /* encoding not supported, insufficient memory, etc. */
        /* the converter is not stored anywhere, so release it here */
        IMLangConvertCharset_Release(p);
        return no;
    }

    out->mlang = p;

    return yes;
}
|
618
|
+
|
619
|
+
/* Detaches and releases the MLang converter of an output stream, if it
 * has one, then calls CoUninitialize().
 *
 * out - output stream; must not be NULL.  Its `mlang` member is NULL
 *       afterwards.
 */
void Win32MLangUninitOutputTranscoder(StreamOut * out)
{
    assert( out != NULL );

    if (out->mlang != NULL)
    {
        IMLangConvertCharset * converter = (IMLangConvertCharset *)out->mlang;

        IMLangConvertCharset_Release(converter);
        out->mlang = NULL;
    }

    /* called whether or not a converter was attached */
    CoUninitialize();
}
|
635
|
+
#endif
|
636
|
+
|
637
|
+
/* Reads one character from an input stream through its MLang converter.
 *
 * firstByte - first byte of the character, already read by the caller.
 * in        - input stream with an attached converter (`in->mlang`).
 * bytesRead - receives a byte count on every return path (see below).
 *
 * The function repeatedly converts the bytes collected so far and pulls
 * one more byte from the stream's source until the converter produces
 * one UTF-16 unit or a pair of surrogates.
 *
 * Returns the decoded character, or EndOfStream when the source runs
 * out of bytes or no character could be produced within TC_INBUFSIZE
 * bytes.  When a character is returned, or the source ends, *bytesRead
 * is the count reported by the converter; when the function gives up,
 * it is the number of bytes collected.
 */
int TY_(Win32MLangGetChar)(byte firstByte, StreamIn * in, uint * bytesRead)
{
    IMLangConvertCharset * p;
    TidyInputSource * source;
    CHAR inbuf[TC_INBUFSIZE] = { 0 };
    WCHAR outbuf[TC_OUTBUFSIZE] = { 0 };
    HRESULT hr = S_OK;
    size_t inbufsize = 0;

    assert( in != NULL );
    assert( bytesRead != NULL );
    assert( in->mlang != NULL );

    p = (IMLangConvertCharset *)in->mlang;
    source = &in->source;

    inbuf[inbufsize++] = (CHAR)firstByte;

    while(inbufsize < TC_INBUFSIZE)
    {
        UINT outbufsize = TC_OUTBUFSIZE;
        UINT readNow = (UINT)inbufsize; /* at most TC_INBUFSIZE, fits UINT */
        int nextByte = EndOfStream;

        hr = IMLangConvertCharset_DoConversionToUnicode(p, inbuf, &readNow, outbuf, &outbufsize);

        assert( hr == S_OK );
        assert( outbufsize <= 2 );

        if (outbufsize == 2)
        {
            /* U+10000-U+10FFFF are returned as a pair of surrogates */
            /* NOTE(review): the second UTF-16 unit is checked with
               IsHighSurrogate and the first with IsLowSurrogate; this
               presumably follows Tidy's own naming of the surrogate
               ranges - confirm against utf8.h. */
            tchar m = (tchar)outbuf[0];
            tchar n = (tchar)outbuf[1];
            assert( TY_(IsHighSurrogate)(n) && TY_(IsLowSurrogate)(m) );
            *bytesRead = readNow;
            return (int)TY_(CombineSurrogatePair)(n, m);
        }

        if (outbufsize == 1)
        {
            /* we found the character */
            /* set bytesRead and return */
            *bytesRead = readNow;
            return (int)outbuf[0];
        }

        /* we need more bytes */
        nextByte = source->getByte(source->sourceData);

        if (nextByte == EndOfStream)
        {
            /* todo: error message for broken stream? */

            *bytesRead = readNow;
            return EndOfStream;
        }

        inbuf[inbufsize++] = (CHAR)nextByte;
    }

    /* No full character found after reading TC_INBUFSIZE bytes, */
    /* give up to read this stream, it's obviously unreadable.   */

    /* todo: error message for broken stream? */

    /* report the bytes collected so the out-parameter is never left
       unset */
    *bytesRead = (uint)inbufsize;
    return EndOfStream;
}
|
705
|
+
|
706
|
+
/* Tests whether a character can be converted by the MLang converter of
 * an output stream.
 *
 * c   - character to test; must be non-zero and at most 0x10FFFF.
 * out - output stream with an attached converter (`out->mlang`).
 *
 * Characters above 0xFFFF are split into two UTF-16 units first.  The
 * conversion is requested with no output buffer, so nothing is written.
 *
 * Returns `yes` when the converter reports S_OK and `no` otherwise.
 */
Bool Win32MLangIsConvertible(tchar c, StreamOut * out)
{
    IMLangConvertCharset * p;
    HRESULT hr;
    WCHAR inbuf[2] = { 0 };
    UINT inbufsize = 0;

    assert( c != 0 );
    assert( c <= 0x10FFFF );
    assert( out != NULL );
    assert( out->mlang != NULL );

    if (c > 0xFFFF)
    {
        tchar high = 0;
        tchar low = 0;

        TY_(SplitSurrogatePair)(c, &low, &high);

        inbuf[inbufsize++] = (WCHAR)low;
        inbuf[inbufsize++] = (WCHAR)high;
    }
    else
        inbuf[inbufsize++] = (WCHAR)c;

    p = (IMLangConvertCharset *)out->mlang;
    hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, NULL, NULL);

    return hr == S_OK ? yes : no;
}
|
737
|
+
|
738
|
+
/* Converts one character with the MLang converter of an output stream
 * and writes the resulting bytes to the stream's sink.
 *
 * c            - character to write; must be non-zero and at most
 *                0x10FFFF.
 * out          - output stream with an attached converter.
 * bytesWritten - receives the number of bytes passed to the sink.
 *
 * Characters above 0xFFFF are split into two UTF-16 units before the
 * conversion.
 */
void Win32MLangPutChar(tchar c, StreamOut * out, uint * bytesWritten)
{
    IMLangConvertCharset * converter;
    TidyOutputSink * dest;
    WCHAR units[2] = { 0 };
    UINT unitCount = 0;
    CHAR encoded[TC_OUTBUFSIZE] = { 0 };
    UINT encodedLen = TC_OUTBUFSIZE;
    HRESULT hr = S_OK;
    uint pos;

    assert( c != 0 );
    assert( c <= 0x10FFFF );
    assert( bytesWritten != NULL );
    assert( out != NULL );
    assert( &out->sink != NULL );
    assert( out->mlang != NULL );

    converter = (IMLangConvertCharset *)out->mlang;
    dest = &out->sink;

    if (c <= 0xFFFF)
    {
        /* fits in a single UTF-16 unit */
        units[unitCount++] = (WCHAR)c;
    }
    else
    {
        tchar high = 0;
        tchar low = 0;

        TY_(SplitSurrogatePair)(c, &low, &high);

        units[unitCount++] = (WCHAR)low;
        units[unitCount++] = (WCHAR)high;
    }

    hr = IMLangConvertCharset_DoConversionFromUnicode(converter, units, &unitCount, encoded, &encodedLen);

    assert( hr == S_OK );
    assert( encodedLen > 0 );
    assert( unitCount == 1 || unitCount == 2 );

    /* hand the converted bytes to the sink one at a time */
    pos = 0;
    while (pos < encodedLen)
    {
        dest->putByte(dest->sinkData, (byte)(encoded[pos]));
        ++pos;
    }

    *bytesWritten = encodedLen;
}
|
785
|
+
|
786
|
+
#endif /* TIDY_WIN32_MLANG_SUPPORT */
|
787
|
+
|
788
|
+
/*
|
789
|
+
* local variables:
|
790
|
+
* mode: c
|
791
|
+
* indent-tabs-mode: nil
|
792
|
+
* c-basic-offset: 4
|
793
|
+
* eval: (c-set-offset 'substatement-open 0)
|
794
|
+
* end:
|
795
|
+
*/
|