tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
@@ -0,0 +1,795 @@
1
+ /* win32tc.c -- Interface to Win32 transcoding routines
2
+
3
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ $Id: win32tc.c,v 1.12 2008/08/09 11:55:27 hoehrmann Exp $
7
+ */
8
+
9
+ /* keep these here to keep file non-empty */
10
+ #include "tidy.h"
11
+ #include "forward.h"
12
+ #include "streamio.h"
13
+ #include "tmbstr.h"
14
+ #include "utf8.h"
15
+
16
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
17
+
18
+ #define VC_EXTRALEAN
19
+ #define CINTERFACE
20
+ #define COBJMACROS
21
+
22
+ #include <windows.h>
23
+ #include <mlang.h>
24
+
25
+ #undef COBJMACROS
26
+ #undef CINTERFACE
27
+ #undef VC_EXTRALEAN
28
+
29
+ /* maximum number of bytes for a single character */
30
+ #define TC_INBUFSIZE 16
31
+
32
+ /* maximum number of characters per byte sequence */
33
+ #define TC_OUTBUFSIZE 16
34
+
35
/* Create the MLang charset-conversion COM object and store its
   IMLangConvertCharset interface pointer in p.  The macro evaluates to the
   HRESULT returned by CoCreateInstance.  It deliberately carries no trailing
   semicolon, so it behaves like an ordinary expression (safe inside
   un-braced if/else bodies), and the argument is parenthesized. */
#define CreateMLangObject(p)            \
    CoCreateInstance(                   \
        &CLSID_CMLangConvertCharset,    \
        NULL,                           \
        CLSCTX_ALL,                     \
        &IID_IMLangConvertCharset,      \
        (VOID **)&(p))
42
+
43
+
44
+ /* Character Set to Microsoft Windows Codepage Identifier map, */
45
+ /* from <rotor/sscli/clr/src/classlibnative/nls/encodingdata.cpp>. */
46
+
47
+ /* note: the 'safe' field indicates whether this encoding can be */
48
+ /* read/written character-by-character; this does not apply to */
49
+ /* various stateful encodings such as ISO-2022 or UTF-7, these */
50
+ /* must be read/written as a complete stream. It is possible that */
51
+ /* some 'unsafe' encodings are marked as 'safe'. */
52
+
53
+ /* todo: cleanup; Tidy should use only a single mapping table to */
54
+ /* circumvent unsupported aliases in other transcoding libraries, */
55
+ /* enable reverse lookup of encoding names and ease maintenance. */
56
+
57
+ static struct _nameWinCPMap
58
+ {
59
+ tmbstr name;
60
+ uint wincp;
61
+ Bool safe;
62
+ } const NameWinCPMap[] = {
63
+ { "cp037", 37, yes },
64
+ { "csibm037", 37, yes },
65
+ { "ebcdic-cp-ca", 37, yes },
66
+ { "ebcdic-cp-nl", 37, yes },
67
+ { "ebcdic-cp-us", 37, yes },
68
+ { "ebcdic-cp-wt", 37, yes },
69
+ { "ibm037", 37, yes },
70
+ { "cp437", 437, yes },
71
+ { "cspc8codepage437", 437, yes },
72
+ { "ibm437", 437, yes },
73
+ { "cp500", 500, yes },
74
+ { "csibm500", 500, yes },
75
+ { "ebcdic-cp-be", 500, yes },
76
+ { "ebcdic-cp-ch", 500, yes },
77
+ { "ibm500", 500, yes },
78
+ { "asmo-708", 708, yes },
79
+ { "dos-720", 720, yes },
80
+ { "ibm737", 737, yes },
81
+ { "ibm775", 775, yes },
82
+ { "cp850", 850, yes },
83
+ { "ibm850", 850, yes },
84
+ { "cp852", 852, yes },
85
+ { "ibm852", 852, yes },
86
+ { "cp855", 855, yes },
87
+ { "ibm855", 855, yes },
88
+ { "cp857", 857, yes },
89
+ { "ibm857", 857, yes },
90
+ { "ccsid00858", 858, yes },
91
+ { "cp00858", 858, yes },
92
+ { "cp858", 858, yes },
93
+ { "ibm00858", 858, yes },
94
+ { "pc-multilingual-850+euro", 858, yes },
95
+ { "cp860", 860, yes },
96
+ { "ibm860", 860, yes },
97
+ { "cp861", 861, yes },
98
+ { "ibm861", 861, yes },
99
+ { "cp862", 862, yes },
100
+ { "dos-862", 862, yes },
101
+ { "ibm862", 862, yes },
102
+ { "cp863", 863, yes },
103
+ { "ibm863", 863, yes },
104
+ { "cp864", 864, yes },
105
+ { "ibm864", 864, yes },
106
+ { "cp865", 865, yes },
107
+ { "ibm865", 865, yes },
108
+ { "cp866", 866, yes },
109
+ { "ibm866", 866, yes },
110
+ { "cp869", 869, yes },
111
+ { "ibm869", 869, yes },
112
+ { "cp870", 870, yes },
113
+ { "csibm870", 870, yes },
114
+ { "ebcdic-cp-roece", 870, yes },
115
+ { "ebcdic-cp-yu", 870, yes },
116
+ { "ibm870", 870, yes },
117
+ { "dos-874", 874, yes },
118
+ { "iso-8859-11", 874, yes },
119
+ { "tis-620", 874, yes },
120
+ { "windows-874", 874, yes },
121
+ { "cp875", 875, yes },
122
+ { "csshiftjis", 932, yes },
123
+ { "cswindows31j", 932, yes },
124
+ { "ms_kanji", 932, yes },
125
+ { "shift-jis", 932, yes },
126
+ { "shift_jis", 932, yes },
127
+ { "sjis", 932, yes },
128
+ { "x-ms-cp932", 932, yes },
129
+ { "x-sjis", 932, yes },
130
+ { "chinese", 936, yes },
131
+ { "cn-gb", 936, yes },
132
+ { "csgb2312", 936, yes },
133
+ { "csgb231280", 936, yes },
134
+ { "csiso58gb231280", 936, yes },
135
+ { "gb2312", 936, yes },
136
+ { "gb2312-80", 936, yes },
137
+ { "gb231280", 936, yes },
138
+ { "gb_2312-80", 936, yes },
139
+ { "gbk", 936, yes },
140
+ { "iso-ir-58", 936, yes },
141
+ { "csksc56011987", 949, yes },
142
+ { "iso-ir-149", 949, yes },
143
+ { "korean", 949, yes },
144
+ { "ks-c-5601", 949, yes },
145
+ { "ks-c5601", 949, yes },
146
+ { "ks_c_5601", 949, yes },
147
+ { "ks_c_5601-1987", 949, yes },
148
+ { "ks_c_5601-1989", 949, yes },
149
+ { "ks_c_5601_1987", 949, yes },
150
+ { "ksc5601", 949, yes },
151
+ { "ksc_5601", 949, yes },
152
+ { "big5", 950, yes },
153
+ { "big5-hkscs", 950, yes },
154
+ { "cn-big5", 950, yes },
155
+ { "csbig5", 950, yes },
156
+ { "x-x-big5", 950, yes },
157
+ { "cp1026", 1026, yes },
158
+ { "csibm1026", 1026, yes },
159
+ { "ibm1026", 1026, yes },
160
+ { "ibm01047", 1047, yes },
161
+ { "ccsid01140", 1140, yes },
162
+ { "cp01140", 1140, yes },
163
+ { "ebcdic-us-37+euro", 1140, yes },
164
+ { "ibm01140", 1140, yes },
165
+ { "ccsid01141", 1141, yes },
166
+ { "cp01141", 1141, yes },
167
+ { "ebcdic-de-273+euro", 1141, yes },
168
+ { "ibm01141", 1141, yes },
169
+ { "ccsid01142", 1142, yes },
170
+ { "cp01142", 1142, yes },
171
+ { "ebcdic-dk-277+euro", 1142, yes },
172
+ { "ebcdic-no-277+euro", 1142, yes },
173
+ { "ibm01142", 1142, yes },
174
+ { "ccsid01143", 1143, yes },
175
+ { "cp01143", 1143, yes },
176
+ { "ebcdic-fi-278+euro", 1143, yes },
177
+ { "ebcdic-se-278+euro", 1143, yes },
178
+ { "ibm01143", 1143, yes },
179
+ { "ccsid01144", 1144, yes },
180
+ { "cp01144", 1144, yes },
181
+ { "ebcdic-it-280+euro", 1144, yes },
182
+ { "ibm01144", 1144, yes },
183
+ { "ccsid01145", 1145, yes },
184
+ { "cp01145", 1145, yes },
185
+ { "ebcdic-es-284+euro", 1145, yes },
186
+ { "ibm01145", 1145, yes },
187
+ { "ccsid01146", 1146, yes },
188
+ { "cp01146", 1146, yes },
189
+ { "ebcdic-gb-285+euro", 1146, yes },
190
+ { "ibm01146", 1146, yes },
191
+ { "ccsid01147", 1147, yes },
192
+ { "cp01147", 1147, yes },
193
+ { "ebcdic-fr-297+euro", 1147, yes },
194
+ { "ibm01147", 1147, yes },
195
+ { "ccsid01148", 1148, yes },
196
+ { "cp01148", 1148, yes },
197
+ { "ebcdic-international-500+euro", 1148, yes },
198
+ { "ibm01148", 1148, yes },
199
+ { "ccsid01149", 1149, yes },
200
+ { "cp01149", 1149, yes },
201
+ { "ebcdic-is-871+euro", 1149, yes },
202
+ { "ibm01149", 1149, yes },
203
+ { "iso-10646-ucs-2", 1200, yes },
204
+ { "ucs-2", 1200, yes },
205
+ { "unicode", 1200, yes },
206
+ { "utf-16", 1200, yes },
207
+ { "utf-16le", 1200, yes },
208
+ { "unicodefffe", 1201, yes },
209
+ { "utf-16be", 1201, yes },
210
+ { "windows-1250", 1250, yes },
211
+ { "x-cp1250", 1250, yes },
212
+ { "windows-1251", 1251, yes },
213
+ { "x-cp1251", 1251, yes },
214
+ { "windows-1252", 1252, yes },
215
+ { "x-ansi", 1252, yes },
216
+ { "windows-1253", 1253, yes },
217
+ { "windows-1254", 1254, yes },
218
+ { "windows-1255", 1255, yes },
219
+ { "cp1256", 1256, yes },
220
+ { "windows-1256", 1256, yes },
221
+ { "windows-1257", 1257, yes },
222
+ { "windows-1258", 1258, yes },
223
+ { "johab", 1361, yes },
224
+ { "macintosh", 10000, yes },
225
+ { "x-mac-japanese", 10001, yes },
226
+ { "x-mac-chinesetrad", 10002, yes },
227
+ { "x-mac-korean", 10003, yes },
228
+ { "x-mac-arabic", 10004, yes },
229
+ { "x-mac-hebrew", 10005, yes },
230
+ { "x-mac-greek", 10006, yes },
231
+ { "x-mac-cyrillic", 10007, yes },
232
+ { "x-mac-chinesesimp", 10008, yes },
233
+ { "x-mac-romanian", 10010, yes },
234
+ { "x-mac-ukrainian", 10017, yes },
235
+ { "x-mac-thai", 10021, yes },
236
+ { "x-mac-ce", 10029, yes },
237
+ { "x-mac-icelandic", 10079, yes },
238
+ { "x-mac-turkish", 10081, yes },
239
+ { "x-mac-croatian", 10082, yes },
240
+ { "x-chinese-cns", 20000, yes },
241
+ { "x-cp20001", 20001, yes },
242
+ { "x-chinese-eten", 20002, yes },
243
+ { "x-cp20003", 20003, yes },
244
+ { "x-cp20004", 20004, yes },
245
+ { "x-cp20005", 20005, yes },
246
+ { "irv", 20105, yes },
247
+ { "x-ia5", 20105, yes },
248
+ { "din_66003", 20106, yes },
249
+ { "german", 20106, yes },
250
+ { "x-ia5-german", 20106, yes },
251
+ { "sen_850200_b", 20107, yes },
252
+ { "swedish", 20107, yes },
253
+ { "x-ia5-swedish", 20107, yes },
254
+ { "norwegian", 20108, yes },
255
+ { "ns_4551-1", 20108, yes },
256
+ { "x-ia5-norwegian", 20108, yes },
257
+ { "ansi_x3.4-1968", 20127, yes },
258
+ { "ansi_x3.4-1986", 20127, yes },
259
+ { "ascii", 20127, yes },
260
+ { "cp367", 20127, yes },
261
+ { "csascii", 20127, yes },
262
+ { "ibm367", 20127, yes },
263
+ { "iso-ir-6", 20127, yes },
264
+ { "iso646-us", 20127, yes },
265
+ { "iso_646.irv:1991", 20127, yes },
266
+ { "us", 20127, yes },
267
+ { "us-ascii", 20127, yes },
268
+ { "x-cp20261", 20261, yes },
269
+ { "x-cp20269", 20269, yes },
270
+ { "cp273", 20273, yes },
271
+ { "csibm273", 20273, yes },
272
+ { "ibm273", 20273, yes },
273
+ { "csibm277", 20277, yes },
274
+ { "ebcdic-cp-dk", 20277, yes },
275
+ { "ebcdic-cp-no", 20277, yes },
276
+ { "ibm277", 20277, yes },
277
+ { "cp278", 20278, yes },
278
+ { "csibm278", 20278, yes },
279
+ { "ebcdic-cp-fi", 20278, yes },
280
+ { "ebcdic-cp-se", 20278, yes },
281
+ { "ibm278", 20278, yes },
282
+ { "cp280", 20280, yes },
283
+ { "csibm280", 20280, yes },
284
+ { "ebcdic-cp-it", 20280, yes },
285
+ { "ibm280", 20280, yes },
286
+ { "cp284", 20284, yes },
287
+ { "csibm284", 20284, yes },
288
+ { "ebcdic-cp-es", 20284, yes },
289
+ { "ibm284", 20284, yes },
290
+ { "cp285", 20285, yes },
291
+ { "csibm285", 20285, yes },
292
+ { "ebcdic-cp-gb", 20285, yes },
293
+ { "ibm285", 20285, yes },
294
+ { "cp290", 20290, yes },
295
+ { "csibm290", 20290, yes },
296
+ { "ebcdic-jp-kana", 20290, yes },
297
+ { "ibm290", 20290, yes },
298
+ { "cp297", 20297, yes },
299
+ { "csibm297", 20297, yes },
300
+ { "ebcdic-cp-fr", 20297, yes },
301
+ { "ibm297", 20297, yes },
302
+ { "cp420", 20420, yes },
303
+ { "csibm420", 20420, yes },
304
+ { "ebcdic-cp-ar1", 20420, yes },
305
+ { "ibm420", 20420, yes },
306
+ { "cp423", 20423, yes },
307
+ { "csibm423", 20423, yes },
308
+ { "ebcdic-cp-gr", 20423, yes },
309
+ { "ibm423", 20423, yes },
310
+ { "cp424", 20424, yes },
311
+ { "csibm424", 20424, yes },
312
+ { "ebcdic-cp-he", 20424, yes },
313
+ { "ibm424", 20424, yes },
314
+ { "x-ebcdic-koreanextended", 20833, yes },
315
+ { "csibmthai", 20838, yes },
316
+ { "ibm-thai", 20838, yes },
317
+ { "cskoi8r", 20866, yes },
318
+ { "koi", 20866, yes },
319
+ { "koi8", 20866, yes },
320
+ { "koi8-r", 20866, yes },
321
+ { "koi8r", 20866, yes },
322
+ { "cp871", 20871, yes },
323
+ { "csibm871", 20871, yes },
324
+ { "ebcdic-cp-is", 20871, yes },
325
+ { "ibm871", 20871, yes },
326
+ { "cp880", 20880, yes },
327
+ { "csibm880", 20880, yes },
328
+ { "ebcdic-cyrillic", 20880, yes },
329
+ { "ibm880", 20880, yes },
330
+ { "cp905", 20905, yes },
331
+ { "csibm905", 20905, yes },
332
+ { "ebcdic-cp-tr", 20905, yes },
333
+ { "ibm905", 20905, yes },
334
+ { "ccsid00924", 20924, yes },
335
+ { "cp00924", 20924, yes },
336
+ { "ebcdic-latin9--euro", 20924, yes },
337
+ { "ibm00924", 20924, yes },
338
+ { "x-cp20936", 20936, yes },
339
+ { "x-cp20949", 20949, yes },
340
+ { "cp1025", 21025, yes },
341
+ { "x-cp21027", 21027, yes },
342
+ { "koi8-ru", 21866, yes },
343
+ { "koi8-u", 21866, yes },
344
+ { "cp819", 28591, yes },
345
+ { "csisolatin1", 28591, yes },
346
+ { "ibm819", 28591, yes },
347
+ { "iso-8859-1", 28591, yes },
348
+ { "iso-ir-100", 28591, yes },
349
+ { "iso8859-1", 28591, yes },
350
+ { "iso_8859-1", 28591, yes },
351
+ { "iso_8859-1:1987", 28591, yes },
352
+ { "l1", 28591, yes },
353
+ { "latin1", 28591, yes },
354
+ { "csisolatin2", 28592, yes },
355
+ { "iso-8859-2", 28592, yes },
356
+ { "iso-ir-101", 28592, yes },
357
+ { "iso8859-2", 28592, yes },
358
+ { "iso_8859-2", 28592, yes },
359
+ { "iso_8859-2:1987", 28592, yes },
360
+ { "l2", 28592, yes },
361
+ { "latin2", 28592, yes },
362
+ { "csisolatin3", 28593, yes },
363
+ { "iso-8859-3", 28593, yes },
364
+ { "iso-ir-109", 28593, yes },
365
+ { "iso_8859-3", 28593, yes },
366
+ { "iso_8859-3:1988", 28593, yes },
367
+ { "l3", 28593, yes },
368
+ { "latin3", 28593, yes },
369
+ { "csisolatin4", 28594, yes },
370
+ { "iso-8859-4", 28594, yes },
371
+ { "iso-ir-110", 28594, yes },
372
+ { "iso_8859-4", 28594, yes },
373
+ { "iso_8859-4:1988", 28594, yes },
374
+ { "l4", 28594, yes },
375
+ { "latin4", 28594, yes },
376
+ { "csisolatincyrillic", 28595, yes },
377
+ { "cyrillic", 28595, yes },
378
+ { "iso-8859-5", 28595, yes },
379
+ { "iso-ir-144", 28595, yes },
380
+ { "iso_8859-5", 28595, yes },
381
+ { "iso_8859-5:1988", 28595, yes },
382
+ { "arabic", 28596, yes },
383
+ { "csisolatinarabic", 28596, yes },
384
+ { "ecma-114", 28596, yes },
385
+ { "iso-8859-6", 28596, yes },
386
+ { "iso-ir-127", 28596, yes },
387
+ { "iso_8859-6", 28596, yes },
388
+ { "iso_8859-6:1987", 28596, yes },
389
+ { "csisolatingreek", 28597, yes },
390
+ { "ecma-118", 28597, yes },
391
+ { "elot_928", 28597, yes },
392
+ { "greek", 28597, yes },
393
+ { "greek8", 28597, yes },
394
+ { "iso-8859-7", 28597, yes },
395
+ { "iso-ir-126", 28597, yes },
396
+ { "iso_8859-7", 28597, yes },
397
+ { "iso_8859-7:1987", 28597, yes },
398
+ { "csisolatinhebrew", 28598, yes },
399
+ { "hebrew", 28598, yes },
400
+ { "iso-8859-8", 28598, yes },
401
+ { "iso-ir-138", 28598, yes },
402
+ { "iso_8859-8", 28598, yes },
403
+ { "iso_8859-8:1988", 28598, yes },
404
+ { "logical", 28598, yes },
405
+ { "visual", 28598, yes },
406
+ { "csisolatin5", 28599, yes },
407
+ { "iso-8859-9", 28599, yes },
408
+ { "iso-ir-148", 28599, yes },
409
+ { "iso_8859-9", 28599, yes },
410
+ { "iso_8859-9:1989", 28599, yes },
411
+ { "l5", 28599, yes },
412
+ { "latin5", 28599, yes },
413
+ { "iso-8859-13", 28603, yes },
414
+ { "csisolatin9", 28605, yes },
415
+ { "iso-8859-15", 28605, yes },
416
+ { "iso_8859-15", 28605, yes },
417
+ { "l9", 28605, yes },
418
+ { "latin9", 28605, yes },
419
+ { "x-europa", 29001, yes },
420
+ { "iso-8859-8-i", 38598, yes },
421
+ { "iso-2022-jp", 50220, no },
422
+ { "csiso2022jp", 50221, no },
423
+ { "csiso2022kr", 50225, no },
424
+ { "iso-2022-kr", 50225, no },
425
+ { "iso-2022-kr-7", 50225, no },
426
+ { "iso-2022-kr-7bit", 50225, no },
427
+ { "cp50227", 50227, no },
428
+ { "x-cp50227", 50227, no },
429
+ { "cp930", 50930, yes },
430
+ { "x-ebcdic-japaneseanduscanada", 50931, yes },
431
+ { "cp933", 50933, yes },
432
+ { "cp935", 50935, yes },
433
+ { "cp937", 50937, yes },
434
+ { "cp939", 50939, yes },
435
+ { "cseucpkdfmtjapanese", 51932, yes },
436
+ { "euc-jp", 51932, yes },
437
+ { "extended_unix_code_packed_format_for_japanese", 51932, yes },
438
+ { "iso-2022-jpeuc", 51932, yes },
439
+ { "x-euc", 51932, yes },
440
+ { "x-euc-jp", 51932, yes },
441
+ { "euc-cn", 51936, yes },
442
+ { "x-euc-cn", 51936, yes },
443
+ { "cseuckr", 51949, yes },
444
+ { "euc-kr", 51949, yes },
445
+ { "iso-2022-kr-8", 51949, yes },
446
+ { "iso-2022-kr-8bit", 51949, yes },
447
+ { "hz-gb-2312", 52936, no },
448
+ { "gb18030", 54936, yes },
449
+ { "x-iscii-de", 57002, yes },
450
+ { "x-iscii-be", 57003, yes },
451
+ { "x-iscii-ta", 57004, yes },
452
+ { "x-iscii-te", 57005, yes },
453
+ { "x-iscii-as", 57006, yes },
454
+ { "x-iscii-or", 57007, yes },
455
+ { "x-iscii-ka", 57008, yes },
456
+ { "x-iscii-ma", 57009, yes },
457
+ { "x-iscii-gu", 57010, yes },
458
+ { "x-iscii-pa", 57011, yes },
459
+ { "csunicode11utf7", 65000, no },
460
+ { "unicode-1-1-utf-7", 65000, no },
461
+ { "unicode-2-0-utf-7", 65000, no },
462
+ { "utf-7", 65000, no },
463
+ { "x-unicode-1-1-utf-7", 65000, no },
464
+ { "x-unicode-2-0-utf-7", 65000, no },
465
+ { "unicode-1-1-utf-8", 65001, yes },
466
+ { "unicode-2-0-utf-8", 65001, yes },
467
+ { "utf-8", 65001, yes },
468
+ { "x-unicode-1-1-utf-8", 65001, yes },
469
+ { "x-unicode-2-0-utf-8", 65001, yes },
470
+
471
+ /* final entry */
472
+ { NULL, 0, no }
473
+ };
474
+
475
+ uint TY_(Win32MLangGetCPFromName)(TidyAllocator *allocator, ctmbstr encoding)
476
+ {
477
+ uint i;
478
+ tmbstr enc;
479
+
480
+ /* ensure name is in lower case */
481
+ enc = TY_(tmbstrdup)(allocator,encoding);
482
+ enc = TY_(tmbstrtolower)(enc);
483
+
484
+ for (i = 0; NameWinCPMap[i].name; ++i)
485
+ {
486
+ if (TY_(tmbstrcmp)(NameWinCPMap[i].name, enc) == 0)
487
+ {
488
+ IMLangConvertCharset * p = NULL;
489
+ uint wincp = NameWinCPMap[i].wincp;
490
+ HRESULT hr;
491
+
492
+ TidyFree(allocator, enc);
493
+
494
+ /* currently no support for unsafe encodings */
495
+ if (!NameWinCPMap[i].safe)
496
+ return 0;
497
+
498
+ /* hack for config.c */
499
+ CoInitialize(NULL);
500
+ hr = CreateMLangObject(p);
501
+
502
+ if (hr != S_OK || !p)
503
+ {
504
+ wincp = 0;
505
+ }
506
+ else
507
+ {
508
+ hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
509
+
510
+ if (hr != S_OK)
511
+ wincp = 0;
512
+
513
+ IMLangConvertCharset_Release(p);
514
+ p = NULL;
515
+ }
516
+
517
+ CoUninitialize();
518
+
519
+ return wincp;
520
+ }
521
+ }
522
+
523
+ TidyFree(allocator, enc);
524
+ return 0;
525
+ }
526
+
527
+ Bool TY_(Win32MLangInitInputTranscoder)(StreamIn * in, uint wincp)
528
+ {
529
+ IMLangConvertCharset * p = NULL;
530
+ HRESULT hr;
531
+
532
+ assert( in != NULL );
533
+
534
+ CoInitialize(NULL);
535
+
536
+ if (wincp == 0)
537
+ {
538
+ /* no codepage found for this encoding */
539
+ return no;
540
+ }
541
+
542
+ hr = CreateMLangObject(p);
543
+
544
+ if (hr != S_OK || !p)
545
+ {
546
+ /* MLang not supported */
547
+ return no;
548
+ }
549
+
550
+ hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
551
+
552
+ if (hr != S_OK)
553
+ {
554
+ /* encoding not supported, insufficient memory, etc. */
555
+ return no;
556
+ }
557
+
558
+ in->mlang = p;
559
+
560
+ return yes;
561
+ }
562
+
563
+ void TY_(Win32MLangUninitInputTranscoder)(StreamIn * in)
564
+ {
565
+ IMLangConvertCharset * p;
566
+
567
+ assert( in != NULL );
568
+
569
+ p = (IMLangConvertCharset *)in->mlang;
570
+ if (p)
571
+ {
572
+ IMLangConvertCharset_Release(p);
573
+ p = NULL;
574
+ in->mlang = NULL;
575
+ }
576
+
577
+ CoUninitialize();
578
+ }
579
+
580
+ #if 0
581
+ Bool Win32MLangInitOutputTranscoder(TidyAllocator *allocator, StreamOut * out, tmbstr encoding)
582
+ {
583
+ IMLangConvertCharset * p = NULL;
584
+ HRESULT hr;
585
+ uint wincp;
586
+
587
+ assert( out != NULL );
588
+
589
+ CoInitialize(NULL);
590
+
591
+ wincp = TY_(Win32MLangGetCPFromName)(allocator, encoding);
592
+ if (wincp == 0)
593
+ {
594
+ /* no codepage found for this encoding */
595
+ return no;
596
+ }
597
+
598
+ hr = CreateMLangObject(p);
599
+
600
+ if (hr != S_OK || !p)
601
+ {
602
+ /* MLang not supported */
603
+ return no;
604
+ }
605
+
606
+ IMLangConvertCharset_Initialize(p, 1200, wincp, MLCONVCHARF_NOBESTFITCHARS);
607
+
608
+ if (hr != S_OK)
609
+ {
610
+ /* encoding not supported, insufficient memory, etc. */
611
+ return no;
612
+ }
613
+
614
+ out->mlang = p;
615
+
616
+ return yes;
617
+ }
618
+
619
+ void Win32MLangUninitOutputTranscoder(StreamOut * out)
620
+ {
621
+ IMLangConvertCharset * p;
622
+
623
+ assert( out != NULL );
624
+
625
+ p = (IMLangConvertCharset *)out->mlang;
626
+ if (p)
627
+ {
628
+ IMLangConvertCharset_Release(p);
629
+ p = NULL;
630
+ out->mlang = NULL;
631
+ }
632
+
633
+ CoUninitialize();
634
+ }
635
+ #endif
636
+
637
+ int TY_(Win32MLangGetChar)(byte firstByte, StreamIn * in, uint * bytesRead)
638
+ {
639
+ IMLangConvertCharset * p;
640
+ TidyInputSource * source;
641
+ CHAR inbuf[TC_INBUFSIZE] = { 0 };
642
+ WCHAR outbuf[TC_OUTBUFSIZE] = { 0 };
643
+ HRESULT hr = S_OK;
644
+ size_t inbufsize = 0;
645
+
646
+ assert( in != NULL );
647
+ assert( &in->source != NULL );
648
+ assert( bytesRead != NULL );
649
+ assert( in->mlang != NULL );
650
+
651
+ p = (IMLangConvertCharset *)in->mlang;
652
+ source = &in->source;
653
+
654
+ inbuf[inbufsize++] = (CHAR)firstByte;
655
+
656
+ while(inbufsize < TC_INBUFSIZE)
657
+ {
658
+ UINT outbufsize = TC_OUTBUFSIZE;
659
+ UINT readNow = inbufsize;
660
+ int nextByte = EndOfStream;
661
+
662
+ hr = IMLangConvertCharset_DoConversionToUnicode(p, inbuf, &readNow, outbuf, &outbufsize);
663
+
664
+ assert( hr == S_OK );
665
+ assert( outbufsize <= 2 );
666
+
667
+ if (outbufsize == 2)
668
+ {
669
+ /* U+10000-U+10FFFF are returned as a pair of surrogates */
670
+ tchar m = (tchar)outbuf[0];
671
+ tchar n = (tchar)outbuf[1];
672
+ assert( TY_(IsHighSurrogate)(n) && TY_(IsLowSurrogate)(m) );
673
+ *bytesRead = readNow;
674
+ return (int)TY_(CombineSurrogatePair)(n, m);
675
+ }
676
+
677
+ if (outbufsize == 1)
678
+ {
679
+ /* we found the character */
680
+ /* set bytesRead and return */
681
+ *bytesRead = readNow;
682
+ return (int)outbuf[0];
683
+ }
684
+
685
+ /* we need more bytes */
686
+ nextByte = source->getByte(source->sourceData);
687
+
688
+ if (nextByte == EndOfStream)
689
+ {
690
+ /* todo: error message for broken stream? */
691
+
692
+ *bytesRead = readNow;
693
+ return EndOfStream;
694
+ }
695
+
696
+ inbuf[inbufsize++] = (CHAR)nextByte;
697
+ }
698
+
699
+ /* No full character found after reading TC_INBUFSIZE bytes, */
700
+ /* give up to read this stream, it's obviously unreadable. */
701
+
702
+ /* todo: error message for broken stream? */
703
+ return EndOfStream;
704
+ }
705
+
706
+ Bool Win32MLangIsConvertible(tchar c, StreamOut * out)
707
+ {
708
+ IMLangConvertCharset * p;
709
+ UINT i = 1;
710
+ HRESULT hr;
711
+ WCHAR inbuf[2] = { 0 };
712
+ UINT inbufsize = 0;
713
+
714
+ assert( c != 0 );
715
+ assert( c <= 0x10FFFF );
716
+ assert( out != NULL );
717
+ assert( out->mlang != NULL );
718
+
719
+ if (c > 0xFFFF)
720
+ {
721
+ tchar high = 0;
722
+ tchar low = 0;
723
+
724
+ TY_(SplitSurrogatePair)(c, &low, &high);
725
+
726
+ inbuf[inbufsize++] = (WCHAR)low;
727
+ inbuf[inbufsize++] = (WCHAR)high;
728
+ }
729
+ else
730
+ inbuf[inbufsize++] = (WCHAR)c;
731
+
732
+ p = (IMLangConvertCharset *)out->mlang;
733
+ hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, NULL, NULL);
734
+
735
+ return hr == S_OK ? yes : no;
736
+ }
737
+
738
+ void Win32MLangPutChar(tchar c, StreamOut * out, uint * bytesWritten)
739
+ {
740
+ IMLangConvertCharset * p;
741
+ TidyOutputSink * sink;
742
+ CHAR outbuf[TC_OUTBUFSIZE] = { 0 };
743
+ UINT outbufsize = TC_OUTBUFSIZE;
744
+ HRESULT hr = S_OK;
745
+ WCHAR inbuf[2] = { 0 };
746
+ UINT inbufsize = 0;
747
+ uint i;
748
+
749
+ assert( c != 0 );
750
+ assert( c <= 0x10FFFF );
751
+ assert( bytesWritten != NULL );
752
+ assert( out != NULL );
753
+ assert( &out->sink != NULL );
754
+ assert( out->mlang != NULL );
755
+
756
+ p = (IMLangConvertCharset *)out->mlang;
757
+ sink = &out->sink;
758
+
759
+ if (c > 0xFFFF)
760
+ {
761
+ tchar high = 0;
762
+ tchar low = 0;
763
+
764
+ TY_(SplitSurrogatePair)(c, &low, &high);
765
+
766
+ inbuf[inbufsize++] = (WCHAR)low;
767
+ inbuf[inbufsize++] = (WCHAR)high;
768
+ }
769
+ else
770
+ inbuf[inbufsize++] = (WCHAR)c;
771
+
772
+ hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, outbuf, &outbufsize);
773
+
774
+ assert( hr == S_OK );
775
+ assert( outbufsize > 0 );
776
+ assert( inbufsize == 1 || inbufsize == 2 );
777
+
778
+ for (i = 0; i < outbufsize; ++i)
779
+ sink->putByte(sink->sinkData, (byte)(outbuf[i]));
780
+
781
+ *bytesWritten = outbufsize;
782
+
783
+ return;
784
+ }
785
+
786
+ #endif /* TIDY_WIN32_MLANG_SUPPORT */
787
+
788
+ /*
789
+ * local variables:
790
+ * mode: c
791
+ * indent-tabs-mode: nil
792
+ * c-basic-offset: 4
793
+ * eval: (c-set-offset 'substatement-open 0)
794
+ * end:
795
+ */