tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/utf8.c ADDED
@@ -0,0 +1,539 @@
1
+ /* utf8.c -- convert characters to/from UTF-8
2
+
3
+ (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: arnaud02 $
9
+ $Date: 2007/05/30 16:47:31 $
10
+ $Revision: 1.10 $
11
+
12
+ Uses public interfaces to abstract input source and output
13
+ sink, which may be user supplied or either FILE* or memory
14
+ based Tidy implementations. Encoding support is uniform
15
+ regardless of I/O mechanism.
16
+
17
+ Note, UTF-8 encoding, by itself, does not affect the actual
18
+ "codepoints" of the underlying character encoding. In the
19
+ cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
20
+ refer to ISO-10646 "codepoints". For anything else, they
21
+ refer to some other "codepoint" set.
22
+
23
+ Put another way, UTF-8 is a variable length method to
24
+ represent any non-negative integer value. The glyph
25
+ that an integer value represents is unchanged and defined
26
+ externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
27
+ Latin2-9, and so on).
28
+
29
+ Put still another way, UTF-8 is more of a _transfer_ encoding
30
+ than a _character_ encoding, per se.
31
+ */
32
+
33
+ #include "tidy.h"
34
+ #include "forward.h"
35
+ #include "utf8.h"
36
+
37
+ /*
38
+ UTF-8 encoding/decoding functions
39
+ Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence
40
+
41
+ Also see below for UTF-16 encoding/decoding functions
42
+
43
+ References :
44
+
45
+ 1) UCS Transformation Format 8 (UTF-8):
46
+ ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
47
+ <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
48
+ <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
49
+
50
+ Table 4 - Mapping from UCS-4 to UTF-8
51
+
52
+ 2) Unicode standards:
53
+ <http://www.unicode.org/unicode/standard/standard.html>
54
+
55
+ 3) Legal UTF-8 byte sequences:
56
+ <http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
57
+
58
+ Code point 1st byte 2nd byte 3rd byte 4th byte
59
+ ---------- -------- -------- -------- --------
60
+ U+0000..U+007F 00..7F
61
+ U+0080..U+07FF C2..DF 80..BF
62
+ U+0800..U+0FFF E0 A0..BF 80..BF
63
+ U+1000..U+FFFF E1..EF 80..BF 80..BF
64
+ U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
65
+ U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
66
+ U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
67
+
68
+ The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
69
+ allows for the use of five- and six-byte sequences to encode
70
+ characters that are outside the range of the Unicode character
71
+ set; those five- and six-byte sequences are illegal for the use
72
+ of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
73
+ does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
74
+ (but it does allow other noncharacters).
75
+
76
+ 4) RFC 2279: UTF-8, a transformation format of ISO 10646:
77
+ <http://www.ietf.org/rfc/rfc2279.txt>
78
+
79
+ 5) UTF-8 and Unicode FAQ:
80
+ <http://www.cl.cam.ac.uk/~mgk25/unicode.html>
81
+
82
+ 6) Markus Kuhn's UTF-8 decoder stress test file:
83
+ <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
84
+
85
+ 7) UTF-8 Demo:
86
+ <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
87
+
88
+ 8) UTF-8 Sampler:
89
+ <http://www.columbia.edu/kermit/utf8.html>
90
+
91
+ 9) Transformation Format for 16 Planes of Group 00 (UTF-16):
92
+ ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
93
+ <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
94
+ <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
95
+
96
+ 10) RFC 2781: UTF-16, an encoding of ISO 10646:
97
+ <http://www.ietf.org/rfc/rfc2781.txt>
98
+
99
+ 11) UTF-16 invalid surrogate pairs:
100
+ <http://www.unicode.org/unicode/faq/utf_bom.html#16>
101
+
102
+ UTF-16 UTF-8 UCS-4
103
+ D83F DFF* F0 9F BF B* 0001FFF*
104
+ D87F DFF* F0 AF BF B* 0002FFF*
105
+ D8BF DFF* F0 BF BF B* 0003FFF*
106
+ D8FF DFF* F1 8F BF B* 0004FFF*
107
+ D93F DFF* F1 9F BF B* 0005FFF*
108
+ D97F DFF* F1 AF BF B* 0006FFF*
109
+ ...
110
+ DBBF DFF* F3 BF BF B* 000FFFF*
111
+ DBFF DFF* F4 8F BF B* 0010FFF*
112
+
113
+ * = E or F
114
+
115
+ 1010 A
116
+ 1011 B
117
+ 1100 C
118
+ 1101 D
119
+ 1110 E
120
+ 1111 F
121
+
122
+ */
123
+
124
+ #define kNumUTF8Sequences 7
125
+ #define kMaxUTF8Bytes 4
126
+
127
+ #define kUTF8ByteSwapNotAChar 0xFFFE
128
+ #define kUTF8NotAChar 0xFFFF
129
+
130
+ #define kMaxUTF8FromUCS4 0x10FFFF
131
+
132
+ #define kUTF16SurrogatesBegin 0x10000
133
+ #define kMaxUTF16FromUCS4 0x10FFFF
134
+
135
+ /* UTF-16 surrogate pair areas */
136
+ #define kUTF16LowSurrogateBegin 0xD800
137
+ #define kUTF16LowSurrogateEnd 0xDBFF
138
+ #define kUTF16HighSurrogateBegin 0xDC00
139
+ #define kUTF16HighSurrogateEnd 0xDFFF
140
+
141
+
142
+ /* offsets into validUTF8 table below */
143
+ static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
144
+ {
145
+ 0, /* 1 byte */
146
+ 1, /* 2 bytes */
147
+ 2, /* 3 bytes */
148
+ 4, /* 4 bytes */
149
+ kNumUTF8Sequences /* must be last */
150
+ };
151
+
152
+ static const struct validUTF8Sequence
153
+ {
154
+ uint lowChar;
155
+ uint highChar;
156
+ int numBytes;
157
+ byte validBytes[8];
158
+ } validUTF8[kNumUTF8Sequences] =
159
+ {
160
+ /* low high #bytes byte 1 byte 2 byte 3 byte 4 */
161
+ {0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
162
+ {0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
163
+ {0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
164
+ {0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
165
+ {0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
166
+ {0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
167
+ {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
168
+ };
169
+
170
+ int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
171
+ TidyInputSource* inp, int* count )
172
+ {
173
+ byte tempbuf[10];
174
+ byte *buf = &tempbuf[0];
175
+ uint ch = 0, n = 0;
176
+ int i, bytes = 0;
177
+ Bool hasError = no;
178
+
179
+ if ( successorBytes )
180
+ buf = (byte*) successorBytes;
181
+
182
+ /* special check if we have been passed an EOF char */
183
+ if ( firstByte == EndOfStream )
184
+ {
185
+ /* at present */
186
+ *c = firstByte;
187
+ *count = 1;
188
+ return 0;
189
+ }
190
+
191
+ ch = firstByte; /* first byte is passed in separately */
192
+
193
+ if (ch <= 0x7F) /* 0XXX XXXX one byte */
194
+ {
195
+ n = ch;
196
+ bytes = 1;
197
+ }
198
+ else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
199
+ {
200
+ n = ch & 31;
201
+ bytes = 2;
202
+ }
203
+ else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
204
+ {
205
+ n = ch & 15;
206
+ bytes = 3;
207
+ }
208
+ else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
209
+ {
210
+ n = ch & 7;
211
+ bytes = 4;
212
+ }
213
+ else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
214
+ {
215
+ n = ch & 3;
216
+ bytes = 5;
217
+ hasError = yes;
218
+ }
219
+ else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
220
+ {
221
+ n = ch & 1;
222
+ bytes = 6;
223
+ hasError = yes;
224
+ }
225
+ else
226
+ {
227
+ /* not a valid first byte of a UTF-8 sequence */
228
+ n = ch;
229
+ bytes = 1;
230
+ hasError = yes;
231
+ }
232
+
233
+ /* successor bytes should have the form 10XX XXXX */
234
+
235
+ /* If caller supplied buffer, use it. Else see if caller
236
+ ** supplied an input source, use that.
237
+ */
238
+ if ( successorBytes )
239
+ {
240
+ for ( i=0; i < bytes-1; ++i )
241
+ {
242
+ if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
243
+ {
244
+ hasError = yes;
245
+ bytes = i+1;
246
+ break;
247
+ }
248
+ n = (n << 6) | (buf[i] & 0x3F);
249
+ }
250
+ }
251
+ else if ( inp )
252
+ {
253
+ for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
254
+ {
255
+ int b = inp->getByte( inp->sourceData );
256
+ buf[i] = (tmbchar) b;
257
+
258
+ /* End of data or illegal successor byte value */
259
+ if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
260
+ {
261
+ hasError = yes;
262
+ bytes = i+1;
263
+ if ( b != EOF )
264
+ inp->ungetByte( inp->sourceData, buf[i] );
265
+ break;
266
+ }
267
+ n = (n << 6) | (buf[i] & 0x3F);
268
+ }
269
+ }
270
+ else if ( bytes > 1 )
271
+ {
272
+ hasError = yes;
273
+ bytes = 1;
274
+ }
275
+
276
+ if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
277
+ hasError = yes;
278
+
279
+ if (!hasError && (n > kMaxUTF8FromUCS4))
280
+ hasError = yes;
281
+
282
+ #if 0 /* Breaks Big5 D8 - DF */
283
+ if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
284
+ /* unpaired surrogates not allowed */
285
+ hasError = yes;
286
+ #endif
287
+
288
+ if (!hasError)
289
+ {
290
+ int lo, hi;
291
+
292
+ lo = offsetUTF8Sequences[bytes - 1];
293
+ hi = offsetUTF8Sequences[bytes] - 1;
294
+
295
+ /* check for overlong sequences */
296
+ if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
297
+ hasError = yes;
298
+ else
299
+ {
300
+ hasError = yes; /* assume error until proven otherwise */
301
+
302
+ for (i = lo; i <= hi; i++)
303
+ {
304
+ int tempCount;
305
+ byte theByte;
306
+
307
+ for (tempCount = 0; tempCount < bytes; tempCount++)
308
+ {
309
+ if (!tempCount)
310
+ theByte = (tmbchar) firstByte;
311
+ else
312
+ theByte = buf[tempCount - 1];
313
+
314
+ if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
315
+ theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
316
+ hasError = no;
317
+ if (hasError)
318
+ break;
319
+ }
320
+ }
321
+ }
322
+ }
323
+
324
+ #if 1 && defined(_DEBUG)
325
+ if ( hasError )
326
+ {
327
+ /* debug */
328
+ fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
329
+ fprintf( stderr, "0x%02x ", firstByte );
330
+ for (i = 1; i < bytes; i++)
331
+ fprintf( stderr, "0x%02x ", buf[i - 1] );
332
+ fprintf( stderr, " = U+%04ulx\n", n );
333
+ }
334
+ #endif
335
+
336
+ *count = bytes;
337
+ *c = n;
338
+ if ( hasError )
339
+ return -1;
340
+ return 0;
341
+ }
342
+
343
+ int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
344
+ TidyOutputSink* outp, int* count )
345
+ {
346
+ byte tempbuf[10] = {0};
347
+ byte* buf = &tempbuf[0];
348
+ int bytes = 0;
349
+ Bool hasError = no;
350
+
351
+ if ( encodebuf )
352
+ buf = (byte*) encodebuf;
353
+
354
+ if (c <= 0x7F) /* 0XXX XXXX one byte */
355
+ {
356
+ buf[0] = (tmbchar) c;
357
+ bytes = 1;
358
+ }
359
+ else if (c <= 0x7FF) /* 110X XXXX two bytes */
360
+ {
361
+ buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
362
+ buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
363
+ bytes = 2;
364
+ }
365
+ else if (c <= 0xFFFF) /* 1110 XXXX three bytes */
366
+ {
367
+ buf[0] = (tmbchar) (0xE0 | (c >> 12));
368
+ buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
369
+ buf[2] = (tmbchar) (0x80 | (c & 0x3F));
370
+ bytes = 3;
371
+ if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
372
+ hasError = yes;
373
+ #if 0 /* Breaks Big5 D8 - DF */
374
+ else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
375
+ /* unpaired surrogates not allowed */
376
+ hasError = yes;
377
+ #endif
378
+ }
379
+ else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */
380
+ {
381
+ buf[0] = (tmbchar) (0xF0 | (c >> 18));
382
+ buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
383
+ buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
384
+ buf[3] = (tmbchar) (0x80 | (c & 0x3F));
385
+ bytes = 4;
386
+ if (c > kMaxUTF8FromUCS4)
387
+ hasError = yes;
388
+ }
389
+ else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */
390
+ {
391
+ buf[0] = (tmbchar) (0xF8 | (c >> 24));
392
+ buf[1] = (tmbchar) (0x80 | (c >> 18));
393
+ buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
394
+ buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
395
+ buf[4] = (tmbchar) (0x80 | (c & 0x3F));
396
+ bytes = 5;
397
+ hasError = yes;
398
+ }
399
+ else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */
400
+ {
401
+ buf[0] = (tmbchar) (0xFC | (c >> 30));
402
+ buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
403
+ buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
404
+ buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
405
+ buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
406
+ buf[5] = (tmbchar) (0x80 | (c & 0x3F));
407
+ bytes = 6;
408
+ hasError = yes;
409
+ }
410
+ else
411
+ hasError = yes;
412
+
413
+ /* don't output invalid UTF-8 byte sequence to a stream */
414
+ if ( !hasError && outp != NULL )
415
+ {
416
+ int ix;
417
+ for ( ix=0; ix < bytes; ++ix )
418
+ outp->putByte( outp->sinkData, buf[ix] );
419
+ }
420
+
421
+ #if 1 && defined(_DEBUG)
422
+ if ( hasError )
423
+ {
424
+ int i;
425
+ fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
426
+ for (i = 0; i < bytes; i++)
427
+ fprintf( stderr, "0x%02x ", buf[i] );
428
+ fprintf( stderr, "\n" );
429
+ }
430
+ #endif
431
+
432
+ *count = bytes;
433
+ if (hasError)
434
+ return -1;
435
+ return 0;
436
+ }
437
+
438
+
439
+ /* return one less than the number of bytes used by the UTF-8 byte sequence */
440
+ /* str points to the UTF-8 byte sequence */
441
+ /* the Unicode char is returned in *ch */
442
+ uint TY_(GetUTF8)( ctmbstr str, uint *ch )
443
+ {
444
+ uint n;
445
+ int bytes;
446
+
447
+ int err;
448
+
449
+ bytes = 0;
450
+
451
+ /* first byte "str[0]" is passed in separately from the */
452
+ /* rest of the UTF-8 byte sequence starting at "str[1]" */
453
+ err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes );
454
+ if (err)
455
+ {
456
+ #if 1 && defined(_DEBUG)
457
+ fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
458
+ #endif
459
+ n = 0xFFFD; /* replacement char */
460
+ }
461
+
462
+ *ch = n;
463
+ return bytes - 1;
464
+ }
465
+
466
+ /* store char c as UTF-8 encoded byte stream */
467
+ tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
468
+ {
469
+ int err, count = 0;
470
+
471
+ err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
472
+ if (err)
473
+ {
474
+ #if 1 && defined(_DEBUG)
475
+ fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
476
+ #endif
477
+ /* replacement char 0xFFFD encoded as UTF-8 */
478
+ buf[0] = (byte) 0xEF;
479
+ buf[1] = (byte) 0xBF;
480
+ buf[2] = (byte) 0xBD;
481
+ count = 3;
482
+ }
483
+
484
+ buf += count;
485
+ return buf;
486
+ }
487
+
488
+ Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
489
+ {
490
+ return ( ucs4 <= kMaxUTF16FromUCS4 );
491
+ }
492
+
493
+ Bool TY_(IsHighSurrogate)( tchar ch )
494
+ {
495
+ return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd );
496
+ }
497
+ Bool TY_(IsLowSurrogate)( tchar ch )
498
+ {
499
+ return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd );
500
+ }
501
+
502
+ tchar TY_(CombineSurrogatePair)( tchar high, tchar low )
503
+ {
504
+ assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );
505
+ return ( ((low - kUTF16LowSurrogateBegin) * 0x400) +
506
+ high - kUTF16HighSurrogateBegin + 0x10000 );
507
+ }
508
+
509
+ Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
510
+ {
511
+ Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );
512
+ if ( status )
513
+ {
514
+ *low = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
515
+ *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
516
+ }
517
+ return status;
518
+ }
519
+
520
+ Bool TY_(IsValidCombinedChar)( tchar ch )
521
+ {
522
+ return ( ch >= kUTF16SurrogatesBegin &&
523
+ (ch & 0x0000FFFE) != 0x0000FFFE &&
524
+ (ch & 0x0000FFFF) != 0x0000FFFF );
525
+ }
526
+
527
+ Bool TY_(IsCombinedChar)( tchar ch )
528
+ {
529
+ return ( ch >= kUTF16SurrogatesBegin );
530
+ }
531
+
532
+ /*
533
+ * local variables:
534
+ * mode: c
535
+ * indent-tabs-mode: nil
536
+ * c-basic-offset: 4
537
+ * eval: (c-set-offset 'substatement-open 0)
538
+ * end:
539
+ */
data/ext/tidy/utf8.h ADDED
@@ -0,0 +1,52 @@
1
+ #ifndef __UTF8_H__
2
+ #define __UTF8_H__
3
+
4
+ /* utf8.h -- convert characters to/from UTF-8
5
+
6
+ (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2006/09/12 15:14:44 $
13
+ $Revision: 1.5 $
14
+
15
+ */
16
+
17
+ #include "platform.h"
18
+ #include "buffio.h"
19
+
20
+ /* UTF-8 encoding/decoding support
21
+ ** Does not convert character "codepoints", i.e. to/from 10646.
22
+ */
23
+
24
+ int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
25
+ TidyInputSource* inp, int* count );
26
+
27
+ int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
28
+ TidyOutputSink* outp, int* count );
29
+
30
+
31
+ uint TY_(GetUTF8)( ctmbstr str, uint *ch );
32
+ tmbstr TY_(PutUTF8)( tmbstr buf, uint c );
33
+
34
+ #define UNICODE_BOM_BE 0xFEFF /* big-endian (default) UNICODE BOM */
35
+ #define UNICODE_BOM UNICODE_BOM_BE
36
+ #define UNICODE_BOM_LE 0xFFFE /* little-endian UNICODE BOM */
37
+ #define UNICODE_BOM_UTF8 0xEFBBBF /* UTF-8 UNICODE BOM */
38
+
39
+
40
+ Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 );
41
+ Bool TY_(IsHighSurrogate)( tchar ch );
42
+ Bool TY_(IsLowSurrogate)( tchar ch );
43
+
44
+ Bool TY_(IsCombinedChar)( tchar ch );
45
+ Bool TY_(IsValidCombinedChar)( tchar ch );
46
+
47
+ tchar TY_(CombineSurrogatePair)( tchar high, tchar low );
48
+ Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* high, tchar* low );
49
+
50
+
51
+
52
+ #endif /* __UTF8_H__ */
@@ -0,0 +1,14 @@
1
+ /* version information
2
+
3
+ (c) 2007-2009 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: arnaud02 $
9
+ $Date: 2009/03/25 21:37:11 $
10
+ $Revision: 1.46 $
11
+
12
+ */
13
+
14
+ static const char TY_(release_date)[] = "25 March 2009";