tidy-ext 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/LICENSE +50 -0
- data/README +12 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/ext/tidy/access.c +3310 -0
- data/ext/tidy/access.h +279 -0
- data/ext/tidy/alloc.c +107 -0
- data/ext/tidy/attrask.c +209 -0
- data/ext/tidy/attrdict.c +2398 -0
- data/ext/tidy/attrdict.h +122 -0
- data/ext/tidy/attrget.c +213 -0
- data/ext/tidy/attrs.c +1911 -0
- data/ext/tidy/attrs.h +374 -0
- data/ext/tidy/buffio.c +232 -0
- data/ext/tidy/buffio.h +118 -0
- data/ext/tidy/charsets.c +1032 -0
- data/ext/tidy/charsets.h +14 -0
- data/ext/tidy/clean.c +2674 -0
- data/ext/tidy/clean.h +87 -0
- data/ext/tidy/config.c +1746 -0
- data/ext/tidy/config.h +153 -0
- data/ext/tidy/entities.c +419 -0
- data/ext/tidy/entities.h +24 -0
- data/ext/tidy/extconf.rb +5 -0
- data/ext/tidy/fileio.c +106 -0
- data/ext/tidy/fileio.h +46 -0
- data/ext/tidy/forward.h +69 -0
- data/ext/tidy/iconvtc.c +105 -0
- data/ext/tidy/iconvtc.h +15 -0
- data/ext/tidy/istack.c +373 -0
- data/ext/tidy/lexer.c +3825 -0
- data/ext/tidy/lexer.h +617 -0
- data/ext/tidy/localize.c +1882 -0
- data/ext/tidy/mappedio.c +329 -0
- data/ext/tidy/mappedio.h +16 -0
- data/ext/tidy/message.h +207 -0
- data/ext/tidy/parser.c +4408 -0
- data/ext/tidy/parser.h +76 -0
- data/ext/tidy/platform.h +636 -0
- data/ext/tidy/pprint.c +2276 -0
- data/ext/tidy/pprint.h +93 -0
- data/ext/tidy/ruby-tidy.c +195 -0
- data/ext/tidy/streamio.c +1407 -0
- data/ext/tidy/streamio.h +222 -0
- data/ext/tidy/tagask.c +286 -0
- data/ext/tidy/tags.c +955 -0
- data/ext/tidy/tags.h +235 -0
- data/ext/tidy/tidy-int.h +129 -0
- data/ext/tidy/tidy.h +1097 -0
- data/ext/tidy/tidyenum.h +622 -0
- data/ext/tidy/tidylib.c +1751 -0
- data/ext/tidy/tmbstr.c +306 -0
- data/ext/tidy/tmbstr.h +92 -0
- data/ext/tidy/utf8.c +539 -0
- data/ext/tidy/utf8.h +52 -0
- data/ext/tidy/version.h +14 -0
- data/ext/tidy/win32tc.c +795 -0
- data/ext/tidy/win32tc.h +19 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/tidy/compat_spec.rb +44 -0
- data/spec/tidy/remote_uri_spec.rb +14 -0
- data/spec/tidy/test1.html +5 -0
- data/spec/tidy/tidy_spec.rb +34 -0
- metadata +125 -0
data/ext/tidy/utf8.c
ADDED
@@ -0,0 +1,539 @@
|
|
1
|
+
/* utf8.c -- convert characters to/from UTF-8
|
2
|
+
|
3
|
+
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
|
4
|
+
See tidy.h for the copyright notice.
|
5
|
+
|
6
|
+
CVS Info :
|
7
|
+
|
8
|
+
$Author: arnaud02 $
|
9
|
+
$Date: 2007/05/30 16:47:31 $
|
10
|
+
$Revision: 1.10 $
|
11
|
+
|
12
|
+
Uses public interfaces to abstract input source and output
|
13
|
+
sink, which may be user supplied or either FILE* or memory
|
14
|
+
based Tidy implementations. Encoding support is uniform
|
15
|
+
regardless of I/O mechanism.
|
16
|
+
|
17
|
+
Note, UTF-8 encoding, by itself, does not affect the actual
|
18
|
+
"codepoints" of the underlying character encoding. In the
|
19
|
+
cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
|
20
|
+
refer to ISO-10646 "codepoints". For anything else, they
|
21
|
+
refer to some other "codepoint" set.
|
22
|
+
|
23
|
+
Put another way, UTF-8 is a variable length method to
|
24
|
+
represent any non-negative integer value. The glyph
|
25
|
+
that an integer value represents is unchanged and defined
|
26
|
+
externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
|
27
|
+
Latin2-9, and so on).
|
28
|
+
|
29
|
+
Put still another way, UTF-8 is more of a _transfer_ encoding
|
30
|
+
than a _character_ encoding, per se.
|
31
|
+
*/
|
32
|
+
|
33
|
+
#include "tidy.h"
|
34
|
+
#include "forward.h"
|
35
|
+
#include "utf8.h"
|
36
|
+
|
37
|
+
/*
|
38
|
+
UTF-8 encoding/decoding functions
|
39
|
+
Return 0 on success, -1 if illegal sequence; the # of bytes in the UTF-8 sequence is stored through the count argument
|
40
|
+
|
41
|
+
Also see below for UTF-16 encoding/decoding functions
|
42
|
+
|
43
|
+
References :
|
44
|
+
|
45
|
+
1) UCS Transformation Format 8 (UTF-8):
|
46
|
+
ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
|
47
|
+
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
|
48
|
+
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
|
49
|
+
|
50
|
+
Table 4 - Mapping from UCS-4 to UTF-8
|
51
|
+
|
52
|
+
2) Unicode standards:
|
53
|
+
<http://www.unicode.org/unicode/standard/standard.html>
|
54
|
+
|
55
|
+
3) Legal UTF-8 byte sequences:
|
56
|
+
<http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
|
57
|
+
|
58
|
+
Code point 1st byte 2nd byte 3rd byte 4th byte
|
59
|
+
---------- -------- -------- -------- --------
|
60
|
+
U+0000..U+007F 00..7F
|
61
|
+
U+0080..U+07FF C2..DF 80..BF
|
62
|
+
U+0800..U+0FFF E0 A0..BF 80..BF
|
63
|
+
U+1000..U+FFFF E1..EF 80..BF 80..BF
|
64
|
+
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
65
|
+
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
66
|
+
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
|
67
|
+
|
68
|
+
The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
|
69
|
+
allows for the use of five- and six-byte sequences to encode
|
70
|
+
characters that are outside the range of the Unicode character
|
71
|
+
set; those five- and six-byte sequences are illegal for the use
|
72
|
+
of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
|
73
|
+
does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
|
74
|
+
(but it does allow other noncharacters).
|
75
|
+
|
76
|
+
4) RFC 2279: UTF-8, a transformation format of ISO 10646:
|
77
|
+
<http://www.ietf.org/rfc/rfc2279.txt>
|
78
|
+
|
79
|
+
5) UTF-8 and Unicode FAQ:
|
80
|
+
<http://www.cl.cam.ac.uk/~mgk25/unicode.html>
|
81
|
+
|
82
|
+
6) Markus Kuhn's UTF-8 decoder stress test file:
|
83
|
+
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
|
84
|
+
|
85
|
+
7) UTF-8 Demo:
|
86
|
+
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
|
87
|
+
|
88
|
+
8) UTF-8 Sampler:
|
89
|
+
<http://www.columbia.edu/kermit/utf8.html>
|
90
|
+
|
91
|
+
9) Transformation Format for 16 Planes of Group 00 (UTF-16):
|
92
|
+
ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
|
93
|
+
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
|
94
|
+
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
|
95
|
+
|
96
|
+
10) RFC 2781: UTF-16, an encoding of ISO 10646:
|
97
|
+
<http://www.ietf.org/rfc/rfc2781.txt>
|
98
|
+
|
99
|
+
11) UTF-16 invalid surrogate pairs:
|
100
|
+
<http://www.unicode.org/unicode/faq/utf_bom.html#16>
|
101
|
+
|
102
|
+
UTF-16 UTF-8 UCS-4
|
103
|
+
D83F DFF* F0 9F BF B* 0001FFF*
|
104
|
+
D87F DFF* F0 AF BF B* 0002FFF*
|
105
|
+
D8BF DFF* F0 BF BF B* 0003FFF*
|
106
|
+
D8FF DFF* F1 8F BF B* 0004FFF*
|
107
|
+
D93F DFF* F1 9F BF B* 0005FFF*
|
108
|
+
D97F DFF* F1 AF BF B* 0006FFF*
|
109
|
+
...
|
110
|
+
DBBF DFF* F3 BF BF B* 000FFFF*
|
111
|
+
DBFF DFF* F4 8F BF B* 0010FFF*
|
112
|
+
|
113
|
+
* = E or F
|
114
|
+
|
115
|
+
1010 A
|
116
|
+
1011 B
|
117
|
+
1100 C
|
118
|
+
1101 D
|
119
|
+
1110 E
|
120
|
+
1111 F
|
121
|
+
|
122
|
+
*/
|
123
|
+
|
124
|
+
/* Table sizes, UTF-8 limits and UTF-16 surrogate ranges used in this file.
**
** Naming note: in this file "Low" surrogate means the numerically lower
** range 0xD800..0xDBFF and "High" surrogate the numerically higher range
** 0xDC00..0xDFFF.  The Unicode standard uses the opposite words for the
** same ranges (0xD800..0xDBFF are its "high"/leading surrogates).
*/
enum
{
    kNumUTF8Sequences        = 7,         /* number of rows in validUTF8 */
    kMaxUTF8Bytes            = 4,         /* longest legal UTF-8 sequence */

    kUTF8ByteSwapNotAChar    = 0xFFFE,
    kUTF8NotAChar            = 0xFFFF,

    kMaxUTF8FromUCS4         = 0x10FFFF,

    kUTF16SurrogatesBegin    = 0x10000,   /* first value needing a pair */
    kMaxUTF16FromUCS4        = 0x10FFFF,

    /* UTF-16 surrogate pair areas */
    kUTF16LowSurrogateBegin  = 0xD800,
    kUTF16LowSurrogateEnd    = 0xDBFF,
    kUTF16HighSurrogateBegin = 0xDC00,
    kUTF16HighSurrogateEnd   = 0xDFFF
};
|
140
|
+
|
141
|
+
|
142
|
+
/* offsets into validUTF8 table below */
|
143
|
+
static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
|
144
|
+
{
|
145
|
+
0, /* 1 byte */
|
146
|
+
1, /* 2 bytes */
|
147
|
+
2, /* 3 bytes */
|
148
|
+
4, /* 4 bytes */
|
149
|
+
kNumUTF8Sequences /* must be last */
|
150
|
+
};
|
151
|
+
|
152
|
+
static const struct validUTF8Sequence
|
153
|
+
{
|
154
|
+
uint lowChar;
|
155
|
+
uint highChar;
|
156
|
+
int numBytes;
|
157
|
+
byte validBytes[8];
|
158
|
+
} validUTF8[kNumUTF8Sequences] =
|
159
|
+
{
|
160
|
+
/* low high #bytes byte 1 byte 2 byte 3 byte 4 */
|
161
|
+
{0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
|
162
|
+
{0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
|
163
|
+
{0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
|
164
|
+
{0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
|
165
|
+
{0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
|
166
|
+
{0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
|
167
|
+
{0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
|
168
|
+
};
|
169
|
+
|
170
|
+
int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
|
171
|
+
TidyInputSource* inp, int* count )
|
172
|
+
{
|
173
|
+
byte tempbuf[10];
|
174
|
+
byte *buf = &tempbuf[0];
|
175
|
+
uint ch = 0, n = 0;
|
176
|
+
int i, bytes = 0;
|
177
|
+
Bool hasError = no;
|
178
|
+
|
179
|
+
if ( successorBytes )
|
180
|
+
buf = (byte*) successorBytes;
|
181
|
+
|
182
|
+
/* special check if we have been passed an EOF char */
|
183
|
+
if ( firstByte == EndOfStream )
|
184
|
+
{
|
185
|
+
/* at present */
|
186
|
+
*c = firstByte;
|
187
|
+
*count = 1;
|
188
|
+
return 0;
|
189
|
+
}
|
190
|
+
|
191
|
+
ch = firstByte; /* first byte is passed in separately */
|
192
|
+
|
193
|
+
if (ch <= 0x7F) /* 0XXX XXXX one byte */
|
194
|
+
{
|
195
|
+
n = ch;
|
196
|
+
bytes = 1;
|
197
|
+
}
|
198
|
+
else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
|
199
|
+
{
|
200
|
+
n = ch & 31;
|
201
|
+
bytes = 2;
|
202
|
+
}
|
203
|
+
else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
|
204
|
+
{
|
205
|
+
n = ch & 15;
|
206
|
+
bytes = 3;
|
207
|
+
}
|
208
|
+
else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
|
209
|
+
{
|
210
|
+
n = ch & 7;
|
211
|
+
bytes = 4;
|
212
|
+
}
|
213
|
+
else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
|
214
|
+
{
|
215
|
+
n = ch & 3;
|
216
|
+
bytes = 5;
|
217
|
+
hasError = yes;
|
218
|
+
}
|
219
|
+
else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
|
220
|
+
{
|
221
|
+
n = ch & 1;
|
222
|
+
bytes = 6;
|
223
|
+
hasError = yes;
|
224
|
+
}
|
225
|
+
else
|
226
|
+
{
|
227
|
+
/* not a valid first byte of a UTF-8 sequence */
|
228
|
+
n = ch;
|
229
|
+
bytes = 1;
|
230
|
+
hasError = yes;
|
231
|
+
}
|
232
|
+
|
233
|
+
/* successor bytes should have the form 10XX XXXX */
|
234
|
+
|
235
|
+
/* If caller supplied buffer, use it. Else see if caller
|
236
|
+
** supplied an input source, use that.
|
237
|
+
*/
|
238
|
+
if ( successorBytes )
|
239
|
+
{
|
240
|
+
for ( i=0; i < bytes-1; ++i )
|
241
|
+
{
|
242
|
+
if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
|
243
|
+
{
|
244
|
+
hasError = yes;
|
245
|
+
bytes = i+1;
|
246
|
+
break;
|
247
|
+
}
|
248
|
+
n = (n << 6) | (buf[i] & 0x3F);
|
249
|
+
}
|
250
|
+
}
|
251
|
+
else if ( inp )
|
252
|
+
{
|
253
|
+
for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
|
254
|
+
{
|
255
|
+
int b = inp->getByte( inp->sourceData );
|
256
|
+
buf[i] = (tmbchar) b;
|
257
|
+
|
258
|
+
/* End of data or illegal successor byte value */
|
259
|
+
if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
|
260
|
+
{
|
261
|
+
hasError = yes;
|
262
|
+
bytes = i+1;
|
263
|
+
if ( b != EOF )
|
264
|
+
inp->ungetByte( inp->sourceData, buf[i] );
|
265
|
+
break;
|
266
|
+
}
|
267
|
+
n = (n << 6) | (buf[i] & 0x3F);
|
268
|
+
}
|
269
|
+
}
|
270
|
+
else if ( bytes > 1 )
|
271
|
+
{
|
272
|
+
hasError = yes;
|
273
|
+
bytes = 1;
|
274
|
+
}
|
275
|
+
|
276
|
+
if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
|
277
|
+
hasError = yes;
|
278
|
+
|
279
|
+
if (!hasError && (n > kMaxUTF8FromUCS4))
|
280
|
+
hasError = yes;
|
281
|
+
|
282
|
+
#if 0 /* Breaks Big5 D8 - DF */
|
283
|
+
if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
|
284
|
+
/* unpaired surrogates not allowed */
|
285
|
+
hasError = yes;
|
286
|
+
#endif
|
287
|
+
|
288
|
+
if (!hasError)
|
289
|
+
{
|
290
|
+
int lo, hi;
|
291
|
+
|
292
|
+
lo = offsetUTF8Sequences[bytes - 1];
|
293
|
+
hi = offsetUTF8Sequences[bytes] - 1;
|
294
|
+
|
295
|
+
/* check for overlong sequences */
|
296
|
+
if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
|
297
|
+
hasError = yes;
|
298
|
+
else
|
299
|
+
{
|
300
|
+
hasError = yes; /* assume error until proven otherwise */
|
301
|
+
|
302
|
+
for (i = lo; i <= hi; i++)
|
303
|
+
{
|
304
|
+
int tempCount;
|
305
|
+
byte theByte;
|
306
|
+
|
307
|
+
for (tempCount = 0; tempCount < bytes; tempCount++)
|
308
|
+
{
|
309
|
+
if (!tempCount)
|
310
|
+
theByte = (tmbchar) firstByte;
|
311
|
+
else
|
312
|
+
theByte = buf[tempCount - 1];
|
313
|
+
|
314
|
+
if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
|
315
|
+
theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
|
316
|
+
hasError = no;
|
317
|
+
if (hasError)
|
318
|
+
break;
|
319
|
+
}
|
320
|
+
}
|
321
|
+
}
|
322
|
+
}
|
323
|
+
|
324
|
+
#if 1 && defined(_DEBUG)
|
325
|
+
if ( hasError )
|
326
|
+
{
|
327
|
+
/* debug */
|
328
|
+
fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
|
329
|
+
fprintf( stderr, "0x%02x ", firstByte );
|
330
|
+
for (i = 1; i < bytes; i++)
|
331
|
+
fprintf( stderr, "0x%02x ", buf[i - 1] );
|
332
|
+
fprintf( stderr, " = U+%04ulx\n", n );
|
333
|
+
}
|
334
|
+
#endif
|
335
|
+
|
336
|
+
*count = bytes;
|
337
|
+
*c = n;
|
338
|
+
if ( hasError )
|
339
|
+
return -1;
|
340
|
+
return 0;
|
341
|
+
}
|
342
|
+
|
343
|
+
int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
|
344
|
+
TidyOutputSink* outp, int* count )
|
345
|
+
{
|
346
|
+
byte tempbuf[10] = {0};
|
347
|
+
byte* buf = &tempbuf[0];
|
348
|
+
int bytes = 0;
|
349
|
+
Bool hasError = no;
|
350
|
+
|
351
|
+
if ( encodebuf )
|
352
|
+
buf = (byte*) encodebuf;
|
353
|
+
|
354
|
+
if (c <= 0x7F) /* 0XXX XXXX one byte */
|
355
|
+
{
|
356
|
+
buf[0] = (tmbchar) c;
|
357
|
+
bytes = 1;
|
358
|
+
}
|
359
|
+
else if (c <= 0x7FF) /* 110X XXXX two bytes */
|
360
|
+
{
|
361
|
+
buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
|
362
|
+
buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
|
363
|
+
bytes = 2;
|
364
|
+
}
|
365
|
+
else if (c <= 0xFFFF) /* 1110 XXXX three bytes */
|
366
|
+
{
|
367
|
+
buf[0] = (tmbchar) (0xE0 | (c >> 12));
|
368
|
+
buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
|
369
|
+
buf[2] = (tmbchar) (0x80 | (c & 0x3F));
|
370
|
+
bytes = 3;
|
371
|
+
if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
|
372
|
+
hasError = yes;
|
373
|
+
#if 0 /* Breaks Big5 D8 - DF */
|
374
|
+
else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
|
375
|
+
/* unpaired surrogates not allowed */
|
376
|
+
hasError = yes;
|
377
|
+
#endif
|
378
|
+
}
|
379
|
+
else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */
|
380
|
+
{
|
381
|
+
buf[0] = (tmbchar) (0xF0 | (c >> 18));
|
382
|
+
buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
|
383
|
+
buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
|
384
|
+
buf[3] = (tmbchar) (0x80 | (c & 0x3F));
|
385
|
+
bytes = 4;
|
386
|
+
if (c > kMaxUTF8FromUCS4)
|
387
|
+
hasError = yes;
|
388
|
+
}
|
389
|
+
else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */
|
390
|
+
{
|
391
|
+
buf[0] = (tmbchar) (0xF8 | (c >> 24));
|
392
|
+
buf[1] = (tmbchar) (0x80 | (c >> 18));
|
393
|
+
buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
|
394
|
+
buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
|
395
|
+
buf[4] = (tmbchar) (0x80 | (c & 0x3F));
|
396
|
+
bytes = 5;
|
397
|
+
hasError = yes;
|
398
|
+
}
|
399
|
+
else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */
|
400
|
+
{
|
401
|
+
buf[0] = (tmbchar) (0xFC | (c >> 30));
|
402
|
+
buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
|
403
|
+
buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
|
404
|
+
buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
|
405
|
+
buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
|
406
|
+
buf[5] = (tmbchar) (0x80 | (c & 0x3F));
|
407
|
+
bytes = 6;
|
408
|
+
hasError = yes;
|
409
|
+
}
|
410
|
+
else
|
411
|
+
hasError = yes;
|
412
|
+
|
413
|
+
/* don't output invalid UTF-8 byte sequence to a stream */
|
414
|
+
if ( !hasError && outp != NULL )
|
415
|
+
{
|
416
|
+
int ix;
|
417
|
+
for ( ix=0; ix < bytes; ++ix )
|
418
|
+
outp->putByte( outp->sinkData, buf[ix] );
|
419
|
+
}
|
420
|
+
|
421
|
+
#if 1 && defined(_DEBUG)
|
422
|
+
if ( hasError )
|
423
|
+
{
|
424
|
+
int i;
|
425
|
+
fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
|
426
|
+
for (i = 0; i < bytes; i++)
|
427
|
+
fprintf( stderr, "0x%02x ", buf[i] );
|
428
|
+
fprintf( stderr, "\n" );
|
429
|
+
}
|
430
|
+
#endif
|
431
|
+
|
432
|
+
*count = bytes;
|
433
|
+
if (hasError)
|
434
|
+
return -1;
|
435
|
+
return 0;
|
436
|
+
}
|
437
|
+
|
438
|
+
|
439
|
+
/* return one less than the number of bytes used by the UTF-8 byte sequence */
|
440
|
+
/* str points to the UTF-8 byte sequence */
|
441
|
+
/* the Unicode char is returned in *ch */
|
442
|
+
uint TY_(GetUTF8)( ctmbstr str, uint *ch )
|
443
|
+
{
|
444
|
+
uint n;
|
445
|
+
int bytes;
|
446
|
+
|
447
|
+
int err;
|
448
|
+
|
449
|
+
bytes = 0;
|
450
|
+
|
451
|
+
/* first byte "str[0]" is passed in separately from the */
|
452
|
+
/* rest of the UTF-8 byte sequence starting at "str[1]" */
|
453
|
+
err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes );
|
454
|
+
if (err)
|
455
|
+
{
|
456
|
+
#if 1 && defined(_DEBUG)
|
457
|
+
fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
|
458
|
+
#endif
|
459
|
+
n = 0xFFFD; /* replacement char */
|
460
|
+
}
|
461
|
+
|
462
|
+
*ch = n;
|
463
|
+
return bytes - 1;
|
464
|
+
}
|
465
|
+
|
466
|
+
/* store char c as UTF-8 encoded byte stream */
|
467
|
+
tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
|
468
|
+
{
|
469
|
+
int err, count = 0;
|
470
|
+
|
471
|
+
err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
|
472
|
+
if (err)
|
473
|
+
{
|
474
|
+
#if 1 && defined(_DEBUG)
|
475
|
+
fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
|
476
|
+
#endif
|
477
|
+
/* replacement char 0xFFFD encoded as UTF-8 */
|
478
|
+
buf[0] = (byte) 0xEF;
|
479
|
+
buf[1] = (byte) 0xBF;
|
480
|
+
buf[2] = (byte) 0xBD;
|
481
|
+
count = 3;
|
482
|
+
}
|
483
|
+
|
484
|
+
buf += count;
|
485
|
+
return buf;
|
486
|
+
}
|
487
|
+
|
488
|
+
/* True when ucs4 does not exceed kMaxUTF16FromUCS4, the largest value
** UTF-16 can represent.
*/
Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
{
    return !( ucs4 > kMaxUTF16FromUCS4 );
}
|
492
|
+
|
493
|
+
/* True when ch lies in kUTF16HighSurrogateBegin..kUTF16HighSurrogateEnd
** (0xDC00..0xDFFF).  "High" here means the numerically higher range; the
** Unicode standard calls these code units low (trailing) surrogates.
*/
Bool TY_(IsHighSurrogate)( tchar ch )
{
    if ( ch < kUTF16HighSurrogateBegin )
        return no;
    return ( ch <= kUTF16HighSurrogateEnd );
}
|
497
|
+
/* True when ch lies in kUTF16LowSurrogateBegin..kUTF16LowSurrogateEnd
** (0xD800..0xDBFF).  "Low" here means the numerically lower range; the
** Unicode standard calls these code units high (leading) surrogates.
*/
Bool TY_(IsLowSurrogate)( tchar ch )
{
    if ( ch < kUTF16LowSurrogateBegin )
        return no;
    return ( ch <= kUTF16LowSurrogateEnd );
}
|
501
|
+
|
502
|
+
/* Join a surrogate pair into the single value it stands for.
**
**   high  code unit from 0xDC00..0xDFFF; supplies the low ten bits
**   low   code unit from 0xD800..0xDBFF; supplies the upper ten bits
**
** The pair is only checked by an assert.
*/
tchar TY_(CombineSurrogatePair)( tchar high, tchar low )
{
    tchar upperBits, lowerBits;

    assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );

    upperBits = (low - kUTF16LowSurrogateBegin) * 0x400;
    lowerBits = high - kUTF16HighSurrogateBegin;

    return ( upperBits + lowerBits + 0x10000 );
}
|
508
|
+
|
509
|
+
/* Split a value of 0x10000 or above into its UTF-16 surrogate pair.
**
**   utf16  value to split; must satisfy IsValidCombinedChar
**   low    receives the code unit based at 0xD800 (upper ten bits)
**   high   receives the code unit based at 0xDC00 (lower ten bits)
**
** Returns a true value when the pair was stored; returns false, leaving
** both outputs untouched, if utf16 is not a valid combined char or
** either pointer is NULL.
*/
Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
{
    Bool ok = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );
    tchar offset;

    if ( !ok )
        return ok;

    offset = utf16 - kUTF16SurrogatesBegin;
    *low  = offset / 0x400 + kUTF16LowSurrogateBegin;
    *high = offset % 0x400 + kUTF16HighSurrogateBegin;

    return ok;
}
|
519
|
+
|
520
|
+
/* True when ch is 0x10000 or above and its low sixteen bits are neither
** 0xFFFE nor 0xFFFF.  No upper limit is checked here.
*/
Bool TY_(IsValidCombinedChar)( tchar ch )
{
    tchar low16 = ch & 0x0000FFFF;

    if ( ch < kUTF16SurrogatesBegin )
        return no;

    return ( low16 != 0x0000FFFE && low16 != 0x0000FFFF );
}
|
526
|
+
|
527
|
+
/* True when ch is at or above kUTF16SurrogatesBegin (0x10000), i.e. it
** needs a surrogate pair in UTF-16.
*/
Bool TY_(IsCombinedChar)( tchar ch )
{
    return !( ch < kUTF16SurrogatesBegin );
}
|
531
|
+
|
532
|
+
/*
|
533
|
+
* local variables:
|
534
|
+
* mode: c
|
535
|
+
* indent-tabs-mode: nil
|
536
|
+
* c-basic-offset: 4
|
537
|
+
* eval: (c-set-offset 'substatement-open 0)
|
538
|
+
* end:
|
539
|
+
*/
|
data/ext/tidy/utf8.h
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
#ifndef __UTF8_H__
|
2
|
+
#define __UTF8_H__
|
3
|
+
|
4
|
+
/* utf8.h -- convert characters to/from UTF-8
|
5
|
+
|
6
|
+
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
|
7
|
+
See tidy.h for the copyright notice.
|
8
|
+
|
9
|
+
CVS Info :
|
10
|
+
|
11
|
+
$Author: arnaud02 $
|
12
|
+
$Date: 2006/09/12 15:14:44 $
|
13
|
+
$Revision: 1.5 $
|
14
|
+
|
15
|
+
*/
|
16
|
+
|
17
|
+
#include "platform.h"
|
18
|
+
#include "buffio.h"
|
19
|
+
|
20
|
+
/* UTF-8 encoding/decoding support
|
21
|
+
** Does not convert character "codepoints", i.e. to/from 10646.
|
22
|
+
*/
|
23
|
+
|
24
|
+
int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
|
25
|
+
TidyInputSource* inp, int* count );
|
26
|
+
|
27
|
+
int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
|
28
|
+
TidyOutputSink* outp, int* count );
|
29
|
+
|
30
|
+
|
31
|
+
uint TY_(GetUTF8)( ctmbstr str, uint *ch );
|
32
|
+
tmbstr TY_(PutUTF8)( tmbstr buf, uint c );
|
33
|
+
|
34
|
+
/* Byte order marks */

/* big-endian (default) UNICODE BOM */
#define UNICODE_BOM_BE     0xFEFF
/* little-endian UNICODE BOM */
#define UNICODE_BOM_LE     0xFFFE
/* UTF-8 UNICODE BOM */
#define UNICODE_BOM_UTF8   0xEFBBBF

/* BOM assumed when nothing else is specified: the big-endian one */
#define UNICODE_BOM        UNICODE_BOM_BE
|
38
|
+
|
39
|
+
|
40
|
+
/* UTF-16 helpers.
**
** Naming note: throughout these functions "low" surrogate means a code
** unit in 0xD800..0xDBFF and "high" surrogate one in 0xDC00..0xDFFF,
** i.e. numerically lower / higher.  The Unicode standard uses the
** opposite words for the same ranges.
*/

/* True when ucs4 is small enough to be represented in UTF-16. */
Bool    TY_(IsValidUTF16FromUCS4)( tchar ucs4 );
/* True when ch is in 0xDC00..0xDFFF. */
Bool    TY_(IsHighSurrogate)( tchar ch );
/* True when ch is in 0xD800..0xDBFF. */
Bool    TY_(IsLowSurrogate)( tchar ch );

/* True when ch is 0x10000 or above. */
Bool    TY_(IsCombinedChar)( tchar ch );
/* As IsCombinedChar, and the low 16 bits are not 0xFFFE or 0xFFFF. */
Bool    TY_(IsValidCombinedChar)( tchar ch );

/* Join a pair: high must be in 0xDC00..0xDFFF, low in 0xD800..0xDBFF. */
tchar   TY_(CombineSurrogatePair)( tchar high, tchar low );
/* Split utf16 into a pair.  The FIRST pointer receives the 0xD800-based
** unit and the SECOND the 0xDC00-based unit; the parameter names now
** match the definition in utf8.c (argument order is unchanged).
** Returns false without storing anything if utf16 is not a valid
** combined char or a pointer is NULL.
*/
Bool    TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high );
|
49
|
+
|
50
|
+
|
51
|
+
|
52
|
+
#endif /* __UTF8_H__ */
|
data/ext/tidy/version.h
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
/* version information
|
2
|
+
|
3
|
+
(c) 2007-2009 (W3C) MIT, ERCIM, Keio University
|
4
|
+
See tidy.h for the copyright notice.
|
5
|
+
|
6
|
+
CVS Info :
|
7
|
+
|
8
|
+
$Author: arnaud02 $
|
9
|
+
$Date: 2009/03/25 21:37:11 $
|
10
|
+
$Revision: 1.46 $
|
11
|
+
|
12
|
+
*/
|
13
|
+
|
14
|
+
/* Human-readable release date of this version of the library.
** NOTE(review): TY_() presumably applies the library's internal name
** prefix - confirm against the header that defines it.
*/
static const char TY_(release_date)[] = "25 March 2009";
|