tidy-ext 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
@@ -0,0 +1,1407 @@
1
+ /* streamio.c -- handles character stream I/O
2
+
3
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: arnaud02 $
9
+ $Date: 2008/03/22 21:00:18 $
10
+ $Revision: 1.43 $
11
+
12
+ Wrapper around Tidy input source and output sink
13
+ that calls appropriate interfaces, and applies
14
+ necessary char encoding transformations: to/from
15
+ ISO-10646 and/or UTF-8.
16
+
17
+ */
18
+
19
+ #include <stdio.h>
20
+ #include <errno.h>
21
+
22
+ #include "streamio.h"
23
+ #include "tidy-int.h"
24
+ #include "lexer.h"
25
+ #include "message.h"
26
+ #include "utf8.h"
27
+ #include "tmbstr.h"
28
+
29
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
30
+ #include "win32tc.h"
31
+ #endif
32
+
33
+ /************************
34
+ ** Forward Declarations
35
+ ************************/
36
+
37
+ static uint ReadCharFromStream( StreamIn* in );
38
+
39
+ static uint ReadByte( StreamIn* in );
40
+ static void UngetByte( StreamIn* in, uint byteValue );
41
+
42
+ static void PutByte( uint byteValue, StreamOut* out );
43
+
44
+ static void EncodeWin1252( uint c, StreamOut* out );
45
+ static void EncodeMacRoman( uint c, StreamOut* out );
46
+ static void EncodeIbm858( uint c, StreamOut* out );
47
+ static void EncodeLatin0( uint c, StreamOut* out );
48
+
49
+ static uint DecodeIbm850(uint c);
50
+ static uint DecodeLatin0(uint c);
51
+
52
+ static uint PopChar( StreamIn *in );
53
+
54
+ /******************************
55
+ ** Static (duration) Globals
56
+ ******************************/
57
+
58
+ static StreamOut stderrStreamOut =
59
+ {
60
+ ASCII,
61
+ FSM_ASCII,
62
+ DEFAULT_NL_CONFIG,
63
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
64
+ NULL,
65
+ #endif
66
+ FileIO,
67
+ { 0, TY_(filesink_putByte) }
68
+ };
69
+
70
+ static StreamOut stdoutStreamOut =
71
+ {
72
+ ASCII,
73
+ FSM_ASCII,
74
+ DEFAULT_NL_CONFIG,
75
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
76
+ NULL,
77
+ #endif
78
+ FileIO,
79
+ { 0, TY_(filesink_putByte) }
80
+ };
81
+
82
+ StreamOut* TY_(StdErrOutput)(void)
83
+ {
84
+ if ( stderrStreamOut.sink.sinkData == 0 )
85
+ stderrStreamOut.sink.sinkData = stderr;
86
+ return &stderrStreamOut;
87
+ }
88
+
89
+ #if 0
90
+ StreamOut* TY_(StdOutOutput)(void)
91
+ {
92
+ if ( stdoutStreamOut.sink.sinkData == 0 )
93
+ stdoutStreamOut.sink.sinkData = stdout;
94
+ return &stdoutStreamOut;
95
+ }
96
+ #endif
97
+
98
+ void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out )
99
+ {
100
+ if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
101
+ {
102
+ if ( out->iotype == FileIO )
103
+ fclose( (FILE*) out->sink.sinkData );
104
+ TidyDocFree( doc, out );
105
+ }
106
+ }
107
+
108
+ /************************
109
+ ** Source
110
+ ************************/
111
+
112
+ static void InitLastPos( StreamIn *in );
113
+
114
+ StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding )
115
+ {
116
+ StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) );
117
+
118
+ TidyClearMemory( in, sizeof(StreamIn) );
119
+ in->curline = 1;
120
+ in->curcol = 1;
121
+ in->encoding = encoding;
122
+ in->state = FSM_ASCII;
123
+ in->doc = doc;
124
+ in->bufsize = CHARBUF_SIZE;
125
+ in->allocator = doc->allocator;
126
+ in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize);
127
+ InitLastPos( in );
128
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
129
+ in->otextbuf = NULL;
130
+ in->otextlen = 0;
131
+ in->otextsize = 0;
132
+ #endif
133
+ return in;
134
+ }
135
+
136
+ void TY_(freeStreamIn)(StreamIn* in)
137
+ {
138
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
139
+ if (in->otextbuf)
140
+ TidyFree(in->allocator, in->otextbuf);
141
+ #endif
142
+ TidyFree(in->allocator, in->charbuf);
143
+ TidyFree(in->allocator, in);
144
+ }
145
+
146
+ StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding )
147
+ {
148
+ StreamIn *in = TY_(initStreamIn)( doc, encoding );
149
+ if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 )
150
+ {
151
+ TY_(freeStreamIn)( in );
152
+ return NULL;
153
+ }
154
+ in->iotype = FileIO;
155
+ return in;
156
+ }
157
+
158
+ StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
159
+ {
160
+ StreamIn *in = TY_(initStreamIn)( doc, encoding );
161
+ tidyInitInputBuffer( &in->source, buf );
162
+ in->iotype = BufferIO;
163
+ return in;
164
+ }
165
+
166
+ StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding )
167
+ {
168
+ StreamIn *in = TY_(initStreamIn)( doc, encoding );
169
+ memcpy( &in->source, source, sizeof(TidyInputSource) );
170
+ in->iotype = UserIO;
171
+ return in;
172
+ }
173
+
174
+ int TY_(ReadBOMEncoding)(StreamIn *in)
175
+ {
176
+ uint c, c1;
177
+ #if SUPPORT_UTF16_ENCODINGS
178
+ uint bom;
179
+ #endif
180
+
181
+ c = ReadByte(in);
182
+ if (c == EndOfStream)
183
+ return -1;
184
+
185
+ c1 = ReadByte( in );
186
+ if (c1 == EndOfStream)
187
+ {
188
+ UngetByte(in, c);
189
+ return -1;
190
+ }
191
+
192
+ /* todo: dont warn about mismatch for auto input encoding */
193
+ /* todo: let the user override the encoding found here */
194
+
195
+ #if SUPPORT_UTF16_ENCODINGS
196
+ bom = (c << 8) + c1;
197
+
198
+ if ( bom == UNICODE_BOM_BE )
199
+ {
200
+ /* big-endian UTF-16 */
201
+ if ( in->encoding != UTF16 && in->encoding != UTF16BE )
202
+ TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE);
203
+
204
+ return UTF16BE; /* return decoded BOM */
205
+ }
206
+ else if (bom == UNICODE_BOM_LE)
207
+ {
208
+ /* little-endian UTF-16 */
209
+ if (in->encoding != UTF16 && in->encoding != UTF16LE)
210
+ TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE);
211
+
212
+ return UTF16LE; /* return decoded BOM */
213
+ }
214
+ else
215
+ #endif /* SUPPORT_UTF16_ENCODINGS */
216
+ {
217
+ uint c2 = ReadByte(in);
218
+
219
+ if (c2 == EndOfStream)
220
+ {
221
+ UngetByte(in, c1);
222
+ UngetByte(in, c);
223
+ return -1;
224
+ }
225
+
226
+ if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
227
+ {
228
+ /* UTF-8 */
229
+ if (in->encoding != UTF8)
230
+ TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8);
231
+
232
+ return UTF8;
233
+ }
234
+ else
235
+ UngetByte( in, c2 );
236
+ }
237
+
238
+ UngetByte(in, c1);
239
+ UngetByte(in, c);
240
+
241
+ return -1;
242
+ }
243
+
244
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
245
+ void TY_(AddByteToOriginalText)(StreamIn *in, tmbchar c)
246
+ {
247
+ if (in->otextlen + 1 >= in->otextsize)
248
+ {
249
+ size_t size = in->otextsize ? 1 : 2;
250
+ in->otextbuf = TidyRealloc(in->allocator, in->otextbuf, in->otextsize + size);
251
+ in->otextsize += size;
252
+ }
253
+ in->otextbuf[in->otextlen++] = c;
254
+ in->otextbuf[in->otextlen ] = 0;
255
+ }
256
+
257
+ void TY_(AddCharToOriginalText)(StreamIn *in, tchar c)
258
+ {
259
+ int i, err, count = 0;
260
+ tmbchar buf[10] = {0};
261
+
262
+ err = TY_(EncodeCharToUTF8Bytes)(c, buf, NULL, &count);
263
+
264
+ if (err)
265
+ {
266
+ /* replacement character 0xFFFD encoded as UTF-8 */
267
+ buf[0] = (byte) 0xEF;
268
+ buf[1] = (byte) 0xBF;
269
+ buf[2] = (byte) 0xBD;
270
+ count = 3;
271
+ }
272
+
273
+ for (i = 0; i < count; ++i)
274
+ TY_(AddByteToOriginalText)(in, buf[i]);
275
+ }
276
+ #endif
277
+
278
+ static void InitLastPos( StreamIn *in )
279
+ {
280
+ in->curlastpos = 0;
281
+ in->firstlastpos = 0;
282
+ }
283
+
284
+ static void PopLastPos( StreamIn *in )
285
+ {
286
+ in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE;
287
+ if ( in->curlastpos == in->firstlastpos )
288
+ in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE;
289
+ }
290
+
291
+ static void SaveLastPos( StreamIn *in )
292
+ {
293
+ PopLastPos( in );
294
+ in->lastcols[in->curlastpos] = in->curcol;
295
+ }
296
+
297
+ static void RestoreLastPos( StreamIn *in )
298
+ {
299
+ if ( in->firstlastpos == in->curlastpos )
300
+ in->curcol = 0;
301
+ else
302
+ {
303
+ in->curcol = in->lastcols[in->curlastpos];
304
+ if ( in->curlastpos == 0 )
305
+ in->curlastpos = LASTPOS_SIZE;
306
+ in->curlastpos--;
307
+ }
308
+ }
309
+
310
+ uint TY_(ReadChar)( StreamIn *in )
311
+ {
312
+ uint c = EndOfStream;
313
+ uint tabsize = cfg( in->doc, TidyTabSize );
314
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
315
+ Bool added = no;
316
+ #endif
317
+
318
+ if ( in->pushed )
319
+ return PopChar( in );
320
+
321
+ SaveLastPos( in );
322
+
323
+ if ( in->tabs > 0 )
324
+ {
325
+ in->curcol++;
326
+ in->tabs--;
327
+ return ' ';
328
+ }
329
+
330
+ for (;;)
331
+ {
332
+ c = ReadCharFromStream(in);
333
+
334
+ if ( EndOfStream == c )
335
+ return EndOfStream;
336
+
337
+ if (c == '\n')
338
+ {
339
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
340
+ added = yes;
341
+ TY_(AddCharToOriginalText)(in, (tchar)c);
342
+ #endif
343
+ in->curcol = 1;
344
+ in->curline++;
345
+ break;
346
+ }
347
+
348
+ if (c == '\t')
349
+ {
350
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
351
+ added = yes;
352
+ TY_(AddCharToOriginalText)(in, (tchar)c);
353
+ #endif
354
+ in->tabs = tabsize > 0 ?
355
+ tabsize - ((in->curcol - 1) % tabsize) - 1
356
+ : 0;
357
+ in->curcol++;
358
+ c = ' ';
359
+ break;
360
+ }
361
+
362
+ /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
363
+ if (c == '\r')
364
+ {
365
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
366
+ added = yes;
367
+ TY_(AddCharToOriginalText)(in, (tchar)c);
368
+ #endif
369
+ c = ReadCharFromStream(in);
370
+ if (c != '\n')
371
+ {
372
+ TY_(UngetChar)( c, in );
373
+ c = '\n';
374
+ }
375
+ else
376
+ {
377
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
378
+ TY_(AddCharToOriginalText)(in, (tchar)c);
379
+ #endif
380
+ }
381
+ in->curcol = 1;
382
+ in->curline++;
383
+ break;
384
+ }
385
+
386
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
387
+ /* strip control characters, except for Esc */
388
+ if (c == '\033')
389
+ break;
390
+ #endif
391
+
392
+ /* Form Feed is allowed in HTML */
393
+ if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
394
+ break;
395
+
396
+ if ( c < 32 )
397
+ continue; /* discard control char */
398
+
399
+ /* watch out for chars that have already been decoded such as */
400
+ /* IS02022, UTF-8 etc, that don't require further decoding */
401
+
402
+ if (
403
+ in->encoding == RAW
404
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
405
+ || in->encoding == ISO2022
406
+ #endif
407
+ || in->encoding == UTF8
408
+
409
+ #if SUPPORT_ASIAN_ENCODINGS
410
+ || in->encoding == SHIFTJIS /* #431953 - RJ */
411
+ || in->encoding == BIG5 /* #431953 - RJ */
412
+ #endif
413
+ )
414
+ {
415
+ in->curcol++;
416
+ break;
417
+ }
418
+
419
+ #if SUPPORT_UTF16_ENCODINGS
420
+ /* handle surrogate pairs */
421
+ if ( in->encoding == UTF16LE ||
422
+ in->encoding == UTF16 ||
423
+ in->encoding == UTF16BE )
424
+ {
425
+ if ( !TY_(IsValidUTF16FromUCS4)(c) )
426
+ {
427
+ /* invalid UTF-16 value */
428
+ TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes);
429
+ c = 0;
430
+ }
431
+ else if ( TY_(IsLowSurrogate)(c) )
432
+ {
433
+ uint n = c;
434
+ uint m = ReadCharFromStream( in );
435
+ if ( m == EndOfStream )
436
+ return EndOfStream;
437
+
438
+ c = 0;
439
+ if ( TY_(IsHighSurrogate)(m) )
440
+ {
441
+ n = TY_(CombineSurrogatePair)( m, n );
442
+ if ( TY_(IsValidCombinedChar)(n) )
443
+ c = n;
444
+ }
445
+ /* not a valid pair */
446
+ if ( 0 == c )
447
+ TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes );
448
+ }
449
+ }
450
+ #endif
451
+
452
+ /* Do first: acts on range 128 - 255 */
453
+ switch ( in->encoding )
454
+ {
455
+ case MACROMAN:
456
+ c = TY_(DecodeMacRoman)( c );
457
+ break;
458
+ case IBM858:
459
+ c = DecodeIbm850( c );
460
+ break;
461
+ case LATIN0:
462
+ c = DecodeLatin0( c );
463
+ break;
464
+ }
465
+
466
+ /* produced e.g. as a side-effect of smart quotes in Word */
467
+ /* but can't happen if using MACROMAN encoding */
468
+ if ( 127 < c && c < 160 )
469
+ {
470
+ uint c1 = 0, replMode = DISCARDED_CHAR;
471
+ Bool isVendorChar = ( in->encoding == WIN1252 ||
472
+ in->encoding == MACROMAN );
473
+ Bool isWinChar = ( in->encoding == WIN1252 ||
474
+ TY_(ReplacementCharEncoding) == WIN1252 );
475
+ Bool isMacChar = ( in->encoding == MACROMAN ||
476
+ TY_(ReplacementCharEncoding) == MACROMAN );
477
+
478
+ /* set error position just before offending character */
479
+ if (in->doc->lexer)
480
+ {
481
+ in->doc->lexer->lines = in->curline;
482
+ in->doc->lexer->columns = in->curcol;
483
+ }
484
+
485
+ if ( isWinChar )
486
+ c1 = TY_(DecodeWin1252)( c );
487
+ else if ( isMacChar )
488
+ c1 = TY_(DecodeMacRoman)( c );
489
+ if ( c1 )
490
+ replMode = REPLACED_CHAR;
491
+
492
+ if ( c1 == 0 && isVendorChar )
493
+ TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
494
+ else if ( ! isVendorChar )
495
+ TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
496
+
497
+ c = c1;
498
+ }
499
+
500
+ if ( c == 0 )
501
+ continue; /* illegal char is discarded */
502
+
503
+ in->curcol++;
504
+ break;
505
+ }
506
+
507
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
508
+ if (!added)
509
+ TY_(AddCharToOriginalText)(in, (tchar)c);
510
+ #endif
511
+
512
+ return c;
513
+ }
514
+
515
+ static uint PopChar( StreamIn *in )
516
+ {
517
+ uint c = EndOfStream;
518
+ if ( in->pushed )
519
+ {
520
+ assert( in->bufpos > 0 );
521
+ c = in->charbuf[ --in->bufpos ];
522
+ if ( in->bufpos == 0 )
523
+ in->pushed = no;
524
+
525
+ if ( c == '\n' )
526
+ {
527
+ in->curcol = 1;
528
+ in->curline++;
529
+ PopLastPos( in );
530
+ return c;
531
+ }
532
+ in->curcol++;
533
+ PopLastPos( in );
534
+ }
535
+ return c;
536
+ }
537
+
538
+ void TY_(UngetChar)( uint c, StreamIn *in )
539
+ {
540
+ if (c == EndOfStream)
541
+ {
542
+ /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
543
+ return;
544
+ }
545
+
546
+ in->pushed = yes;
547
+
548
+ if (in->bufpos + 1 >= in->bufsize)
549
+ in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize));
550
+
551
+ in->charbuf[(in->bufpos)++] = c;
552
+
553
+ if (c == '\n')
554
+ --(in->curline);
555
+
556
+ RestoreLastPos( in );
557
+ }
558
+
559
+
560
+
561
+ /************************
562
+ ** Sink
563
+ ************************/
564
+
565
+ static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl )
566
+ {
567
+ StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) );
568
+ TidyClearMemory( out, sizeof(StreamOut) );
569
+ out->encoding = encoding;
570
+ out->state = FSM_ASCII;
571
+ out->nl = nl;
572
+ return out;
573
+ }
574
+
575
+ StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl )
576
+ {
577
+ StreamOut* out = initStreamOut( doc, encoding, nl );
578
+ TY_(initFileSink)( &out->sink, fp );
579
+ out->iotype = FileIO;
580
+ return out;
581
+ }
582
+ StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl )
583
+ {
584
+ StreamOut* out = initStreamOut( doc, encoding, nl );
585
+ tidyInitOutputBuffer( &out->sink, buf );
586
+ out->iotype = BufferIO;
587
+ return out;
588
+ }
589
+ StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl )
590
+ {
591
+ StreamOut* out = initStreamOut( doc, encoding, nl );
592
+ memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
593
+ out->iotype = UserIO;
594
+ return out;
595
+ }
596
+
597
+ void TY_(WriteChar)( uint c, StreamOut* out )
598
+ {
599
+ /* Translate outgoing newlines */
600
+ if ( LF == c )
601
+ {
602
+ if ( out->nl == TidyCRLF )
603
+ TY_(WriteChar)( CR, out );
604
+ else if ( out->nl == TidyCR )
605
+ c = CR;
606
+ }
607
+
608
+ if (out->encoding == MACROMAN)
609
+ {
610
+ EncodeMacRoman( c, out );
611
+ }
612
+ else if (out->encoding == WIN1252)
613
+ {
614
+ EncodeWin1252( c, out );
615
+ }
616
+ else if (out->encoding == IBM858)
617
+ {
618
+ EncodeIbm858( c, out );
619
+ }
620
+ else if (out->encoding == LATIN0)
621
+ {
622
+ EncodeLatin0( c, out );
623
+ }
624
+
625
+ else if (out->encoding == UTF8)
626
+ {
627
+ int count = 0;
628
+
629
+ TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count );
630
+ if (count <= 0)
631
+ {
632
+ /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
633
+ /* replacement char 0xFFFD encoded as UTF-8 */
634
+ PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out);
635
+ }
636
+ }
637
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
638
+ else if (out->encoding == ISO2022)
639
+ {
640
+ if (c == 0x1b) /* ESC */
641
+ out->state = FSM_ESC;
642
+ else
643
+ {
644
+ switch (out->state)
645
+ {
646
+ case FSM_ESC:
647
+ if (c == '$')
648
+ out->state = FSM_ESCD;
649
+ else if (c == '(')
650
+ out->state = FSM_ESCP;
651
+ else
652
+ out->state = FSM_ASCII;
653
+ break;
654
+
655
+ case FSM_ESCD:
656
+ if (c == '(')
657
+ out->state = FSM_ESCDP;
658
+ else
659
+ out->state = FSM_NONASCII;
660
+ break;
661
+
662
+ case FSM_ESCDP:
663
+ out->state = FSM_NONASCII;
664
+ break;
665
+
666
+ case FSM_ESCP:
667
+ out->state = FSM_ASCII;
668
+ break;
669
+
670
+ case FSM_NONASCII:
671
+ c &= 0x7F;
672
+ break;
673
+
674
+ case FSM_ASCII:
675
+ break;
676
+ }
677
+ }
678
+
679
+ PutByte(c, out);
680
+ }
681
+ #endif /* NO_NATIVE_ISO2022_SUPPORT */
682
+
683
+ #if SUPPORT_UTF16_ENCODINGS
684
+ else if ( out->encoding == UTF16LE ||
685
+ out->encoding == UTF16BE ||
686
+ out->encoding == UTF16 )
687
+ {
688
+ int i, numChars = 1;
689
+ uint theChars[2];
690
+
691
+ if ( !TY_(IsValidUTF16FromUCS4)(c) )
692
+ {
693
+ /* invalid UTF-16 value */
694
+ /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
695
+ c = 0;
696
+ numChars = 0;
697
+ }
698
+ else if ( TY_(IsCombinedChar)(c) )
699
+ {
700
+ /* output both, unless something goes wrong */
701
+ numChars = 2;
702
+ if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) )
703
+ {
704
+ /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
705
+ c = 0;
706
+ numChars = 0;
707
+ }
708
+ }
709
+ else
710
+ {
711
+ /* just put the char out */
712
+ theChars[0] = c;
713
+ }
714
+
715
+ for (i = 0; i < numChars; i++)
716
+ {
717
+ c = theChars[i];
718
+
719
+ if (out->encoding == UTF16LE)
720
+ {
721
+ uint ch = c & 0xFF; PutByte(ch, out);
722
+ ch = (c >> 8) & 0xFF; PutByte(ch, out);
723
+ }
724
+
725
+ else if (out->encoding == UTF16BE || out->encoding == UTF16)
726
+ {
727
+ uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
728
+ ch = c & 0xFF; PutByte(ch, out);
729
+ }
730
+ }
731
+ }
732
+ #endif
733
+
734
+ #if SUPPORT_ASIAN_ENCODINGS
735
+ else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
736
+ {
737
+ if (c < 128)
738
+ PutByte(c, out);
739
+ else
740
+ {
741
+ uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
742
+ ch = c & 0xFF; PutByte(ch, out);
743
+ }
744
+ }
745
+ #endif
746
+
747
+ else
748
+ PutByte( c, out );
749
+ }
750
+
751
+
752
+
753
+ /****************************
754
+ ** Miscellaneous / Helpers
755
+ ****************************/
756
+
757
+ /* char encoding used when replacing illegal SGML chars,
758
+ ** regardless of specified encoding. Set at compile time
759
+ ** to either Windows or Mac.
760
+ */
761
+ const int TY_(ReplacementCharEncoding) = DFLT_REPL_CHARENC;
762
+
763
+
764
+ /* Mapping for Windows Western character set CP 1252
765
+ ** (chars 128-159/U+0080-U+009F) to Unicode.
766
+ */
767
+ static const uint Win2Unicode[32] =
768
+ {
769
+ 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
770
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
771
+ 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
772
+ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
773
+ };
774
+
775
+ /* Function for conversion from Windows-1252 to Unicode */
776
+ uint TY_(DecodeWin1252)(uint c)
777
+ {
778
+ if (127 < c && c < 160)
779
+ c = Win2Unicode[c - 128];
780
+
781
+ return c;
782
+ }
783
+
784
+ static void EncodeWin1252( uint c, StreamOut* out )
785
+ {
786
+ if (c < 128 || (c > 159 && c < 256))
787
+ PutByte(c, out);
788
+ else
789
+ {
790
+ int i;
791
+
792
+ for (i = 128; i < 160; i++)
793
+ if (Win2Unicode[i - 128] == c)
794
+ {
795
+ PutByte(i, out);
796
+ break;
797
+ }
798
+ }
799
+ }
800
+
801
+ /*
802
+ John Love-Jensen contributed this table for mapping MacRoman
803
+ character set to Unicode
804
+ */
805
+
806
+ /* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
807
+ static const uint Mac2Unicode[128] =
808
+ {
809
+ /* x7F = DEL */
810
+
811
+ 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
812
+ 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
813
+
814
+ 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
815
+ 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
816
+
817
+ 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
818
+ 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
819
+
820
+ 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
821
+ /* =BD U+2126 OHM SIGN */
822
+ 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
823
+
824
+ 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
825
+ 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
826
+
827
+ 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
828
+ /* =DB U+00A4 CURRENCY SIGN */
829
+ 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
830
+
831
+ 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
832
+ 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
833
+ /* xF0 = Apple Logo */
834
+ /* =F0 U+2665 BLACK HEART SUIT */
835
+ 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
836
+ 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
837
+ };
838
+
839
+ /* Function to convert from MacRoman to Unicode */
840
+ uint TY_(DecodeMacRoman)(uint c)
841
+ {
842
+ if (127 < c)
843
+ c = Mac2Unicode[c - 128];
844
+ return c;
845
+ }
846
+
847
+ static void EncodeMacRoman( uint c, StreamOut* out )
848
+ {
849
+ if (c < 128)
850
+ PutByte(c, out);
851
+ else
852
+ {
853
+ /* For mac users, map Unicode back to MacRoman. */
854
+ int i;
855
+ for (i = 128; i < 256; i++)
856
+ {
857
+ if (Mac2Unicode[i - 128] == c)
858
+ {
859
+ PutByte(i, out);
860
+ break;
861
+ }
862
+ }
863
+ }
864
+ }
865
+
866
+ /* Mapping for OS/2 Western character set CP 850
867
+ ** (chars 128-255) to Unicode.
868
+ */
869
+ static const uint IBM2Unicode[128] =
870
+ {
871
+ 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
872
+ 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
873
+ 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
874
+ 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
875
+ 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
876
+ 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
877
+ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
878
+ 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
879
+ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
880
+ 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
881
+ 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
882
+ 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
883
+ 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
884
+ 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
885
+ 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
886
+ 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
887
+ };
888
+
889
+ /* Function for conversion from OS/2-850 to Unicode */
890
+ static uint DecodeIbm850(uint c)
891
+ {
892
+ if (127 < c && c < 256)
893
+ c = IBM2Unicode[c - 128];
894
+
895
+ return c;
896
+ }
897
+
898
+ /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
899
+ static void EncodeIbm858( uint c, StreamOut* out )
900
+ {
901
+ if (c < 128)
902
+ PutByte(c, out);
903
+ else
904
+ {
905
+ int i;
906
+ for (i = 128; i < 256; i++)
907
+ {
908
+ if (IBM2Unicode[i - 128] == c)
909
+ {
910
+ PutByte(i, out);
911
+ break;
912
+ }
913
+ }
914
+ }
915
+ }
916
+
917
+
918
+ /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
919
+ static uint DecodeLatin0(uint c)
920
+ {
921
+ if (159 < c && c < 191)
922
+ {
923
+ switch (c)
924
+ {
925
+ case 0xA4: c = 0x20AC; break;
926
+ case 0xA6: c = 0x0160; break;
927
+ case 0xA8: c = 0x0161; break;
928
+ case 0xB4: c = 0x017D; break;
929
+ case 0xB8: c = 0x017E; break;
930
+ case 0xBC: c = 0x0152; break;
931
+ case 0xBD: c = 0x0153; break;
932
+ case 0xBE: c = 0x0178; break;
933
+ }
934
+ }
935
+ return c;
936
+ }
937
+
938
+ /* Map Unicode back to ISO-8859-15. */
939
+ static void EncodeLatin0( uint c, StreamOut* out )
940
+ {
941
+ switch (c)
942
+ {
943
+ case 0x20AC: c = 0xA4; break;
944
+ case 0x0160: c = 0xA6; break;
945
+ case 0x0161: c = 0xA8; break;
946
+ case 0x017D: c = 0xB4; break;
947
+ case 0x017E: c = 0xB8; break;
948
+ case 0x0152: c = 0xBC; break;
949
+ case 0x0153: c = 0xBD; break;
950
+ case 0x0178: c = 0xBE; break;
951
+ }
952
+ PutByte(c, out);
953
+ }
954
+
955
+ /*
956
+ Table to map symbol font characters to Unicode; undefined
957
+ characters are mapped to 0x0000 and characters without any
958
+ Unicode equivalent are mapped to '?'. Is this appropriate?
959
+ */
960
+
961
+ static const uint Symbol2Unicode[] =
962
+ {
963
+ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
964
+ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
965
+
966
+ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
967
+ 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
968
+
969
+ 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
970
+ 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
971
+
972
+ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
973
+ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
974
+
975
+ 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
976
+ 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
977
+
978
+ 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
979
+ 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
980
+
981
+ 0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
982
+ 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
983
+
984
+ 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
985
+ 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,
986
+
987
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
988
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
989
+
990
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
991
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
992
+
993
+ 0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
994
+ 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
995
+
996
+ 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
997
+ 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,
998
+
999
+ 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
1000
+ 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
1001
+
1002
+ 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
1003
+ 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
1004
+
1005
+ 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
1006
+ 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
1007
+
1008
+ 0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
1009
+ 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
1010
+ };
1011
+
1012
+ #if 0
1013
+ /* Function to convert from Symbol Font chars to Unicode */
1014
+ uint DecodeSymbolFont(uint c)
1015
+ {
1016
+ if (c > 255)
1017
+ return c;
1018
+
1019
+ /* todo: add some error message */
1020
+
1021
+ return Symbol2Unicode[c];
1022
+ }
1023
+ #endif
1024
+
1025
+
1026
+ /* Facilitates user defined source by providing
1027
+ ** an entry point to marshal pointers-to-functions.
1028
+ ** Needed by .NET and possibly other language bindings.
1029
+ */
1030
+ Bool TIDY_CALL tidyInitSource( TidyInputSource* source,
1031
+ void* srcData,
1032
+ TidyGetByteFunc gbFunc,
1033
+ TidyUngetByteFunc ugbFunc,
1034
+ TidyEOFFunc endFunc )
1035
+ {
1036
+ Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
1037
+
1038
+ if ( status )
1039
+ {
1040
+ source->sourceData = srcData;
1041
+ source->getByte = gbFunc;
1042
+ source->ungetByte = ugbFunc;
1043
+ source->eof = endFunc;
1044
+ }
1045
+
1046
+ return status;
1047
+ }
1048
+
1049
+ Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
1050
+ void* snkData,
1051
+ TidyPutByteFunc pbFunc )
1052
+ {
1053
+ Bool status = ( sink && snkData && pbFunc );
1054
+ if ( status )
1055
+ {
1056
+ sink->sinkData = snkData;
1057
+ sink->putByte = pbFunc;
1058
+ }
1059
+ return status;
1060
+ }
1061
+
1062
+ /* GetByte must return a byte value in a signed
1063
+ ** integer so that a negative value can signal EOF
1064
+ ** without interfering w/ 0-255 legitimate byte values.
1065
+ */
1066
+ uint TIDY_CALL tidyGetByte( TidyInputSource* source )
1067
+ {
1068
+ int bv = source->getByte( source->sourceData );
1069
+ return (uint) bv;
1070
+ }
1071
+ Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
1072
+ {
1073
+ return source->eof( source->sourceData );
1074
+ }
1075
+ void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
1076
+ {
1077
+ source->ungetByte( source->sourceData, (byte) ch );
1078
+ }
1079
+ void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
1080
+ {
1081
+ sink->putByte( sink->sinkData, (byte) ch );
1082
+ }
1083
+
1084
+ static uint ReadByte( StreamIn* in )
1085
+ {
1086
+ return tidyGetByte( &in->source );
1087
+ }
1088
+ Bool TY_(IsEOF)( StreamIn* in )
1089
+ {
1090
+ return tidyIsEOF( &in->source );
1091
+ }
1092
+ static void UngetByte( StreamIn* in, uint byteValue )
1093
+ {
1094
+ tidyUngetByte( &in->source, byteValue );
1095
+ }
1096
+ static void PutByte( uint byteValue, StreamOut* out )
1097
+ {
1098
+ tidyPutByte( &out->sink, byteValue );
1099
+ }
1100
+
1101
+ #if 0
1102
+ static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
1103
+ {
1104
+ int i;
1105
+
1106
+ for (i = 0; i < *count; i++)
1107
+ {
1108
+ /* should never get here; testing for 0xFF, a valid char, is not a good idea */
1109
+ if ( in && TY_(IsEOF)(in) )
1110
+ {
1111
+ /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
1112
+ *count = -i;
1113
+ return;
1114
+ }
1115
+
1116
+ in->source.ungetByte( in->source.sourceData, buf[i] );
1117
+ }
1118
+ }
1119
+
1120
+ /*
1121
+ Read raw bytes from stream, return <= 0 if EOF; or if
1122
+ "unget" is true, Unget the bytes to re-synchronize the input stream
1123
+ Normally UTF-8 successor bytes are read using this routine.
1124
+ */
1125
+ static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )
1126
+ {
1127
+ int ix;
1128
+ for ( ix=0; ix < *count; ++ix )
1129
+ {
1130
+ if ( in->rawPushed )
1131
+ {
1132
+ buf[ix] = in->rawBytebuf[ --in->rawBufpos ];
1133
+ if ( in->rawBufpos == 0 )
1134
+ in->rawPushed = no;
1135
+ }
1136
+ else
1137
+ {
1138
+ if ( in->source.eof(in->source.sourceData) )
1139
+ {
1140
+ *count = -i;
1141
+ break;
1142
+ }
1143
+ buf[ix] = in->source.getByte( in->source.sourceData );
1144
+ }
1145
+ }
1146
+ }
1147
+ #endif /* 0 */
1148
+
1149
/* read char from stream
**
** Reads one byte from `in` and, depending on in->encoding, combines
** it with further bytes into a single value:
**   - ISO2022: returns the byte, with the top bit set while the
**     escape-sequence state machine is in the non-ASCII state;
**   - UTF16LE / UTF16 / UTF16BE: returns one 16-bit unit built from
**     two bytes (no surrogate pairing is done in this function);
**   - UTF8: returns the decoded character, or 0xFFFD after reporting
**     an INVALID_UTF8 encoding error;
**   - BIG5 / SHIFTJIS: returns single bytes as is, otherwise the lead
**     byte and the following byte combined as (lead << 8) + trail;
**   - anything else: returns the byte unchanged.
** Returns EndOfStream when the source is at EOF or a needed byte
** cannot be read.
*/
static uint ReadCharFromStream( StreamIn* in )
{
    uint c, n;
#ifdef TIDY_WIN32_MLANG_SUPPORT
    uint bytesRead = 0;
#endif

    if ( TY_(IsEOF)(in) )
        return EndOfStream;

    c = ReadByte( in );

    /* the source may also signal EOF through the byte value itself */
    if (c == EndOfStream)
        return c;

#ifndef NO_NATIVE_ISO2022_SUPPORT
    /*
       A document in ISO-2022 based encoding uses some ESC sequences
       called "designator" to switch character sets. The designators
       defined and used in ISO-2022-JP are:

         "ESC" + "(" + ?     for ISO646 variants

         "ESC" + "$" + ?     and
         "ESC" + "$" + "(" + ?   for multibyte character sets

       Where ? stands for a single character used to indicate the
       character set for multibyte characters.

       Tidy handles this by preserving the escape sequence and
       setting the top bit of each byte for non-ascii chars. This
       bit is then cleared on output. The input stream keeps track
       of the state to determine when to set/clear the bit.
    */

    if (in->encoding == ISO2022)
    {
        if (c == 0x1b)  /* ESC */
        {
            in->state = FSM_ESC;
            return c;
        }

        /* advance the designator state machine; bytes that belong to
           an escape sequence are returned unmodified */
        switch (in->state)
        {
        case FSM_ESC:
            if (c == '$')
                in->state = FSM_ESCD;
            else if (c == '(')
                in->state = FSM_ESCP;
            else
                in->state = FSM_ASCII;
            break;

        case FSM_ESCD:
            if (c == '(')
                in->state = FSM_ESCDP;
            else
                in->state = FSM_NONASCII;
            break;

        case FSM_ESCDP:
            in->state = FSM_NONASCII;
            break;

        case FSM_ESCP:
            in->state = FSM_ASCII;
            break;

        case FSM_NONASCII:
            /* mark the byte as part of a multibyte character */
            c |= 0x80;
            break;

        case FSM_ASCII:
            break;
        }

        return c;
    }
#endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */

#if SUPPORT_UTF16_ENCODINGS
    if ( in->encoding == UTF16LE )
    {
        /* little-endian: the first byte read is the low-order byte */
        uint c1 = ReadByte( in );
        if ( EndOfStream == c1 )
            return EndOfStream;
        n = (c1 << 8) + c;
        return n;
    }

    if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
    {
        /* big-endian: the first byte read is the high-order byte */
        uint c1 = ReadByte( in );
        if ( EndOfStream == c1 )
            return EndOfStream;
        n = (c << 8) + c1;
        return n;
    }
#endif

    if ( in->encoding == UTF8 )
    {
        /* deal with UTF-8 encoded char */

        int err, count = 0;

        /* first byte "c" is passed in separately */
        err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count );
        /* NOTE(review): presumably the decoder reports EOF this way
           (no error, n == EndOfStream, count == 1) - confirm against
           DecodeUTF8BytesToChar */
        if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
            return EndOfStream;
        else if (err)
        {
            /* set error position just before offending character */
            in->doc->lexer->lines = in->curline;
            in->doc->lexer->columns = in->curcol;

            TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no);
            n = 0xFFFD; /* replacement char */
        }

        return n;
    }

#if SUPPORT_ASIAN_ENCODINGS
    /*
       This section is suitable for any "multibyte" variable-width
       character encoding in which a one-byte code is less than
       128, and the first byte of a two-byte code is greater or
       equal to 128. Note that Big5 and ShiftJIS fit into this
       kind, even though their second byte may be less than 128
    */
    if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
    {
        if (c < 128)
            return c;
        else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
        {
            /*
              Rick Cameron pointed out that for Shift_JIS, the values from
              0xa1 through 0xdf represent single-byte characters
              (U+FF61 to U+FF9F - half-width Katakana)
            */
            return c;
        }
        else
        {
            /* two-byte code: lead byte in the high 8 bits, trail byte low */
            uint c1 = ReadByte( in );
            if ( EndOfStream == c1 )
                return EndOfStream;
            n = (c << 8) + c1;
            return n;
        }
    }
#endif

    /* The "else" branches below attach to whichever "if" statement is
       the last one left in front of them after preprocessing (the
       BIG5/SHIFTJIS test when SUPPORT_ASIAN_ENCODINGS is enabled,
       otherwise the UTF8 test). Keep the ordering intact. */
#ifdef TIDY_WIN32_MLANG_SUPPORT
    else if (in->encoding > WIN32MLANG)
    {
        assert( in->mlang != NULL );
        return TY_(Win32MLangGetChar)((byte)c, in, &bytesRead);
    }
#endif

    else
        n = c;

    return n;
}
1319
+
1320
+ /* Output a Byte Order Mark if required */
1321
+ void TY_(outBOM)( StreamOut *out )
1322
+ {
1323
+ if ( out->encoding == UTF8
1324
+ #if SUPPORT_UTF16_ENCODINGS
1325
+ || out->encoding == UTF16LE
1326
+ || out->encoding == UTF16BE
1327
+ || out->encoding == UTF16
1328
+ #endif
1329
+ )
1330
+ {
1331
+ /* this will take care of encoding the BOM correctly */
1332
+ TY_(WriteChar)( UNICODE_BOM, out );
1333
+ }
1334
+ }
1335
+
1336
/* This is an intermediate fix for various problems; in the    */
/* long term the code and data in charsets.c should be used.   */
/*                                                              */
/* Table tying each Tidy encoding id to two strings. Which rows */
/* exist depends on the SUPPORT_UTF16_ENCODINGS,                */
/* SUPPORT_ASIAN_ENCODINGS and NO_NATIVE_ISO2022_SUPPORT        */
/* build switches.                                              */
static struct _enc2iana
{
    uint id;             /* Tidy encoding id (ASCII, UTF8, ...) */
    ctmbstr name;        /* charset name, or NULL when none is given
                            (the struct tag suggests these are IANA
                            names - TODO confirm) */
    ctmbstr tidyOptName; /* name matched by GetCharEncodingFromOptName */
} const enc2iana[] =
{
  { ASCII,    "us-ascii",     "ascii"   },
  { LATIN0,   "iso-8859-15",  "latin0"  },
  { LATIN1,   "iso-8859-1",   "latin1"  },
  { UTF8,     "utf-8",        "utf8"    },
  { MACROMAN, "macintosh",    "mac"     },
  { WIN1252,  "windows-1252", "win1252" },
  { IBM858,   "ibm00858",     "ibm858"  },
#if SUPPORT_UTF16_ENCODINGS
  /* all three UTF-16 ids share the same charset name */
  { UTF16LE,  "utf-16",       "utf16le" },
  { UTF16BE,  "utf-16",       "utf16be" },
  { UTF16,    "utf-16",       "utf16"   },
#endif
#if SUPPORT_ASIAN_ENCODINGS
  { BIG5,     "big5",         "big5"    },
  { SHIFTJIS, "shift_jis",    "shiftjis"},
#endif
#ifndef NO_NATIVE_ISO2022_SUPPORT
  { ISO2022,  NULL,           "iso2022" },
#endif
  { RAW,      NULL,           "raw"     }
};
1366
+
1367
+ ctmbstr TY_(GetEncodingNameFromTidyId)(uint id)
1368
+ {
1369
+ uint i;
1370
+
1371
+ for (i = 0; enc2iana[i].name; ++i)
1372
+ if (enc2iana[i].id == id)
1373
+ return enc2iana[i].name;
1374
+
1375
+ return NULL;
1376
+ }
1377
+
1378
+ ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id)
1379
+ {
1380
+ uint i;
1381
+
1382
+ for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1383
+ if (enc2iana[i].id == id)
1384
+ return enc2iana[i].tidyOptName;
1385
+
1386
+ return NULL;
1387
+ }
1388
+
1389
+ int TY_(GetCharEncodingFromOptName)( ctmbstr charenc )
1390
+ {
1391
+ uint i;
1392
+
1393
+ for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1394
+ if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 )
1395
+ return enc2iana[i].id;
1396
+
1397
+ return -1;
1398
+ }
1399
+
1400
+ /*
1401
+ * local variables:
1402
+ * mode: c
1403
+ * indent-tabs-mode: nil
1404
+ * c-basic-offset: 4
1405
+ * eval: (c-set-offset 'substatement-open 0)
1406
+ * end:
1407
+ */