tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
@@ -0,0 +1,1407 @@
1
+ /* streamio.c -- handles character stream I/O
2
+
3
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: arnaud02 $
9
+ $Date: 2008/03/22 21:00:18 $
10
+ $Revision: 1.43 $
11
+
12
+ Wrapper around Tidy input source and output sink
13
+ that calls appropriate interfaces, and applies
14
+ necessary char encoding transformations: to/from
15
+ ISO-10646 and/or UTF-8.
16
+
17
+ */
18
+
19
+ #include <stdio.h>
20
+ #include <errno.h>
21
+
22
+ #include "streamio.h"
23
+ #include "tidy-int.h"
24
+ #include "lexer.h"
25
+ #include "message.h"
26
+ #include "utf8.h"
27
+ #include "tmbstr.h"
28
+
29
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
30
+ #include "win32tc.h"
31
+ #endif
32
+
33
+ /************************
34
+ ** Forward Declarations
35
+ ************************/
36
+
37
+ static uint ReadCharFromStream( StreamIn* in );
38
+
39
+ static uint ReadByte( StreamIn* in );
40
+ static void UngetByte( StreamIn* in, uint byteValue );
41
+
42
+ static void PutByte( uint byteValue, StreamOut* out );
43
+
44
+ static void EncodeWin1252( uint c, StreamOut* out );
45
+ static void EncodeMacRoman( uint c, StreamOut* out );
46
+ static void EncodeIbm858( uint c, StreamOut* out );
47
+ static void EncodeLatin0( uint c, StreamOut* out );
48
+
49
+ static uint DecodeIbm850(uint c);
50
+ static uint DecodeLatin0(uint c);
51
+
52
+ static uint PopChar( StreamIn *in );
53
+
54
+ /******************************
55
+ ** Static (duration) Globals
56
+ ******************************/
57
+
58
+ static StreamOut stderrStreamOut =
59
+ {
60
+ ASCII,
61
+ FSM_ASCII,
62
+ DEFAULT_NL_CONFIG,
63
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
64
+ NULL,
65
+ #endif
66
+ FileIO,
67
+ { 0, TY_(filesink_putByte) }
68
+ };
69
+
70
+ static StreamOut stdoutStreamOut =
71
+ {
72
+ ASCII,
73
+ FSM_ASCII,
74
+ DEFAULT_NL_CONFIG,
75
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
76
+ NULL,
77
+ #endif
78
+ FileIO,
79
+ { 0, TY_(filesink_putByte) }
80
+ };
81
+
82
+ StreamOut* TY_(StdErrOutput)(void)
83
+ {
84
+ if ( stderrStreamOut.sink.sinkData == 0 )
85
+ stderrStreamOut.sink.sinkData = stderr;
86
+ return &stderrStreamOut;
87
+ }
88
+
89
+ #if 0
90
+ StreamOut* TY_(StdOutOutput)(void)
91
+ {
92
+ if ( stdoutStreamOut.sink.sinkData == 0 )
93
+ stdoutStreamOut.sink.sinkData = stdout;
94
+ return &stdoutStreamOut;
95
+ }
96
+ #endif
97
+
98
+ void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out )
99
+ {
100
+ if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
101
+ {
102
+ if ( out->iotype == FileIO )
103
+ fclose( (FILE*) out->sink.sinkData );
104
+ TidyDocFree( doc, out );
105
+ }
106
+ }
107
+
108
+ /************************
109
+ ** Source
110
+ ************************/
111
+
112
+ static void InitLastPos( StreamIn *in );
113
+
114
+ StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding )
115
+ {
116
+ StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) );
117
+
118
+ TidyClearMemory( in, sizeof(StreamIn) );
119
+ in->curline = 1;
120
+ in->curcol = 1;
121
+ in->encoding = encoding;
122
+ in->state = FSM_ASCII;
123
+ in->doc = doc;
124
+ in->bufsize = CHARBUF_SIZE;
125
+ in->allocator = doc->allocator;
126
+ in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize);
127
+ InitLastPos( in );
128
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
129
+ in->otextbuf = NULL;
130
+ in->otextlen = 0;
131
+ in->otextsize = 0;
132
+ #endif
133
+ return in;
134
+ }
135
+
136
+ void TY_(freeStreamIn)(StreamIn* in)
137
+ {
138
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
139
+ if (in->otextbuf)
140
+ TidyFree(in->allocator, in->otextbuf);
141
+ #endif
142
+ TidyFree(in->allocator, in->charbuf);
143
+ TidyFree(in->allocator, in);
144
+ }
145
+
146
+ StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding )
147
+ {
148
+ StreamIn *in = TY_(initStreamIn)( doc, encoding );
149
+ if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 )
150
+ {
151
+ TY_(freeStreamIn)( in );
152
+ return NULL;
153
+ }
154
+ in->iotype = FileIO;
155
+ return in;
156
+ }
157
+
158
+ StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
159
+ {
160
+ StreamIn *in = TY_(initStreamIn)( doc, encoding );
161
+ tidyInitInputBuffer( &in->source, buf );
162
+ in->iotype = BufferIO;
163
+ return in;
164
+ }
165
+
166
+ StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding )
167
+ {
168
+ StreamIn *in = TY_(initStreamIn)( doc, encoding );
169
+ memcpy( &in->source, source, sizeof(TidyInputSource) );
170
+ in->iotype = UserIO;
171
+ return in;
172
+ }
173
+
174
+ int TY_(ReadBOMEncoding)(StreamIn *in)
175
+ {
176
+ uint c, c1;
177
+ #if SUPPORT_UTF16_ENCODINGS
178
+ uint bom;
179
+ #endif
180
+
181
+ c = ReadByte(in);
182
+ if (c == EndOfStream)
183
+ return -1;
184
+
185
+ c1 = ReadByte( in );
186
+ if (c1 == EndOfStream)
187
+ {
188
+ UngetByte(in, c);
189
+ return -1;
190
+ }
191
+
192
+ /* todo: dont warn about mismatch for auto input encoding */
193
+ /* todo: let the user override the encoding found here */
194
+
195
+ #if SUPPORT_UTF16_ENCODINGS
196
+ bom = (c << 8) + c1;
197
+
198
+ if ( bom == UNICODE_BOM_BE )
199
+ {
200
+ /* big-endian UTF-16 */
201
+ if ( in->encoding != UTF16 && in->encoding != UTF16BE )
202
+ TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE);
203
+
204
+ return UTF16BE; /* return decoded BOM */
205
+ }
206
+ else if (bom == UNICODE_BOM_LE)
207
+ {
208
+ /* little-endian UTF-16 */
209
+ if (in->encoding != UTF16 && in->encoding != UTF16LE)
210
+ TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE);
211
+
212
+ return UTF16LE; /* return decoded BOM */
213
+ }
214
+ else
215
+ #endif /* SUPPORT_UTF16_ENCODINGS */
216
+ {
217
+ uint c2 = ReadByte(in);
218
+
219
+ if (c2 == EndOfStream)
220
+ {
221
+ UngetByte(in, c1);
222
+ UngetByte(in, c);
223
+ return -1;
224
+ }
225
+
226
+ if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
227
+ {
228
+ /* UTF-8 */
229
+ if (in->encoding != UTF8)
230
+ TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8);
231
+
232
+ return UTF8;
233
+ }
234
+ else
235
+ UngetByte( in, c2 );
236
+ }
237
+
238
+ UngetByte(in, c1);
239
+ UngetByte(in, c);
240
+
241
+ return -1;
242
+ }
243
+
244
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
245
+ void TY_(AddByteToOriginalText)(StreamIn *in, tmbchar c)
246
+ {
247
+ if (in->otextlen + 1 >= in->otextsize)
248
+ {
249
+ size_t size = in->otextsize ? 1 : 2;
250
+ in->otextbuf = TidyRealloc(in->allocator, in->otextbuf, in->otextsize + size);
251
+ in->otextsize += size;
252
+ }
253
+ in->otextbuf[in->otextlen++] = c;
254
+ in->otextbuf[in->otextlen ] = 0;
255
+ }
256
+
257
+ void TY_(AddCharToOriginalText)(StreamIn *in, tchar c)
258
+ {
259
+ int i, err, count = 0;
260
+ tmbchar buf[10] = {0};
261
+
262
+ err = TY_(EncodeCharToUTF8Bytes)(c, buf, NULL, &count);
263
+
264
+ if (err)
265
+ {
266
+ /* replacement character 0xFFFD encoded as UTF-8 */
267
+ buf[0] = (byte) 0xEF;
268
+ buf[1] = (byte) 0xBF;
269
+ buf[2] = (byte) 0xBD;
270
+ count = 3;
271
+ }
272
+
273
+ for (i = 0; i < count; ++i)
274
+ TY_(AddByteToOriginalText)(in, buf[i]);
275
+ }
276
+ #endif
277
+
278
+ static void InitLastPos( StreamIn *in )
279
+ {
280
+ in->curlastpos = 0;
281
+ in->firstlastpos = 0;
282
+ }
283
+
284
+ static void PopLastPos( StreamIn *in )
285
+ {
286
+ in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE;
287
+ if ( in->curlastpos == in->firstlastpos )
288
+ in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE;
289
+ }
290
+
291
+ static void SaveLastPos( StreamIn *in )
292
+ {
293
+ PopLastPos( in );
294
+ in->lastcols[in->curlastpos] = in->curcol;
295
+ }
296
+
297
+ static void RestoreLastPos( StreamIn *in )
298
+ {
299
+ if ( in->firstlastpos == in->curlastpos )
300
+ in->curcol = 0;
301
+ else
302
+ {
303
+ in->curcol = in->lastcols[in->curlastpos];
304
+ if ( in->curlastpos == 0 )
305
+ in->curlastpos = LASTPOS_SIZE;
306
+ in->curlastpos--;
307
+ }
308
+ }
309
+
310
+ uint TY_(ReadChar)( StreamIn *in )
311
+ {
312
+ uint c = EndOfStream;
313
+ uint tabsize = cfg( in->doc, TidyTabSize );
314
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
315
+ Bool added = no;
316
+ #endif
317
+
318
+ if ( in->pushed )
319
+ return PopChar( in );
320
+
321
+ SaveLastPos( in );
322
+
323
+ if ( in->tabs > 0 )
324
+ {
325
+ in->curcol++;
326
+ in->tabs--;
327
+ return ' ';
328
+ }
329
+
330
+ for (;;)
331
+ {
332
+ c = ReadCharFromStream(in);
333
+
334
+ if ( EndOfStream == c )
335
+ return EndOfStream;
336
+
337
+ if (c == '\n')
338
+ {
339
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
340
+ added = yes;
341
+ TY_(AddCharToOriginalText)(in, (tchar)c);
342
+ #endif
343
+ in->curcol = 1;
344
+ in->curline++;
345
+ break;
346
+ }
347
+
348
+ if (c == '\t')
349
+ {
350
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
351
+ added = yes;
352
+ TY_(AddCharToOriginalText)(in, (tchar)c);
353
+ #endif
354
+ in->tabs = tabsize > 0 ?
355
+ tabsize - ((in->curcol - 1) % tabsize) - 1
356
+ : 0;
357
+ in->curcol++;
358
+ c = ' ';
359
+ break;
360
+ }
361
+
362
+ /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
363
+ if (c == '\r')
364
+ {
365
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
366
+ added = yes;
367
+ TY_(AddCharToOriginalText)(in, (tchar)c);
368
+ #endif
369
+ c = ReadCharFromStream(in);
370
+ if (c != '\n')
371
+ {
372
+ TY_(UngetChar)( c, in );
373
+ c = '\n';
374
+ }
375
+ else
376
+ {
377
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
378
+ TY_(AddCharToOriginalText)(in, (tchar)c);
379
+ #endif
380
+ }
381
+ in->curcol = 1;
382
+ in->curline++;
383
+ break;
384
+ }
385
+
386
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
387
+ /* strip control characters, except for Esc */
388
+ if (c == '\033')
389
+ break;
390
+ #endif
391
+
392
+ /* Form Feed is allowed in HTML */
393
+ if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
394
+ break;
395
+
396
+ if ( c < 32 )
397
+ continue; /* discard control char */
398
+
399
+ /* watch out for chars that have already been decoded such as */
400
+ /* IS02022, UTF-8 etc, that don't require further decoding */
401
+
402
+ if (
403
+ in->encoding == RAW
404
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
405
+ || in->encoding == ISO2022
406
+ #endif
407
+ || in->encoding == UTF8
408
+
409
+ #if SUPPORT_ASIAN_ENCODINGS
410
+ || in->encoding == SHIFTJIS /* #431953 - RJ */
411
+ || in->encoding == BIG5 /* #431953 - RJ */
412
+ #endif
413
+ )
414
+ {
415
+ in->curcol++;
416
+ break;
417
+ }
418
+
419
+ #if SUPPORT_UTF16_ENCODINGS
420
+ /* handle surrogate pairs */
421
+ if ( in->encoding == UTF16LE ||
422
+ in->encoding == UTF16 ||
423
+ in->encoding == UTF16BE )
424
+ {
425
+ if ( !TY_(IsValidUTF16FromUCS4)(c) )
426
+ {
427
+ /* invalid UTF-16 value */
428
+ TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes);
429
+ c = 0;
430
+ }
431
+ else if ( TY_(IsLowSurrogate)(c) )
432
+ {
433
+ uint n = c;
434
+ uint m = ReadCharFromStream( in );
435
+ if ( m == EndOfStream )
436
+ return EndOfStream;
437
+
438
+ c = 0;
439
+ if ( TY_(IsHighSurrogate)(m) )
440
+ {
441
+ n = TY_(CombineSurrogatePair)( m, n );
442
+ if ( TY_(IsValidCombinedChar)(n) )
443
+ c = n;
444
+ }
445
+ /* not a valid pair */
446
+ if ( 0 == c )
447
+ TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes );
448
+ }
449
+ }
450
+ #endif
451
+
452
+ /* Do first: acts on range 128 - 255 */
453
+ switch ( in->encoding )
454
+ {
455
+ case MACROMAN:
456
+ c = TY_(DecodeMacRoman)( c );
457
+ break;
458
+ case IBM858:
459
+ c = DecodeIbm850( c );
460
+ break;
461
+ case LATIN0:
462
+ c = DecodeLatin0( c );
463
+ break;
464
+ }
465
+
466
+ /* produced e.g. as a side-effect of smart quotes in Word */
467
+ /* but can't happen if using MACROMAN encoding */
468
+ if ( 127 < c && c < 160 )
469
+ {
470
+ uint c1 = 0, replMode = DISCARDED_CHAR;
471
+ Bool isVendorChar = ( in->encoding == WIN1252 ||
472
+ in->encoding == MACROMAN );
473
+ Bool isWinChar = ( in->encoding == WIN1252 ||
474
+ TY_(ReplacementCharEncoding) == WIN1252 );
475
+ Bool isMacChar = ( in->encoding == MACROMAN ||
476
+ TY_(ReplacementCharEncoding) == MACROMAN );
477
+
478
+ /* set error position just before offending character */
479
+ if (in->doc->lexer)
480
+ {
481
+ in->doc->lexer->lines = in->curline;
482
+ in->doc->lexer->columns = in->curcol;
483
+ }
484
+
485
+ if ( isWinChar )
486
+ c1 = TY_(DecodeWin1252)( c );
487
+ else if ( isMacChar )
488
+ c1 = TY_(DecodeMacRoman)( c );
489
+ if ( c1 )
490
+ replMode = REPLACED_CHAR;
491
+
492
+ if ( c1 == 0 && isVendorChar )
493
+ TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
494
+ else if ( ! isVendorChar )
495
+ TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
496
+
497
+ c = c1;
498
+ }
499
+
500
+ if ( c == 0 )
501
+ continue; /* illegal char is discarded */
502
+
503
+ in->curcol++;
504
+ break;
505
+ }
506
+
507
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
508
+ if (!added)
509
+ TY_(AddCharToOriginalText)(in, (tchar)c);
510
+ #endif
511
+
512
+ return c;
513
+ }
514
+
515
+ static uint PopChar( StreamIn *in )
516
+ {
517
+ uint c = EndOfStream;
518
+ if ( in->pushed )
519
+ {
520
+ assert( in->bufpos > 0 );
521
+ c = in->charbuf[ --in->bufpos ];
522
+ if ( in->bufpos == 0 )
523
+ in->pushed = no;
524
+
525
+ if ( c == '\n' )
526
+ {
527
+ in->curcol = 1;
528
+ in->curline++;
529
+ PopLastPos( in );
530
+ return c;
531
+ }
532
+ in->curcol++;
533
+ PopLastPos( in );
534
+ }
535
+ return c;
536
+ }
537
+
538
+ void TY_(UngetChar)( uint c, StreamIn *in )
539
+ {
540
+ if (c == EndOfStream)
541
+ {
542
+ /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
543
+ return;
544
+ }
545
+
546
+ in->pushed = yes;
547
+
548
+ if (in->bufpos + 1 >= in->bufsize)
549
+ in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize));
550
+
551
+ in->charbuf[(in->bufpos)++] = c;
552
+
553
+ if (c == '\n')
554
+ --(in->curline);
555
+
556
+ RestoreLastPos( in );
557
+ }
558
+
559
+
560
+
561
+ /************************
562
+ ** Sink
563
+ ************************/
564
+
565
+ static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl )
566
+ {
567
+ StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) );
568
+ TidyClearMemory( out, sizeof(StreamOut) );
569
+ out->encoding = encoding;
570
+ out->state = FSM_ASCII;
571
+ out->nl = nl;
572
+ return out;
573
+ }
574
+
575
+ StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl )
576
+ {
577
+ StreamOut* out = initStreamOut( doc, encoding, nl );
578
+ TY_(initFileSink)( &out->sink, fp );
579
+ out->iotype = FileIO;
580
+ return out;
581
+ }
582
+ StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl )
583
+ {
584
+ StreamOut* out = initStreamOut( doc, encoding, nl );
585
+ tidyInitOutputBuffer( &out->sink, buf );
586
+ out->iotype = BufferIO;
587
+ return out;
588
+ }
589
+ StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl )
590
+ {
591
+ StreamOut* out = initStreamOut( doc, encoding, nl );
592
+ memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
593
+ out->iotype = UserIO;
594
+ return out;
595
+ }
596
+
597
+ void TY_(WriteChar)( uint c, StreamOut* out )
598
+ {
599
+ /* Translate outgoing newlines */
600
+ if ( LF == c )
601
+ {
602
+ if ( out->nl == TidyCRLF )
603
+ TY_(WriteChar)( CR, out );
604
+ else if ( out->nl == TidyCR )
605
+ c = CR;
606
+ }
607
+
608
+ if (out->encoding == MACROMAN)
609
+ {
610
+ EncodeMacRoman( c, out );
611
+ }
612
+ else if (out->encoding == WIN1252)
613
+ {
614
+ EncodeWin1252( c, out );
615
+ }
616
+ else if (out->encoding == IBM858)
617
+ {
618
+ EncodeIbm858( c, out );
619
+ }
620
+ else if (out->encoding == LATIN0)
621
+ {
622
+ EncodeLatin0( c, out );
623
+ }
624
+
625
+ else if (out->encoding == UTF8)
626
+ {
627
+ int count = 0;
628
+
629
+ TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count );
630
+ if (count <= 0)
631
+ {
632
+ /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
633
+ /* replacement char 0xFFFD encoded as UTF-8 */
634
+ PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out);
635
+ }
636
+ }
637
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
638
+ else if (out->encoding == ISO2022)
639
+ {
640
+ if (c == 0x1b) /* ESC */
641
+ out->state = FSM_ESC;
642
+ else
643
+ {
644
+ switch (out->state)
645
+ {
646
+ case FSM_ESC:
647
+ if (c == '$')
648
+ out->state = FSM_ESCD;
649
+ else if (c == '(')
650
+ out->state = FSM_ESCP;
651
+ else
652
+ out->state = FSM_ASCII;
653
+ break;
654
+
655
+ case FSM_ESCD:
656
+ if (c == '(')
657
+ out->state = FSM_ESCDP;
658
+ else
659
+ out->state = FSM_NONASCII;
660
+ break;
661
+
662
+ case FSM_ESCDP:
663
+ out->state = FSM_NONASCII;
664
+ break;
665
+
666
+ case FSM_ESCP:
667
+ out->state = FSM_ASCII;
668
+ break;
669
+
670
+ case FSM_NONASCII:
671
+ c &= 0x7F;
672
+ break;
673
+
674
+ case FSM_ASCII:
675
+ break;
676
+ }
677
+ }
678
+
679
+ PutByte(c, out);
680
+ }
681
+ #endif /* NO_NATIVE_ISO2022_SUPPORT */
682
+
683
+ #if SUPPORT_UTF16_ENCODINGS
684
+ else if ( out->encoding == UTF16LE ||
685
+ out->encoding == UTF16BE ||
686
+ out->encoding == UTF16 )
687
+ {
688
+ int i, numChars = 1;
689
+ uint theChars[2];
690
+
691
+ if ( !TY_(IsValidUTF16FromUCS4)(c) )
692
+ {
693
+ /* invalid UTF-16 value */
694
+ /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
695
+ c = 0;
696
+ numChars = 0;
697
+ }
698
+ else if ( TY_(IsCombinedChar)(c) )
699
+ {
700
+ /* output both, unless something goes wrong */
701
+ numChars = 2;
702
+ if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) )
703
+ {
704
+ /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
705
+ c = 0;
706
+ numChars = 0;
707
+ }
708
+ }
709
+ else
710
+ {
711
+ /* just put the char out */
712
+ theChars[0] = c;
713
+ }
714
+
715
+ for (i = 0; i < numChars; i++)
716
+ {
717
+ c = theChars[i];
718
+
719
+ if (out->encoding == UTF16LE)
720
+ {
721
+ uint ch = c & 0xFF; PutByte(ch, out);
722
+ ch = (c >> 8) & 0xFF; PutByte(ch, out);
723
+ }
724
+
725
+ else if (out->encoding == UTF16BE || out->encoding == UTF16)
726
+ {
727
+ uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
728
+ ch = c & 0xFF; PutByte(ch, out);
729
+ }
730
+ }
731
+ }
732
+ #endif
733
+
734
+ #if SUPPORT_ASIAN_ENCODINGS
735
+ else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
736
+ {
737
+ if (c < 128)
738
+ PutByte(c, out);
739
+ else
740
+ {
741
+ uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
742
+ ch = c & 0xFF; PutByte(ch, out);
743
+ }
744
+ }
745
+ #endif
746
+
747
+ else
748
+ PutByte( c, out );
749
+ }
750
+
751
+
752
+
753
+ /****************************
754
+ ** Miscellaneous / Helpers
755
+ ****************************/
756
+
757
+ /* char encoding used when replacing illegal SGML chars,
758
+ ** regardless of specified encoding. Set at compile time
759
+ ** to either Windows or Mac.
760
+ */
761
+ const int TY_(ReplacementCharEncoding) = DFLT_REPL_CHARENC;
762
+
763
+
764
+ /* Mapping for Windows Western character set CP 1252
765
+ ** (chars 128-159/U+0080-U+009F) to Unicode.
766
+ */
767
+ static const uint Win2Unicode[32] =
768
+ {
769
+ 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
770
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
771
+ 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
772
+ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
773
+ };
774
+
775
+ /* Function for conversion from Windows-1252 to Unicode */
776
+ uint TY_(DecodeWin1252)(uint c)
777
+ {
778
+ if (127 < c && c < 160)
779
+ c = Win2Unicode[c - 128];
780
+
781
+ return c;
782
+ }
783
+
784
+ static void EncodeWin1252( uint c, StreamOut* out )
785
+ {
786
+ if (c < 128 || (c > 159 && c < 256))
787
+ PutByte(c, out);
788
+ else
789
+ {
790
+ int i;
791
+
792
+ for (i = 128; i < 160; i++)
793
+ if (Win2Unicode[i - 128] == c)
794
+ {
795
+ PutByte(i, out);
796
+ break;
797
+ }
798
+ }
799
+ }
800
+
801
+ /*
802
+ John Love-Jensen contributed this table for mapping MacRoman
803
+ character set to Unicode
804
+ */
805
+
806
+ /* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
807
+ static const uint Mac2Unicode[128] =
808
+ {
809
+ /* x7F = DEL */
810
+
811
+ 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
812
+ 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
813
+
814
+ 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
815
+ 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
816
+
817
+ 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
818
+ 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
819
+
820
+ 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
821
+ /* =BD U+2126 OHM SIGN */
822
+ 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
823
+
824
+ 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
825
+ 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
826
+
827
+ 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
828
+ /* =DB U+00A4 CURRENCY SIGN */
829
+ 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
830
+
831
+ 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
832
+ 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
833
+ /* xF0 = Apple Logo */
834
+ /* =F0 U+2665 BLACK HEART SUIT */
835
+ 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
836
+ 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
837
+ };
838
+
839
+ /* Function to convert from MacRoman to Unicode */
840
+ uint TY_(DecodeMacRoman)(uint c)
841
+ {
842
+ if (127 < c)
843
+ c = Mac2Unicode[c - 128];
844
+ return c;
845
+ }
846
+
847
+ static void EncodeMacRoman( uint c, StreamOut* out )
848
+ {
849
+ if (c < 128)
850
+ PutByte(c, out);
851
+ else
852
+ {
853
+ /* For mac users, map Unicode back to MacRoman. */
854
+ int i;
855
+ for (i = 128; i < 256; i++)
856
+ {
857
+ if (Mac2Unicode[i - 128] == c)
858
+ {
859
+ PutByte(i, out);
860
+ break;
861
+ }
862
+ }
863
+ }
864
+ }
865
+
866
+ /* Mapping for OS/2 Western character set CP 850
867
+ ** (chars 128-255) to Unicode. Byte 0xD5 maps to the euro sign (U+20AC), i.e. the CP 858 variant.
868
+ */
869
+ static const uint IBM2Unicode[128] =
870
+ {
871
+ 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
872
+ 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
873
+ 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
874
+ 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
875
+ 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
876
+ 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
877
+ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
878
+ 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
879
+ 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
880
+ 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
881
+ 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
882
+ 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
883
+ 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
884
+ 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
885
+ 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
886
+ 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
887
+ };
888
+
889
+ /* Function for conversion from OS/2-850 to Unicode */
890
+ static uint DecodeIbm850(uint c)
891
+ {
892
+ if (127 < c && c < 256)
893
+ c = IBM2Unicode[c - 128];
894
+
895
+ return c;
896
+ }
897
+
898
+ /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
899
+ static void EncodeIbm858( uint c, StreamOut* out )
900
+ {
901
+ if (c < 128)
902
+ PutByte(c, out);
903
+ else
904
+ {
905
+ int i;
906
+ for (i = 128; i < 256; i++)
907
+ {
908
+ if (IBM2Unicode[i - 128] == c)
909
+ {
910
+ PutByte(i, out);
911
+ break;
912
+ }
913
+ }
914
+ }
915
+ }
916
+
917
+
918
+ /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
919
+ static uint DecodeLatin0(uint c)
920
+ {
921
+ if (159 < c && c < 191)
922
+ {
923
+ switch (c)
924
+ {
925
+ case 0xA4: c = 0x20AC; break;
926
+ case 0xA6: c = 0x0160; break;
927
+ case 0xA8: c = 0x0161; break;
928
+ case 0xB4: c = 0x017D; break;
929
+ case 0xB8: c = 0x017E; break;
930
+ case 0xBC: c = 0x0152; break;
931
+ case 0xBD: c = 0x0153; break;
932
+ case 0xBE: c = 0x0178; break;
933
+ }
934
+ }
935
+ return c;
936
+ }
937
+
938
+ /* Map Unicode back to ISO-8859-15. */
939
+ static void EncodeLatin0( uint c, StreamOut* out )
940
+ {
941
+ switch (c)
942
+ {
943
+ case 0x20AC: c = 0xA4; break;
944
+ case 0x0160: c = 0xA6; break;
945
+ case 0x0161: c = 0xA8; break;
946
+ case 0x017D: c = 0xB4; break;
947
+ case 0x017E: c = 0xB8; break;
948
+ case 0x0152: c = 0xBC; break;
949
+ case 0x0153: c = 0xBD; break;
950
+ case 0x0178: c = 0xBE; break;
951
+ }
952
+ PutByte(c, out);
953
+ }
954
+
955
+ /*
956
+ Table to map symbol font characters to Unicode; undefined
957
+ characters are mapped to 0x0000 and characters without any
958
+ Unicode equivalent are mapped to '?'. Is this appropriate?
959
+ */
960
+
961
+ static const uint Symbol2Unicode[] =
962
+ {
963
+ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
964
+ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
965
+
966
+ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
967
+ 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
968
+
969
+ 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
970
+ 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
971
+
972
+ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
973
+ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
974
+
975
+ 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
976
+ 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
977
+
978
+ 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
979
+ 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
980
+
981
+ 0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
982
+ 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
983
+
984
+ 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
985
+ 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,
986
+
987
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
988
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
989
+
990
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
991
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
992
+
993
+ 0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
994
+ 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
995
+
996
+ 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
997
+ 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,
998
+
999
+ 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
1000
+ 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
1001
+
1002
+ 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
1003
+ 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
1004
+
1005
+ 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
1006
+ 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
1007
+
1008
+ 0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
1009
+ 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
1010
+ };
1011
+
1012
+ #if 0
1013
+ /* Function to convert from Symbol Font chars to Unicode */
1014
+ uint DecodeSymbolFont(uint c)
1015
+ {
1016
+ if (c > 255)
1017
+ return c;
1018
+
1019
+ /* todo: add some error message */
1020
+
1021
+ return Symbol2Unicode[c];
1022
+ }
1023
+ #endif
1024
+
1025
+
1026
+ /* Facilitates user defined source by providing
1027
+ ** an entry point to marshal pointers-to-functions.
1028
+ ** Needed by .NET and possibly other language bindings.
1029
+ */
1030
+ Bool TIDY_CALL tidyInitSource( TidyInputSource* source,
1031
+ void* srcData,
1032
+ TidyGetByteFunc gbFunc,
1033
+ TidyUngetByteFunc ugbFunc,
1034
+ TidyEOFFunc endFunc )
1035
+ {
1036
+ Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
1037
+
1038
+ if ( status )
1039
+ {
1040
+ source->sourceData = srcData;
1041
+ source->getByte = gbFunc;
1042
+ source->ungetByte = ugbFunc;
1043
+ source->eof = endFunc;
1044
+ }
1045
+
1046
+ return status;
1047
+ }
1048
+
1049
+ Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
1050
+ void* snkData,
1051
+ TidyPutByteFunc pbFunc )
1052
+ {
1053
+ Bool status = ( sink && snkData && pbFunc );
1054
+ if ( status )
1055
+ {
1056
+ sink->sinkData = snkData;
1057
+ sink->putByte = pbFunc;
1058
+ }
1059
+ return status;
1060
+ }
1061
+
1062
+ /* GetByte must return a byte value in a signed
1063
+ ** integer so that a negative value can signal EOF
1064
+ ** without interfering w/ 0-255 legitimate byte values.
1065
+ */
1066
+ uint TIDY_CALL tidyGetByte( TidyInputSource* source )
1067
+ {
1068
+ int bv = source->getByte( source->sourceData );
1069
+ return (uint) bv;
1070
+ }
1071
+ Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
1072
+ {
1073
+ return source->eof( source->sourceData );
1074
+ }
1075
+ void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
1076
+ {
1077
+ source->ungetByte( source->sourceData, (byte) ch );
1078
+ }
1079
+ void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
1080
+ {
1081
+ sink->putByte( sink->sinkData, (byte) ch );
1082
+ }
1083
+
1084
+ static uint ReadByte( StreamIn* in )
1085
+ {
1086
+ return tidyGetByte( &in->source );
1087
+ }
1088
+ Bool TY_(IsEOF)( StreamIn* in )
1089
+ {
1090
+ return tidyIsEOF( &in->source );
1091
+ }
1092
+ static void UngetByte( StreamIn* in, uint byteValue )
1093
+ {
1094
+ tidyUngetByte( &in->source, byteValue );
1095
+ }
1096
+ static void PutByte( uint byteValue, StreamOut* out )
1097
+ {
1098
+ tidyPutByte( &out->sink, byteValue );
1099
+ }
1100
+
1101
+ #if 0
1102
+ static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
1103
+ {
1104
+ int i;
1105
+
1106
+ for (i = 0; i < *count; i++)
1107
+ {
1108
+ /* should never get here; testing for 0xFF, a valid char, is not a good idea */
1109
+ if ( in && TY_(IsEOF)(in) )
1110
+ {
1111
+ /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
1112
+ *count = -i;
1113
+ return;
1114
+ }
1115
+
1116
+ in->source.ungetByte( in->source.sourceData, buf[i] );
1117
+ }
1118
+ }
1119
+
1120
+ /*
1121
+ Read raw bytes from stream, return <= 0 if EOF; or if
1122
+ "unget" is true, Unget the bytes to re-synchronize the input stream
1123
+ Normally UTF-8 successor bytes are read using this routine.
1124
+ */
1125
+ static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )
1126
+ {
1127
+ int ix;
1128
+ for ( ix=0; ix < *count; ++ix )
1129
+ {
1130
+ if ( in->rawPushed )
1131
+ {
1132
+ buf[ix] = in->rawBytebuf[ --in->rawBufpos ];
1133
+ if ( in->rawBufpos == 0 )
1134
+ in->rawPushed = no;
1135
+ }
1136
+ else
1137
+ {
1138
+ if ( in->source.eof(in->source.sourceData) )
1139
+ {
1140
+ *count = -i;
1141
+ break;
1142
+ }
1143
+ buf[ix] = in->source.getByte( in->source.sourceData );
1144
+ }
1145
+ }
1146
+ }
1147
+ #endif /* 0 */
1148
+
1149
+ /* read char from stream */
1150
+ static uint ReadCharFromStream( StreamIn* in )
1151
+ {
1152
+ uint c, n;
1153
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
1154
+ uint bytesRead = 0;
1155
+ #endif
1156
+
1157
+ if ( TY_(IsEOF)(in) )
1158
+ return EndOfStream;
1159
+
1160
+ c = ReadByte( in );
1161
+
1162
+ if (c == EndOfStream)
1163
+ return c;
1164
+
1165
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
1166
+ /*
1167
+ A document in ISO-2022 based encoding uses some ESC sequences
1168
+ called "designator" to switch character sets. The designators
1169
+ defined and used in ISO-2022-JP are:
1170
+
1171
+ "ESC" + "(" + ? for ISO646 variants
1172
+
1173
+ "ESC" + "$" + ? and
1174
+ "ESC" + "$" + "(" + ? for multibyte character sets
1175
+
1176
+ Where ? stands for a single character used to indicate the
1177
+ character set for multibyte characters.
1178
+
1179
+ Tidy handles this by preserving the escape sequence and
1180
+ setting the top bit of each byte for non-ascii chars. This
1181
+ bit is then cleared on output. The input stream keeps track
1182
+ of the state to determine when to set/clear the bit.
1183
+ */
1184
+
1185
+ if (in->encoding == ISO2022)
1186
+ {
1187
+ if (c == 0x1b) /* ESC */
1188
+ {
1189
+ in->state = FSM_ESC;
1190
+ return c;
1191
+ }
1192
+
1193
+ switch (in->state)
1194
+ {
1195
+ case FSM_ESC:
1196
+ if (c == '$')
1197
+ in->state = FSM_ESCD;
1198
+ else if (c == '(')
1199
+ in->state = FSM_ESCP;
1200
+ else
1201
+ in->state = FSM_ASCII;
1202
+ break;
1203
+
1204
+ case FSM_ESCD:
1205
+ if (c == '(')
1206
+ in->state = FSM_ESCDP;
1207
+ else
1208
+ in->state = FSM_NONASCII;
1209
+ break;
1210
+
1211
+ case FSM_ESCDP:
1212
+ in->state = FSM_NONASCII;
1213
+ break;
1214
+
1215
+ case FSM_ESCP:
1216
+ in->state = FSM_ASCII;
1217
+ break;
1218
+
1219
+ case FSM_NONASCII:
1220
+ c |= 0x80;
1221
+ break;
1222
+
1223
+ case FSM_ASCII:
1224
+ break;
1225
+ }
1226
+
1227
+ return c;
1228
+ }
1229
+ #endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */
1230
+
1231
+ #if SUPPORT_UTF16_ENCODINGS
1232
+ if ( in->encoding == UTF16LE )
1233
+ {
1234
+ uint c1 = ReadByte( in );
1235
+ if ( EndOfStream == c1 )
1236
+ return EndOfStream;
1237
+ n = (c1 << 8) + c;
1238
+ return n;
1239
+ }
1240
+
1241
+ if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
1242
+ {
1243
+ uint c1 = ReadByte( in );
1244
+ if ( EndOfStream == c1 )
1245
+ return EndOfStream;
1246
+ n = (c << 8) + c1;
1247
+ return n;
1248
+ }
1249
+ #endif
1250
+
1251
+ if ( in->encoding == UTF8 )
1252
+ {
1253
+ /* deal with UTF-8 encoded char */
1254
+
1255
+ int err, count = 0;
1256
+
1257
+ /* first byte "c" is passed in separately */
1258
+ err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count );
1259
+ if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1260
+ return EndOfStream;
1261
+ else if (err)
1262
+ {
1263
+ /* set error position just before offending character */
1264
+ in->doc->lexer->lines = in->curline;
1265
+ in->doc->lexer->columns = in->curcol;
1266
+
1267
+ TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no);
1268
+ n = 0xFFFD; /* replacement char */
1269
+ }
1270
+
1271
+ return n;
1272
+ }
1273
+
1274
+ #if SUPPORT_ASIAN_ENCODINGS
1275
+ /*
1276
+ This section is suitable for any "multibyte" variable-width
1277
+ character encoding in which a one-byte code is less than
1278
+ 128, and the first byte of a two-byte code is greater or
1279
+ equal to 128. Note that Big5 and ShiftJIS fit into this
1280
+ kind, even though their second byte may be less than 128
1281
+ */
1282
+ if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1283
+ {
1284
+ if (c < 128)
1285
+ return c;
1286
+ else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1287
+ {
1288
+ /*
1289
+ Rick Cameron pointed out that for Shift_JIS, the values from
1290
+ 0xa1 through 0xdf represent singe-byte characters
1291
+ (U+FF61 to U+FF9F - half-shift Katakana)
1292
+ */
1293
+ return c;
1294
+ }
1295
+ else
1296
+ {
1297
+ uint c1 = ReadByte( in );
1298
+ if ( EndOfStream == c1 )
1299
+ return EndOfStream;
1300
+ n = (c << 8) + c1;
1301
+ return n;
1302
+ }
1303
+ }
1304
+ #endif
1305
+
1306
+ #ifdef TIDY_WIN32_MLANG_SUPPORT
1307
+ else if (in->encoding > WIN32MLANG)
1308
+ {
1309
+ assert( in->mlang != NULL );
1310
+ return TY_(Win32MLangGetChar)((byte)c, in, &bytesRead);
1311
+ }
1312
+ #endif
1313
+
1314
+ else
1315
+ n = c;
1316
+
1317
+ return n;
1318
+ }
1319
+
1320
+ /* Output a Byte Order Mark if required */
1321
+ void TY_(outBOM)( StreamOut *out )
1322
+ {
1323
+ if ( out->encoding == UTF8
1324
+ #if SUPPORT_UTF16_ENCODINGS
1325
+ || out->encoding == UTF16LE
1326
+ || out->encoding == UTF16BE
1327
+ || out->encoding == UTF16
1328
+ #endif
1329
+ )
1330
+ {
1331
+ /* this will take care of encoding the BOM correctly */
1332
+ TY_(WriteChar)( UNICODE_BOM, out );
1333
+ }
1334
+ }
1335
+
1336
+ /* this is an intermediate fix for various problems; in the */
1337
+ /* long term, the code and data in charsets.c should be used */
1338
+ static struct _enc2iana
1339
+ {
1340
+ uint id;
1341
+ ctmbstr name;
1342
+ ctmbstr tidyOptName;
1343
+ } const enc2iana[] =
1344
+ {
1345
+ { ASCII, "us-ascii", "ascii" },
1346
+ { LATIN0, "iso-8859-15", "latin0" },
1347
+ { LATIN1, "iso-8859-1", "latin1" },
1348
+ { UTF8, "utf-8", "utf8" },
1349
+ { MACROMAN, "macintosh", "mac" },
1350
+ { WIN1252, "windows-1252", "win1252" },
1351
+ { IBM858, "ibm00858", "ibm858" },
1352
+ #if SUPPORT_UTF16_ENCODINGS
1353
+ { UTF16LE, "utf-16", "utf16le" },
1354
+ { UTF16BE, "utf-16", "utf16be" },
1355
+ { UTF16, "utf-16", "utf16" },
1356
+ #endif
1357
+ #if SUPPORT_ASIAN_ENCODINGS
1358
+ { BIG5, "big5", "big5" },
1359
+ { SHIFTJIS, "shift_jis", "shiftjis"},
1360
+ #endif
1361
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
1362
+ { ISO2022, NULL, "iso2022" },
1363
+ #endif
1364
+ { RAW, NULL, "raw" }
1365
+ };
1366
+
1367
+ ctmbstr TY_(GetEncodingNameFromTidyId)(uint id)
1368
+ {
1369
+ uint i;
1370
+
1371
+ for (i = 0; enc2iana[i].name; ++i)
1372
+ if (enc2iana[i].id == id)
1373
+ return enc2iana[i].name;
1374
+
1375
+ return NULL;
1376
+ }
1377
+
1378
+ ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id)
1379
+ {
1380
+ uint i;
1381
+
1382
+ for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1383
+ if (enc2iana[i].id == id)
1384
+ return enc2iana[i].tidyOptName;
1385
+
1386
+ return NULL;
1387
+ }
1388
+
1389
+ int TY_(GetCharEncodingFromOptName)( ctmbstr charenc )
1390
+ {
1391
+ uint i;
1392
+
1393
+ for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1394
+ if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 )
1395
+ return enc2iana[i].id;
1396
+
1397
+ return -1;
1398
+ }
1399
+
1400
+ /*
1401
+ * local variables:
1402
+ * mode: c
1403
+ * indent-tabs-mode: nil
1404
+ * c-basic-offset: 4
1405
+ * eval: (c-set-offset 'substatement-open 0)
1406
+ * end:
1407
+ */