html5 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,818 @@
1
+ module HTML5
2
+ # Exception class with an empty body. It subclasses Exception directly (not StandardError), so a bare `rescue` does not catch it; handlers must name EOF explicitly. NOTE(review): presumably signals end of input - confirm at the raise sites.
3
+ class EOF < Exception; end
4
+ # The four content model flags, as Symbols. NOTE(review): presumably the values the tokenizer's content model can take - confirm in tokenizer.rb.
5
+ CONTENT_MODEL_FLAGS = [
6
+ :PCDATA,
7
+ :RCDATA,
8
+ :CDATA,
9
+ :PLAINTEXT
10
+ ]
11
+ # Lowercase tag names (Strings) classed as scoping elements. NOTE(review): presumably consulted by the tree builder's "element in scope" checks - confirm at call sites.
12
+ SCOPING_ELEMENTS = %w[
13
+ button
14
+ caption
15
+ html
16
+ marquee
17
+ object
18
+ table
19
+ td
20
+ th
21
+ ]
22
+ # Lowercase tag names (Strings) classed as formatting elements. NOTE(review): presumably the elements tracked in the parser's list of active formatting elements - confirm at call sites.
23
+ FORMATTING_ELEMENTS = %w[
24
+ a
25
+ b
26
+ big
27
+ em
28
+ font
29
+ i
30
+ nobr
31
+ s
32
+ small
33
+ strike
34
+ strong
35
+ tt
36
+ u
37
+ ]
38
+ # Lowercase tag names (Strings) classed as special elements, listed alphabetically. NOTE(review): how the parser uses this category is not visible here - confirm at call sites.
39
+ SPECIAL_ELEMENTS = %w[
40
+ address
41
+ area
42
+ base
43
+ basefont
44
+ bgsound
45
+ blockquote
46
+ body
47
+ br
48
+ center
49
+ col
50
+ colgroup
51
+ dd
52
+ dir
53
+ div
54
+ dl
55
+ dt
56
+ embed
57
+ fieldset
58
+ form
59
+ frame
60
+ frameset
61
+ h1
62
+ h2
63
+ h3
64
+ h4
65
+ h5
66
+ h6
67
+ head
68
+ hr
69
+ iframe
70
+ image
71
+ img
72
+ input
73
+ isindex
74
+ li
75
+ link
76
+ listing
77
+ menu
78
+ meta
79
+ noembed
80
+ noframes
81
+ noscript
82
+ ol
83
+ optgroup
84
+ option
85
+ p
86
+ param
87
+ plaintext
88
+ pre
89
+ script
90
+ select
91
+ spacer
92
+ style
93
+ tbody
94
+ textarea
95
+ tfoot
96
+ thead
97
+ title
98
+ tr
99
+ ul
100
+ wbr
101
+ ]
102
+ # Whitespace characters as one-character Strings: tab, line feed, vertical tab (0x0B), form feed (0x0C), space (0x20) and carriage return. The capital-W %W literal is required so the backslash escapes are processed.
103
+ SPACE_CHARACTERS = %W[
104
+ \t
105
+ \n
106
+ \x0B
107
+ \x0C
108
+ \x20
109
+ \r
110
+ ]
111
+ # Lowercase tag names (Strings) of the table-structure elements table, tbody, tfoot, thead and tr. NOTE(review): presumably the current-node names that switch the tree builder to its table insertion behaviour - confirm at call sites.
112
+ TABLE_INSERT_MODE_ELEMENTS = %w[
113
+ table
114
+ tbody
115
+ tfoot
116
+ thead
117
+ tr
118
+ ]
119
+ # Character-class constants. Note the mixed types: the ASCII_* constants are Strings, DIGITS is a Range and HEX_DIGITS is an Array, so callers should stick to operations all three support (e.g. include?).
120
+ ASCII_LOWERCASE = ('a'..'z').to_a.join('') # String of the letters a through z
121
+ ASCII_UPPERCASE = ('A'..'Z').to_a.join('') # String of the letters A through Z
122
+ ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE # String: lowercase letters followed by uppercase letters
123
+ DIGITS = '0'..'9' # Range of one-character Strings, not a String
124
+ HEX_DIGITS = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a # Array of one-character Strings: 0-9, a-f, A-F
125
+
126
+ # Heading element names, h1 through h6. The list must stay in this order.
127
+ HEADING_ELEMENTS = %w[
128
+ h1
129
+ h2
130
+ h3
131
+ h4
132
+ h5
133
+ h6
134
+ ]
135
+
136
+ # Lowercase tag names (Strings) listed as void elements. XXX: should event-source and command be listed here as well?
137
+ VOID_ELEMENTS = %w[
138
+ base
139
+ link
140
+ meta
141
+ hr
142
+ br
143
+ img
144
+ embed
145
+ param
146
+ area
147
+ col
148
+ input
149
+ ]
150
+ # Tag names grouped under the name CDATA_ELEMENTS. NOTE(review): the HTML5 specification treats title and textarea as RCDATA elements, so this name looks swapped relative to the spec - confirm how callers use it before renaming anything.
151
+ CDATA_ELEMENTS = %w[title textarea]
152
+ # Tag names grouped under the name RCDATA_ELEMENTS. NOTE(review): the HTML5 specification treats style, script and the other names below as CDATA (raw text) elements, so this name looks swapped relative to the spec - confirm how callers use it before renaming anything.
153
+ RCDATA_ELEMENTS = %w[
154
+ style
155
+ script
156
+ xmp
157
+ iframe
158
+ noembed
159
+ noframes
160
+ noscript
161
+ ]
162
+ # Maps a tag name (String key) to the Array of its boolean attribute names. The one Symbol key, :global, presumably lists attributes that are boolean on every element - confirm in the code that reads this table.
163
+ BOOLEAN_ATTRIBUTES = {
164
+ :global => %w[irrelevant],
165
+ 'style' => %w[scoped],
166
+ 'img' => %w[ismap],
167
+ 'audio' => %w[autoplay controls],
168
+ 'video' => %w[autoplay controls],
169
+ 'script' => %w[defer async],
170
+ 'details' => %w[open],
171
+ 'datagrid' => %w[multiple disabled],
172
+ 'command' => %w[hidden disabled checked default],
173
+ 'menu' => %w[autosubmit],
174
+ 'fieldset' => %w[disabled readonly],
175
+ 'option' => %w[disabled readonly selected],
176
+ 'optgroup' => %w[disabled readonly],
177
+ 'button' => %w[disabled autofocus],
178
+ 'input' => %w[disabled readonly required autofocus checked ismap],
179
+ 'select' => %w[disabled readonly autofocus multiple],
180
+ 'output' => %w[disabled readonly]
181
+
182
+ }
183
+
184
+ # ENTITIES_WINDOWS1252 must stay _ordered_ because it is meant to be indexed: per the row comments, index 0 is the code point for byte 0x80 and index 31 the one for byte 0x9F. 65533 (U+FFFD) marks the undefined bytes.
185
+ ENTITIES_WINDOWS1252 = [
186
+ 8364, # 0x80 0x20AC EURO SIGN
187
+ 65533, # 0x81 UNDEFINED
188
+ 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
189
+ 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
190
+ 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
191
+ 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
192
+ 8224, # 0x86 0x2020 DAGGER
193
+ 8225, # 0x87 0x2021 DOUBLE DAGGER
194
+ 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
195
+ 8240, # 0x89 0x2030 PER MILLE SIGN
196
+ 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
197
+ 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
198
+ 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
199
+ 65533, # 0x8D UNDEFINED
200
+ 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
201
+ 65533, # 0x8F UNDEFINED
202
+ 65533, # 0x90 UNDEFINED
203
+ 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
204
+ 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
205
+ 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
206
+ 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
207
+ 8226, # 0x95 0x2022 BULLET
208
+ 8211, # 0x96 0x2013 EN DASH
209
+ 8212, # 0x97 0x2014 EM DASH
210
+ 732, # 0x98 0x02DC SMALL TILDE
211
+ 8482, # 0x99 0x2122 TRADE MARK SIGN
212
+ 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
213
+ 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
214
+ 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
215
+ 65533, # 0x9D UNDEFINED
216
+ 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
217
+ 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
218
+ ]
219
+
220
+ # ENTITIES was generated by the following Python 2 code, run against the Python `constants` module that it imports:
221
+ #
222
+ # import constants
223
+ # entities = constants.entities.items()
224
+ # entities.sort()
225
+ # list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
226
+ # repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
227
+ # for entity, value in entities]
228
+ # print ' ENTITIES = {\n ' + ',\n '.join(list) + '\n }'
229
+ # Keys are entity names without the leading '&', with and/or without the trailing ';' exactly as listed; values are the replacement text, with non-ASCII characters written as escaped UTF-8 bytes.
230
+ ENTITIES = {
231
+ 'AElig' => "\xc3\x86",
232
+ 'AElig;' => "\xc3\x86",
233
+ 'AMP' => '&',
234
+ 'AMP;' => '&',
235
+ 'Aacute' => "\xc3\x81",
236
+ 'Aacute;' => "\xc3\x81",
237
+ 'Acirc' => "\xc3\x82",
238
+ 'Acirc;' => "\xc3\x82",
239
+ 'Agrave' => "\xc3\x80",
240
+ 'Agrave;' => "\xc3\x80",
241
+ 'Alpha;' => "\xce\x91",
242
+ 'Aring' => "\xc3\x85",
243
+ 'Aring;' => "\xc3\x85",
244
+ 'Atilde' => "\xc3\x83",
245
+ 'Atilde;' => "\xc3\x83",
246
+ 'Auml' => "\xc3\x84",
247
+ 'Auml;' => "\xc3\x84",
248
+ 'Beta;' => "\xce\x92",
249
+ 'COPY' => "\xc2\xa9",
250
+ 'COPY;' => "\xc2\xa9",
251
+ 'Ccedil' => "\xc3\x87",
252
+ 'Ccedil;' => "\xc3\x87",
253
+ 'Chi;' => "\xce\xa7",
254
+ 'Dagger;' => "\xe2\x80\xa1",
255
+ 'Delta;' => "\xce\x94",
256
+ 'ETH' => "\xc3\x90",
257
+ 'ETH;' => "\xc3\x90",
258
+ 'Eacute' => "\xc3\x89",
259
+ 'Eacute;' => "\xc3\x89",
260
+ 'Ecirc' => "\xc3\x8a",
261
+ 'Ecirc;' => "\xc3\x8a",
262
+ 'Egrave' => "\xc3\x88",
263
+ 'Egrave;' => "\xc3\x88",
264
+ 'Epsilon;' => "\xce\x95",
265
+ 'Eta;' => "\xce\x97",
266
+ 'Euml' => "\xc3\x8b",
267
+ 'Euml;' => "\xc3\x8b",
268
+ 'GT' => '>',
269
+ 'GT;' => '>',
270
+ 'Gamma;' => "\xce\x93",
271
+ 'Iacute' => "\xc3\x8d",
272
+ 'Iacute;' => "\xc3\x8d",
273
+ 'Icirc' => "\xc3\x8e",
274
+ 'Icirc;' => "\xc3\x8e",
275
+ 'Igrave' => "\xc3\x8c",
276
+ 'Igrave;' => "\xc3\x8c",
277
+ 'Iota;' => "\xce\x99",
278
+ 'Iuml' => "\xc3\x8f",
279
+ 'Iuml;' => "\xc3\x8f",
280
+ 'Kappa;' => "\xce\x9a",
281
+ 'LT' => '<',
282
+ 'LT;' => '<',
283
+ 'Lambda;' => "\xce\x9b",
284
+ 'Mu;' => "\xce\x9c",
285
+ 'Ntilde' => "\xc3\x91",
286
+ 'Ntilde;' => "\xc3\x91",
287
+ 'Nu;' => "\xce\x9d",
288
+ 'OElig;' => "\xc5\x92",
289
+ 'Oacute' => "\xc3\x93",
290
+ 'Oacute;' => "\xc3\x93",
291
+ 'Ocirc' => "\xc3\x94",
292
+ 'Ocirc;' => "\xc3\x94",
293
+ 'Ograve' => "\xc3\x92",
294
+ 'Ograve;' => "\xc3\x92",
295
+ 'Omega;' => "\xce\xa9",
296
+ 'Omicron;' => "\xce\x9f",
297
+ 'Oslash' => "\xc3\x98",
298
+ 'Oslash;' => "\xc3\x98",
299
+ 'Otilde' => "\xc3\x95",
300
+ 'Otilde;' => "\xc3\x95",
301
+ 'Ouml' => "\xc3\x96",
302
+ 'Ouml;' => "\xc3\x96",
303
+ 'Phi;' => "\xce\xa6",
304
+ 'Pi;' => "\xce\xa0",
305
+ 'Prime;' => "\xe2\x80\xb3",
306
+ 'Psi;' => "\xce\xa8",
307
+ 'QUOT' => '"',
308
+ 'QUOT;' => '"',
309
+ 'REG' => "\xc2\xae",
310
+ 'REG;' => "\xc2\xae",
311
+ 'Rho;' => "\xce\xa1",
312
+ 'Scaron;' => "\xc5\xa0",
313
+ 'Sigma;' => "\xce\xa3",
314
+ 'THORN' => "\xc3\x9e",
315
+ 'THORN;' => "\xc3\x9e",
316
+ 'TRADE;' => "\xe2\x84\xa2",
317
+ 'Tau;' => "\xce\xa4",
318
+ 'Theta;' => "\xce\x98",
319
+ 'Uacute' => "\xc3\x9a",
320
+ 'Uacute;' => "\xc3\x9a",
321
+ 'Ucirc' => "\xc3\x9b",
322
+ 'Ucirc;' => "\xc3\x9b",
323
+ 'Ugrave' => "\xc3\x99",
324
+ 'Ugrave;' => "\xc3\x99",
325
+ 'Upsilon;' => "\xce\xa5",
326
+ 'Uuml' => "\xc3\x9c",
327
+ 'Uuml;' => "\xc3\x9c",
328
+ 'Xi;' => "\xce\x9e",
329
+ 'Yacute' => "\xc3\x9d",
330
+ 'Yacute;' => "\xc3\x9d",
331
+ 'Yuml;' => "\xc5\xb8",
332
+ 'Zeta;' => "\xce\x96",
333
+ 'aacute' => "\xc3\xa1",
334
+ 'aacute;' => "\xc3\xa1",
335
+ 'acirc' => "\xc3\xa2",
336
+ 'acirc;' => "\xc3\xa2",
337
+ 'acute' => "\xc2\xb4",
338
+ 'acute;' => "\xc2\xb4",
339
+ 'aelig' => "\xc3\xa6",
340
+ 'aelig;' => "\xc3\xa6",
341
+ 'agrave' => "\xc3\xa0",
342
+ 'agrave;' => "\xc3\xa0",
343
+ 'alefsym;' => "\xe2\x84\xb5",
344
+ 'alpha;' => "\xce\xb1",
345
+ 'amp' => '&',
346
+ 'amp;' => '&',
347
+ 'and;' => "\xe2\x88\xa7",
348
+ 'ang;' => "\xe2\x88\xa0",
349
+ 'apos;' => "'",
350
+ 'aring' => "\xc3\xa5",
351
+ 'aring;' => "\xc3\xa5",
352
+ 'asymp;' => "\xe2\x89\x88",
353
+ 'atilde' => "\xc3\xa3",
354
+ 'atilde;' => "\xc3\xa3",
355
+ 'auml' => "\xc3\xa4",
356
+ 'auml;' => "\xc3\xa4",
357
+ 'bdquo;' => "\xe2\x80\x9e",
358
+ 'beta;' => "\xce\xb2",
359
+ 'brvbar' => "\xc2\xa6",
360
+ 'brvbar;' => "\xc2\xa6",
361
+ 'bull;' => "\xe2\x80\xa2",
362
+ 'cap;' => "\xe2\x88\xa9",
363
+ 'ccedil' => "\xc3\xa7",
364
+ 'ccedil;' => "\xc3\xa7",
365
+ 'cedil' => "\xc2\xb8",
366
+ 'cedil;' => "\xc2\xb8",
367
+ 'cent' => "\xc2\xa2",
368
+ 'cent;' => "\xc2\xa2",
369
+ 'chi;' => "\xcf\x87",
370
+ 'circ;' => "\xcb\x86",
371
+ 'clubs;' => "\xe2\x99\xa3",
372
+ 'cong;' => "\xe2\x89\x85",
373
+ 'copy' => "\xc2\xa9",
374
+ 'copy;' => "\xc2\xa9",
375
+ 'crarr;' => "\xe2\x86\xb5",
376
+ 'cup;' => "\xe2\x88\xaa",
377
+ 'curren' => "\xc2\xa4",
378
+ 'curren;' => "\xc2\xa4",
379
+ 'dArr;' => "\xe2\x87\x93",
380
+ 'dagger;' => "\xe2\x80\xa0",
381
+ 'darr;' => "\xe2\x86\x93",
382
+ 'deg' => "\xc2\xb0",
383
+ 'deg;' => "\xc2\xb0",
384
+ 'delta;' => "\xce\xb4",
385
+ 'diams;' => "\xe2\x99\xa6",
386
+ 'divide' => "\xc3\xb7",
387
+ 'divide;' => "\xc3\xb7",
388
+ 'eacute' => "\xc3\xa9",
389
+ 'eacute;' => "\xc3\xa9",
390
+ 'ecirc' => "\xc3\xaa",
391
+ 'ecirc;' => "\xc3\xaa",
392
+ 'egrave' => "\xc3\xa8",
393
+ 'egrave;' => "\xc3\xa8",
394
+ 'empty;' => "\xe2\x88\x85",
395
+ 'emsp;' => "\xe2\x80\x83",
396
+ 'ensp;' => "\xe2\x80\x82",
397
+ 'epsilon;' => "\xce\xb5",
398
+ 'equiv;' => "\xe2\x89\xa1",
399
+ 'eta;' => "\xce\xb7",
400
+ 'eth' => "\xc3\xb0",
401
+ 'eth;' => "\xc3\xb0",
402
+ 'euml' => "\xc3\xab",
403
+ 'euml;' => "\xc3\xab",
404
+ 'euro;' => "\xe2\x82\xac",
405
+ 'exist;' => "\xe2\x88\x83",
406
+ 'fnof;' => "\xc6\x92",
407
+ 'forall;' => "\xe2\x88\x80",
408
+ 'frac12' => "\xc2\xbd",
409
+ 'frac12;' => "\xc2\xbd",
410
+ 'frac14' => "\xc2\xbc",
411
+ 'frac14;' => "\xc2\xbc",
412
+ 'frac34' => "\xc2\xbe",
413
+ 'frac34;' => "\xc2\xbe",
414
+ 'frasl;' => "\xe2\x81\x84",
415
+ 'gamma;' => "\xce\xb3",
416
+ 'ge;' => "\xe2\x89\xa5",
417
+ 'gt' => '>',
418
+ 'gt;' => '>',
419
+ 'hArr;' => "\xe2\x87\x94",
420
+ 'harr;' => "\xe2\x86\x94",
421
+ 'hearts;' => "\xe2\x99\xa5",
422
+ 'hellip;' => "\xe2\x80\xa6",
423
+ 'iacute' => "\xc3\xad",
424
+ 'iacute;' => "\xc3\xad",
425
+ 'icirc' => "\xc3\xae",
426
+ 'icirc;' => "\xc3\xae",
427
+ 'iexcl' => "\xc2\xa1",
428
+ 'iexcl;' => "\xc2\xa1",
429
+ 'igrave' => "\xc3\xac",
430
+ 'igrave;' => "\xc3\xac",
431
+ 'image;' => "\xe2\x84\x91",
432
+ 'infin;' => "\xe2\x88\x9e",
433
+ 'int;' => "\xe2\x88\xab",
434
+ 'iota;' => "\xce\xb9",
435
+ 'iquest' => "\xc2\xbf",
436
+ 'iquest;' => "\xc2\xbf",
437
+ 'isin;' => "\xe2\x88\x88",
438
+ 'iuml' => "\xc3\xaf",
439
+ 'iuml;' => "\xc3\xaf",
440
+ 'kappa;' => "\xce\xba",
441
+ 'lArr;' => "\xe2\x87\x90",
442
+ 'lambda;' => "\xce\xbb",
443
+ 'lang;' => "\xe3\x80\x88",
444
+ 'laquo' => "\xc2\xab",
445
+ 'laquo;' => "\xc2\xab",
446
+ 'larr;' => "\xe2\x86\x90",
447
+ 'lceil;' => "\xe2\x8c\x88",
448
+ 'ldquo;' => "\xe2\x80\x9c",
449
+ 'le;' => "\xe2\x89\xa4",
450
+ 'lfloor;' => "\xe2\x8c\x8a",
451
+ 'lowast;' => "\xe2\x88\x97",
452
+ 'loz;' => "\xe2\x97\x8a",
453
+ 'lrm;' => "\xe2\x80\x8e",
454
+ 'lsaquo;' => "\xe2\x80\xb9",
455
+ 'lsquo;' => "\xe2\x80\x98",
456
+ 'lt' => '<',
457
+ 'lt;' => '<',
458
+ 'macr' => "\xc2\xaf",
459
+ 'macr;' => "\xc2\xaf",
460
+ 'mdash;' => "\xe2\x80\x94",
461
+ 'micro' => "\xc2\xb5",
462
+ 'micro;' => "\xc2\xb5",
463
+ 'middot' => "\xc2\xb7",
464
+ 'middot;' => "\xc2\xb7",
465
+ 'minus;' => "\xe2\x88\x92",
466
+ 'mu;' => "\xce\xbc",
467
+ 'nabla;' => "\xe2\x88\x87",
468
+ 'nbsp' => "\xc2\xa0",
469
+ 'nbsp;' => "\xc2\xa0",
470
+ 'ndash;' => "\xe2\x80\x93",
471
+ 'ne;' => "\xe2\x89\xa0",
472
+ 'ni;' => "\xe2\x88\x8b",
473
+ 'not' => "\xc2\xac",
474
+ 'not;' => "\xc2\xac",
475
+ 'notin;' => "\xe2\x88\x89",
476
+ 'nsub;' => "\xe2\x8a\x84",
477
+ 'ntilde' => "\xc3\xb1",
478
+ 'ntilde;' => "\xc3\xb1",
479
+ 'nu;' => "\xce\xbd",
480
+ 'oacute' => "\xc3\xb3",
481
+ 'oacute;' => "\xc3\xb3",
482
+ 'ocirc' => "\xc3\xb4",
483
+ 'ocirc;' => "\xc3\xb4",
484
+ 'oelig;' => "\xc5\x93",
485
+ 'ograve' => "\xc3\xb2",
486
+ 'ograve;' => "\xc3\xb2",
487
+ 'oline;' => "\xe2\x80\xbe",
488
+ 'omega;' => "\xcf\x89",
489
+ 'omicron;' => "\xce\xbf",
490
+ 'oplus;' => "\xe2\x8a\x95",
491
+ 'or;' => "\xe2\x88\xa8",
492
+ 'ordf' => "\xc2\xaa",
493
+ 'ordf;' => "\xc2\xaa",
494
+ 'ordm' => "\xc2\xba",
495
+ 'ordm;' => "\xc2\xba",
496
+ 'oslash' => "\xc3\xb8",
497
+ 'oslash;' => "\xc3\xb8",
498
+ 'otilde' => "\xc3\xb5",
499
+ 'otilde;' => "\xc3\xb5",
500
+ 'otimes;' => "\xe2\x8a\x97",
501
+ 'ouml' => "\xc3\xb6",
502
+ 'ouml;' => "\xc3\xb6",
503
+ 'para' => "\xc2\xb6",
504
+ 'para;' => "\xc2\xb6",
505
+ 'part;' => "\xe2\x88\x82",
506
+ 'permil;' => "\xe2\x80\xb0",
507
+ 'perp;' => "\xe2\x8a\xa5",
508
+ 'phi;' => "\xcf\x86",
509
+ 'pi;' => "\xcf\x80",
510
+ 'piv;' => "\xcf\x96",
511
+ 'plusmn' => "\xc2\xb1",
512
+ 'plusmn;' => "\xc2\xb1",
513
+ 'pound' => "\xc2\xa3",
514
+ 'pound;' => "\xc2\xa3",
515
+ 'prime;' => "\xe2\x80\xb2",
516
+ 'prod;' => "\xe2\x88\x8f",
517
+ 'prop;' => "\xe2\x88\x9d",
518
+ 'psi;' => "\xcf\x88",
519
+ 'quot' => '"',
520
+ 'quot;' => '"',
521
+ 'rArr;' => "\xe2\x87\x92",
522
+ 'radic;' => "\xe2\x88\x9a",
523
+ 'rang;' => "\xe3\x80\x89",
524
+ 'raquo' => "\xc2\xbb",
525
+ 'raquo;' => "\xc2\xbb",
526
+ 'rarr;' => "\xe2\x86\x92",
527
+ 'rceil;' => "\xe2\x8c\x89",
528
+ 'rdquo;' => "\xe2\x80\x9d",
529
+ 'real;' => "\xe2\x84\x9c",
530
+ 'reg' => "\xc2\xae",
531
+ 'reg;' => "\xc2\xae",
532
+ 'rfloor;' => "\xe2\x8c\x8b",
533
+ 'rho;' => "\xcf\x81",
534
+ 'rlm;' => "\xe2\x80\x8f",
535
+ 'rsaquo;' => "\xe2\x80\xba",
536
+ 'rsquo;' => "\xe2\x80\x99",
537
+ 'sbquo;' => "\xe2\x80\x9a",
538
+ 'scaron;' => "\xc5\xa1",
539
+ 'sdot;' => "\xe2\x8b\x85",
540
+ 'sect' => "\xc2\xa7",
541
+ 'sect;' => "\xc2\xa7",
542
+ 'shy' => "\xc2\xad",
543
+ 'shy;' => "\xc2\xad",
544
+ 'sigma;' => "\xcf\x83",
545
+ 'sigmaf;' => "\xcf\x82",
546
+ 'sim;' => "\xe2\x88\xbc",
547
+ 'spades;' => "\xe2\x99\xa0",
548
+ 'sub;' => "\xe2\x8a\x82",
549
+ 'sube;' => "\xe2\x8a\x86",
550
+ 'sum;' => "\xe2\x88\x91",
551
+ 'sup1' => "\xc2\xb9",
552
+ 'sup1;' => "\xc2\xb9",
553
+ 'sup2' => "\xc2\xb2",
554
+ 'sup2;' => "\xc2\xb2",
555
+ 'sup3' => "\xc2\xb3",
556
+ 'sup3;' => "\xc2\xb3",
557
+ 'sup;' => "\xe2\x8a\x83",
558
+ 'supe;' => "\xe2\x8a\x87",
559
+ 'szlig' => "\xc3\x9f",
560
+ 'szlig;' => "\xc3\x9f",
561
+ 'tau;' => "\xcf\x84",
562
+ 'there4;' => "\xe2\x88\xb4",
563
+ 'theta;' => "\xce\xb8",
564
+ 'thetasym;' => "\xcf\x91",
565
+ 'thinsp;' => "\xe2\x80\x89",
566
+ 'thorn' => "\xc3\xbe",
567
+ 'thorn;' => "\xc3\xbe",
568
+ 'tilde;' => "\xcb\x9c",
569
+ 'times' => "\xc3\x97",
570
+ 'times;' => "\xc3\x97",
571
+ 'trade;' => "\xe2\x84\xa2",
572
+ 'uArr;' => "\xe2\x87\x91",
573
+ 'uacute' => "\xc3\xba",
574
+ 'uacute;' => "\xc3\xba",
575
+ 'uarr;' => "\xe2\x86\x91",
576
+ 'ucirc' => "\xc3\xbb",
577
+ 'ucirc;' => "\xc3\xbb",
578
+ 'ugrave' => "\xc3\xb9",
579
+ 'ugrave;' => "\xc3\xb9",
580
+ 'uml' => "\xc2\xa8",
581
+ 'uml;' => "\xc2\xa8",
582
+ 'upsih;' => "\xcf\x92",
583
+ 'upsilon;' => "\xcf\x85",
584
+ 'uuml' => "\xc3\xbc",
585
+ 'uuml;' => "\xc3\xbc",
586
+ 'weierp;' => "\xe2\x84\x98",
587
+ 'xi;' => "\xce\xbe",
588
+ 'yacute' => "\xc3\xbd",
589
+ 'yacute;' => "\xc3\xbd",
590
+ 'yen' => "\xc2\xa5",
591
+ 'yen;' => "\xc2\xa5",
592
+ 'yuml' => "\xc3\xbf",
593
+ 'yuml;' => "\xc3\xbf",
594
+ 'zeta;' => "\xce\xb6",
595
+ 'zwj;' => "\xe2\x80\x8d",
596
+ 'zwnj;' => "\xe2\x80\x8c"
597
+ }
598
+ # Character-encoding labels, all lowercase Strings (several labels are aliases of one encoding, e.g. latin1 and iso-8859-1). NOTE(review): presumably the names accepted when detecting or declaring a document's encoding - confirm in inputstream.rb.
599
+ ENCODINGS = %w[
600
+ ansi_x3.4-1968
601
+ iso-ir-6
602
+ ansi_x3.4-1986
603
+ iso_646.irv:1991
604
+ ascii
605
+ iso646-us
606
+ us-ascii
607
+ us
608
+ ibm367
609
+ cp367
610
+ csascii
611
+ ks_c_5601-1987
612
+ korean
613
+ iso-2022-kr
614
+ csiso2022kr
615
+ euc-kr
616
+ iso-2022-jp
617
+ csiso2022jp
618
+ iso-2022-jp-2
619
+ iso-ir-58
620
+ chinese
621
+ csiso58gb231280
622
+ iso_8859-1:1987
623
+ iso-ir-100
624
+ iso_8859-1
625
+ iso-8859-1
626
+ latin1
627
+ l1
628
+ ibm819
629
+ cp819
630
+ csisolatin1
631
+ iso_8859-2:1987
632
+ iso-ir-101
633
+ iso_8859-2
634
+ iso-8859-2
635
+ latin2
636
+ l2
637
+ csisolatin2
638
+ iso_8859-3:1988
639
+ iso-ir-109
640
+ iso_8859-3
641
+ iso-8859-3
642
+ latin3
643
+ l3
644
+ csisolatin3
645
+ iso_8859-4:1988
646
+ iso-ir-110
647
+ iso_8859-4
648
+ iso-8859-4
649
+ latin4
650
+ l4
651
+ csisolatin4
652
+ iso_8859-6:1987
653
+ iso-ir-127
654
+ iso_8859-6
655
+ iso-8859-6
656
+ ecma-114
657
+ asmo-708
658
+ arabic
659
+ csisolatinarabic
660
+ iso_8859-7:1987
661
+ iso-ir-126
662
+ iso_8859-7
663
+ iso-8859-7
664
+ elot_928
665
+ ecma-118
666
+ greek
667
+ greek8
668
+ csisolatingreek
669
+ iso_8859-8:1988
670
+ iso-ir-138
671
+ iso_8859-8
672
+ iso-8859-8
673
+ hebrew
674
+ csisolatinhebrew
675
+ iso_8859-5:1988
676
+ iso-ir-144
677
+ iso_8859-5
678
+ iso-8859-5
679
+ cyrillic
680
+ csisolatincyrillic
681
+ iso_8859-9:1989
682
+ iso-ir-148
683
+ iso_8859-9
684
+ iso-8859-9
685
+ latin5
686
+ l5
687
+ csisolatin5
688
+ iso-8859-10
689
+ iso-ir-157
690
+ l6
691
+ iso_8859-10:1992
692
+ csisolatin6
693
+ latin6
694
+ hp-roman8
695
+ roman8
696
+ r8
697
+ ibm037
698
+ cp037
699
+ csibm037
700
+ ibm424
701
+ cp424
702
+ csibm424
703
+ ibm437
704
+ cp437
705
+ 437
706
+ cspc8codepage437
707
+ ibm500
708
+ cp500
709
+ csibm500
710
+ ibm775
711
+ cp775
712
+ cspc775baltic
713
+ ibm850
714
+ cp850
715
+ 850
716
+ cspc850multilingual
717
+ ibm852
718
+ cp852
719
+ 852
720
+ cspcp852
721
+ ibm855
722
+ cp855
723
+ 855
724
+ csibm855
725
+ ibm857
726
+ cp857
727
+ 857
728
+ csibm857
729
+ ibm860
730
+ cp860
731
+ 860
732
+ csibm860
733
+ ibm861
734
+ cp861
735
+ 861
736
+ cp-is
737
+ csibm861
738
+ ibm862
739
+ cp862
740
+ 862
741
+ cspc862latinhebrew
742
+ ibm863
743
+ cp863
744
+ 863
745
+ csibm863
746
+ ibm864
747
+ cp864
748
+ csibm864
749
+ ibm865
750
+ cp865
751
+ 865
752
+ csibm865
753
+ ibm866
754
+ cp866
755
+ 866
756
+ csibm866
757
+ ibm869
758
+ cp869
759
+ 869
760
+ cp-gr
761
+ csibm869
762
+ ibm1026
763
+ cp1026
764
+ csibm1026
765
+ koi8-r
766
+ cskoi8r
767
+ koi8-u
768
+ big5-hkscs
769
+ ptcp154
770
+ csptcp154
771
+ pt154
772
+ cp154
773
+ utf-7
774
+ utf-16be
775
+ utf-16le
776
+ utf-16
777
+ utf-8
778
+ iso-8859-13
779
+ iso-8859-14
780
+ iso-ir-199
781
+ iso_8859-14:1998
782
+ iso_8859-14
783
+ latin8
784
+ iso-celtic
785
+ l8
786
+ iso-8859-15
787
+ iso_8859-15
788
+ iso-8859-16
789
+ iso-ir-226
790
+ iso_8859-16:2001
791
+ iso_8859-16
792
+ latin10
793
+ l10
794
+ gbk
795
+ cp936
796
+ ms936
797
+ gb18030
798
+ shift_jis
799
+ ms_kanji
800
+ csshiftjis
801
+ euc-jp
802
+ gb2312
803
+ big5
804
+ csbig5
805
+ windows-1250
806
+ windows-1251
807
+ windows-1252
808
+ windows-1253
809
+ windows-1254
810
+ windows-1255
811
+ windows-1256
812
+ windows-1257
813
+ windows-1258
814
+ tis-620
815
+ hz-gb-2312
816
+ ]
817
+
818
+ end