html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59):
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,818 @@
1
# Shared constant tables for the HTML5 library: element-name groups,
# character classes, named character entities and known encoding labels.
module HTML5

  # NOTE(review): EOF derives from Exception rather than StandardError, so a
  # bare `rescue` will not catch it. Presumably intentional -- confirm with the
  # code that raises/rescues it before changing the superclass.
  class EOF < Exception; end

  # Symbols naming the content-model flags.
  CONTENT_MODEL_FLAGS = [:PCDATA, :RCDATA, :CDATA, :PLAINTEXT]

  # Element names grouped as "scoping".
  SCOPING_ELEMENTS = %w[button caption html marquee object table td th]

  # Element names grouped as "formatting".
  FORMATTING_ELEMENTS = %w[a b big em font i nobr s small strike strong tt u]

  # Element names grouped as "special".
  SPECIAL_ELEMENTS = %w[
    address area base basefont bgsound blockquote body br center col
    colgroup dd dir div dl dt embed fieldset form frame
    frameset h1 h2 h3 h4 h5 h6 head hr iframe
    image img input isindex li link listing menu meta noembed
    noframes noscript ol optgroup option p param plaintext pre script
    select spacer style tbody textarea tfoot thead title tr ul
    wbr
  ]

  # One-character strings: tab, line feed, vertical tab, form feed, space and
  # carriage return (spelled out explicitly so the space entry is visible).
  SPACE_CHARACTERS = ["\t", "\n", "\x0B", "\x0C", "\x20", "\r"]

  # Table-related element names (table, its row groups, and tr).
  TABLE_INSERT_MODE_ELEMENTS = %w[table tbody tfoot thead tr]

  # Character classes. Note the differing types: the ASCII_* constants are
  # Strings, DIGITS is a Range of one-character strings, and HEX_DIGITS is an
  # Array of one-character strings.
  ASCII_LOWERCASE = ('a'..'z').to_a.join('')
  ASCII_UPPERCASE = ('A'..'Z').to_a.join('')
  ASCII_LETTERS   = ASCII_LOWERCASE + ASCII_UPPERCASE
  DIGITS          = '0'..'9'
  HEX_DIGITS      = DIGITS.to_a + ('a'..'f').to_a + ('A'..'F').to_a

  # Heading elements need to be ordered
  HEADING_ELEMENTS = %w[h1 h2 h3 h4 h5 h6]

  # XXX What about event-source and command?
  VOID_ELEMENTS = %w[base link meta hr br img embed param area col input]

  # NOTE(review): the HTML5 specification's own terminology calls title and
  # textarea RCDATA and style/script/etc. CDATA, i.e. the reverse of the two
  # names below. The names are kept as-is because callers may rely on them --
  # confirm against the consumers before renaming.
  CDATA_ELEMENTS = %w[title textarea]

  RCDATA_ELEMENTS = %w[style script xmp iframe noembed noframes noscript]

  # Boolean attribute names keyed by element name (String); the :global key
  # (a Symbol) holds the names not tied to a single element.
  BOOLEAN_ATTRIBUTES = {
    :global    => %w[irrelevant],
    'style'    => %w[scoped],
    'img'      => %w[ismap],
    'audio'    => %w[autoplay controls],
    'video'    => %w[autoplay controls],
    'script'   => %w[defer async],
    'details'  => %w[open],
    'datagrid' => %w[multiple disabled],
    'command'  => %w[hidden disabled checked default],
    'menu'     => %w[autosubmit],
    'fieldset' => %w[disabled readonly],
    'option'   => %w[disabled readonly selected],
    'optgroup' => %w[disabled readonly],
    'button'   => %w[disabled autofocus],
    'input'    => %w[disabled readonly required autofocus checked ismap],
    'select'   => %w[disabled readonly autofocus multiple],
    'output'   => %w[disabled readonly]
  }

  # entitiesWindows1252 has to be _ordered_ and needs to have an index.
  # Entry i holds the code point listed for byte 0x80 + i; 65533 (U+FFFD)
  # marks the bytes commented as UNDEFINED.
  ENTITIES_WINDOWS1252 = [
    8364,  # 0x80  0x20AC  EURO SIGN
    65533, # 0x81          UNDEFINED
    8218,  # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
    402,   # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
    8222,  # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
    8230,  # 0x85  0x2026  HORIZONTAL ELLIPSIS
    8224,  # 0x86  0x2020  DAGGER
    8225,  # 0x87  0x2021  DOUBLE DAGGER
    710,   # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
    8240,  # 0x89  0x2030  PER MILLE SIGN
    352,   # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
    8249,  # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    338,   # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
    65533, # 0x8D          UNDEFINED
    381,   # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
    65533, # 0x8F          UNDEFINED
    65533, # 0x90          UNDEFINED
    8216,  # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
    8217,  # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
    8220,  # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
    8221,  # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
    8226,  # 0x95  0x2022  BULLET
    8211,  # 0x96  0x2013  EN DASH
    8212,  # 0x97  0x2014  EM DASH
    732,   # 0x98  0x02DC  SMALL TILDE
    8482,  # 0x99  0x2122  TRADE MARK SIGN
    353,   # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
    8250,  # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    339,   # 0x9C  0x0153  LATIN SMALL LIGATURE OE
    65533, # 0x9D          UNDEFINED
    382,   # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
    376    # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
  ]

  # ENTITIES was generated from Python using the following code:
  #
  #   import constants
  #   entities = constants.entities.items()
  #   entities.sort()
  #   list = [ ' '.join([repr(entity), '=>', ord(value)<128 and
  #     repr(str(value)) or repr(value.encode('utf-8')).replace("'",'"')])
  #     for entity, value in entities]
  #   print '  ENTITIES = {\n    ' + ',\n    '.join(list) + '\n  }'
  #
  # Keys are entity names; some names appear both with and without the
  # trailing ';'. Values are the replacement text, written as UTF-8 byte
  # escapes for non-ASCII characters.
  ENTITIES = {
    'AElig' => "\xc3\x86",
    'AElig;' => "\xc3\x86",
    'AMP' => '&',
    'AMP;' => '&',
    'Aacute' => "\xc3\x81",
    'Aacute;' => "\xc3\x81",
    'Acirc' => "\xc3\x82",
    'Acirc;' => "\xc3\x82",
    'Agrave' => "\xc3\x80",
    'Agrave;' => "\xc3\x80",
    'Alpha;' => "\xce\x91",
    'Aring' => "\xc3\x85",
    'Aring;' => "\xc3\x85",
    'Atilde' => "\xc3\x83",
    'Atilde;' => "\xc3\x83",
    'Auml' => "\xc3\x84",
    'Auml;' => "\xc3\x84",
    'Beta;' => "\xce\x92",
    'COPY' => "\xc2\xa9",
    'COPY;' => "\xc2\xa9",
    'Ccedil' => "\xc3\x87",
    'Ccedil;' => "\xc3\x87",
    'Chi;' => "\xce\xa7",
    'Dagger;' => "\xe2\x80\xa1",
    'Delta;' => "\xce\x94",
    'ETH' => "\xc3\x90",
    'ETH;' => "\xc3\x90",
    'Eacute' => "\xc3\x89",
    'Eacute;' => "\xc3\x89",
    'Ecirc' => "\xc3\x8a",
    'Ecirc;' => "\xc3\x8a",
    'Egrave' => "\xc3\x88",
    'Egrave;' => "\xc3\x88",
    'Epsilon;' => "\xce\x95",
    'Eta;' => "\xce\x97",
    'Euml' => "\xc3\x8b",
    'Euml;' => "\xc3\x8b",
    'GT' => '>',
    'GT;' => '>',
    'Gamma;' => "\xce\x93",
    'Iacute' => "\xc3\x8d",
    'Iacute;' => "\xc3\x8d",
    'Icirc' => "\xc3\x8e",
    'Icirc;' => "\xc3\x8e",
    'Igrave' => "\xc3\x8c",
    'Igrave;' => "\xc3\x8c",
    'Iota;' => "\xce\x99",
    'Iuml' => "\xc3\x8f",
    'Iuml;' => "\xc3\x8f",
    'Kappa;' => "\xce\x9a",
    'LT' => '<',
    'LT;' => '<',
    'Lambda;' => "\xce\x9b",
    'Mu;' => "\xce\x9c",
    'Ntilde' => "\xc3\x91",
    'Ntilde;' => "\xc3\x91",
    'Nu;' => "\xce\x9d",
    'OElig;' => "\xc5\x92",
    'Oacute' => "\xc3\x93",
    'Oacute;' => "\xc3\x93",
    'Ocirc' => "\xc3\x94",
    'Ocirc;' => "\xc3\x94",
    'Ograve' => "\xc3\x92",
    'Ograve;' => "\xc3\x92",
    'Omega;' => "\xce\xa9",
    'Omicron;' => "\xce\x9f",
    'Oslash' => "\xc3\x98",
    'Oslash;' => "\xc3\x98",
    'Otilde' => "\xc3\x95",
    'Otilde;' => "\xc3\x95",
    'Ouml' => "\xc3\x96",
    'Ouml;' => "\xc3\x96",
    'Phi;' => "\xce\xa6",
    'Pi;' => "\xce\xa0",
    'Prime;' => "\xe2\x80\xb3",
    'Psi;' => "\xce\xa8",
    'QUOT' => '"',
    'QUOT;' => '"',
    'REG' => "\xc2\xae",
    'REG;' => "\xc2\xae",
    'Rho;' => "\xce\xa1",
    'Scaron;' => "\xc5\xa0",
    'Sigma;' => "\xce\xa3",
    'THORN' => "\xc3\x9e",
    'THORN;' => "\xc3\x9e",
    'TRADE;' => "\xe2\x84\xa2",
    'Tau;' => "\xce\xa4",
    'Theta;' => "\xce\x98",
    'Uacute' => "\xc3\x9a",
    'Uacute;' => "\xc3\x9a",
    'Ucirc' => "\xc3\x9b",
    'Ucirc;' => "\xc3\x9b",
    'Ugrave' => "\xc3\x99",
    'Ugrave;' => "\xc3\x99",
    'Upsilon;' => "\xce\xa5",
    'Uuml' => "\xc3\x9c",
    'Uuml;' => "\xc3\x9c",
    'Xi;' => "\xce\x9e",
    'Yacute' => "\xc3\x9d",
    'Yacute;' => "\xc3\x9d",
    'Yuml;' => "\xc5\xb8",
    'Zeta;' => "\xce\x96",
    'aacute' => "\xc3\xa1",
    'aacute;' => "\xc3\xa1",
    'acirc' => "\xc3\xa2",
    'acirc;' => "\xc3\xa2",
    'acute' => "\xc2\xb4",
    'acute;' => "\xc2\xb4",
    'aelig' => "\xc3\xa6",
    'aelig;' => "\xc3\xa6",
    'agrave' => "\xc3\xa0",
    'agrave;' => "\xc3\xa0",
    'alefsym;' => "\xe2\x84\xb5",
    'alpha;' => "\xce\xb1",
    'amp' => '&',
    'amp;' => '&',
    'and;' => "\xe2\x88\xa7",
    'ang;' => "\xe2\x88\xa0",
    'apos;' => "'",
    'aring' => "\xc3\xa5",
    'aring;' => "\xc3\xa5",
    'asymp;' => "\xe2\x89\x88",
    'atilde' => "\xc3\xa3",
    'atilde;' => "\xc3\xa3",
    'auml' => "\xc3\xa4",
    'auml;' => "\xc3\xa4",
    'bdquo;' => "\xe2\x80\x9e",
    'beta;' => "\xce\xb2",
    'brvbar' => "\xc2\xa6",
    'brvbar;' => "\xc2\xa6",
    'bull;' => "\xe2\x80\xa2",
    'cap;' => "\xe2\x88\xa9",
    'ccedil' => "\xc3\xa7",
    'ccedil;' => "\xc3\xa7",
    'cedil' => "\xc2\xb8",
    'cedil;' => "\xc2\xb8",
    'cent' => "\xc2\xa2",
    'cent;' => "\xc2\xa2",
    'chi;' => "\xcf\x87",
    'circ;' => "\xcb\x86",
    'clubs;' => "\xe2\x99\xa3",
    'cong;' => "\xe2\x89\x85",
    'copy' => "\xc2\xa9",
    'copy;' => "\xc2\xa9",
    'crarr;' => "\xe2\x86\xb5",
    'cup;' => "\xe2\x88\xaa",
    'curren' => "\xc2\xa4",
    'curren;' => "\xc2\xa4",
    'dArr;' => "\xe2\x87\x93",
    'dagger;' => "\xe2\x80\xa0",
    'darr;' => "\xe2\x86\x93",
    'deg' => "\xc2\xb0",
    'deg;' => "\xc2\xb0",
    'delta;' => "\xce\xb4",
    'diams;' => "\xe2\x99\xa6",
    'divide' => "\xc3\xb7",
    'divide;' => "\xc3\xb7",
    'eacute' => "\xc3\xa9",
    'eacute;' => "\xc3\xa9",
    'ecirc' => "\xc3\xaa",
    'ecirc;' => "\xc3\xaa",
    'egrave' => "\xc3\xa8",
    'egrave;' => "\xc3\xa8",
    'empty;' => "\xe2\x88\x85",
    'emsp;' => "\xe2\x80\x83",
    'ensp;' => "\xe2\x80\x82",
    'epsilon;' => "\xce\xb5",
    'equiv;' => "\xe2\x89\xa1",
    'eta;' => "\xce\xb7",
    'eth' => "\xc3\xb0",
    'eth;' => "\xc3\xb0",
    'euml' => "\xc3\xab",
    'euml;' => "\xc3\xab",
    'euro;' => "\xe2\x82\xac",
    'exist;' => "\xe2\x88\x83",
    'fnof;' => "\xc6\x92",
    'forall;' => "\xe2\x88\x80",
    'frac12' => "\xc2\xbd",
    'frac12;' => "\xc2\xbd",
    'frac14' => "\xc2\xbc",
    'frac14;' => "\xc2\xbc",
    'frac34' => "\xc2\xbe",
    'frac34;' => "\xc2\xbe",
    'frasl;' => "\xe2\x81\x84",
    'gamma;' => "\xce\xb3",
    'ge;' => "\xe2\x89\xa5",
    'gt' => '>',
    'gt;' => '>',
    'hArr;' => "\xe2\x87\x94",
    'harr;' => "\xe2\x86\x94",
    'hearts;' => "\xe2\x99\xa5",
    'hellip;' => "\xe2\x80\xa6",
    'iacute' => "\xc3\xad",
    'iacute;' => "\xc3\xad",
    'icirc' => "\xc3\xae",
    'icirc;' => "\xc3\xae",
    'iexcl' => "\xc2\xa1",
    'iexcl;' => "\xc2\xa1",
    'igrave' => "\xc3\xac",
    'igrave;' => "\xc3\xac",
    'image;' => "\xe2\x84\x91",
    'infin;' => "\xe2\x88\x9e",
    'int;' => "\xe2\x88\xab",
    'iota;' => "\xce\xb9",
    'iquest' => "\xc2\xbf",
    'iquest;' => "\xc2\xbf",
    'isin;' => "\xe2\x88\x88",
    'iuml' => "\xc3\xaf",
    'iuml;' => "\xc3\xaf",
    'kappa;' => "\xce\xba",
    'lArr;' => "\xe2\x87\x90",
    'lambda;' => "\xce\xbb",
    'lang;' => "\xe3\x80\x88",
    'laquo' => "\xc2\xab",
    'laquo;' => "\xc2\xab",
    'larr;' => "\xe2\x86\x90",
    'lceil;' => "\xe2\x8c\x88",
    'ldquo;' => "\xe2\x80\x9c",
    'le;' => "\xe2\x89\xa4",
    'lfloor;' => "\xe2\x8c\x8a",
    'lowast;' => "\xe2\x88\x97",
    'loz;' => "\xe2\x97\x8a",
    'lrm;' => "\xe2\x80\x8e",
    'lsaquo;' => "\xe2\x80\xb9",
    'lsquo;' => "\xe2\x80\x98",
    'lt' => '<',
    'lt;' => '<',
    'macr' => "\xc2\xaf",
    'macr;' => "\xc2\xaf",
    'mdash;' => "\xe2\x80\x94",
    'micro' => "\xc2\xb5",
    'micro;' => "\xc2\xb5",
    'middot' => "\xc2\xb7",
    'middot;' => "\xc2\xb7",
    'minus;' => "\xe2\x88\x92",
    'mu;' => "\xce\xbc",
    'nabla;' => "\xe2\x88\x87",
    'nbsp' => "\xc2\xa0",
    'nbsp;' => "\xc2\xa0",
    'ndash;' => "\xe2\x80\x93",
    'ne;' => "\xe2\x89\xa0",
    'ni;' => "\xe2\x88\x8b",
    'not' => "\xc2\xac",
    'not;' => "\xc2\xac",
    'notin;' => "\xe2\x88\x89",
    'nsub;' => "\xe2\x8a\x84",
    'ntilde' => "\xc3\xb1",
    'ntilde;' => "\xc3\xb1",
    'nu;' => "\xce\xbd",
    'oacute' => "\xc3\xb3",
    'oacute;' => "\xc3\xb3",
    'ocirc' => "\xc3\xb4",
    'ocirc;' => "\xc3\xb4",
    'oelig;' => "\xc5\x93",
    'ograve' => "\xc3\xb2",
    'ograve;' => "\xc3\xb2",
    'oline;' => "\xe2\x80\xbe",
    'omega;' => "\xcf\x89",
    'omicron;' => "\xce\xbf",
    'oplus;' => "\xe2\x8a\x95",
    'or;' => "\xe2\x88\xa8",
    'ordf' => "\xc2\xaa",
    'ordf;' => "\xc2\xaa",
    'ordm' => "\xc2\xba",
    'ordm;' => "\xc2\xba",
    'oslash' => "\xc3\xb8",
    'oslash;' => "\xc3\xb8",
    'otilde' => "\xc3\xb5",
    'otilde;' => "\xc3\xb5",
    'otimes;' => "\xe2\x8a\x97",
    'ouml' => "\xc3\xb6",
    'ouml;' => "\xc3\xb6",
    'para' => "\xc2\xb6",
    'para;' => "\xc2\xb6",
    'part;' => "\xe2\x88\x82",
    'permil;' => "\xe2\x80\xb0",
    'perp;' => "\xe2\x8a\xa5",
    'phi;' => "\xcf\x86",
    'pi;' => "\xcf\x80",
    'piv;' => "\xcf\x96",
    'plusmn' => "\xc2\xb1",
    'plusmn;' => "\xc2\xb1",
    'pound' => "\xc2\xa3",
    'pound;' => "\xc2\xa3",
    'prime;' => "\xe2\x80\xb2",
    'prod;' => "\xe2\x88\x8f",
    'prop;' => "\xe2\x88\x9d",
    'psi;' => "\xcf\x88",
    'quot' => '"',
    'quot;' => '"',
    'rArr;' => "\xe2\x87\x92",
    'radic;' => "\xe2\x88\x9a",
    'rang;' => "\xe3\x80\x89",
    'raquo' => "\xc2\xbb",
    'raquo;' => "\xc2\xbb",
    'rarr;' => "\xe2\x86\x92",
    'rceil;' => "\xe2\x8c\x89",
    'rdquo;' => "\xe2\x80\x9d",
    'real;' => "\xe2\x84\x9c",
    'reg' => "\xc2\xae",
    'reg;' => "\xc2\xae",
    'rfloor;' => "\xe2\x8c\x8b",
    'rho;' => "\xcf\x81",
    'rlm;' => "\xe2\x80\x8f",
    'rsaquo;' => "\xe2\x80\xba",
    'rsquo;' => "\xe2\x80\x99",
    'sbquo;' => "\xe2\x80\x9a",
    'scaron;' => "\xc5\xa1",
    'sdot;' => "\xe2\x8b\x85",
    'sect' => "\xc2\xa7",
    'sect;' => "\xc2\xa7",
    'shy' => "\xc2\xad",
    'shy;' => "\xc2\xad",
    'sigma;' => "\xcf\x83",
    'sigmaf;' => "\xcf\x82",
    'sim;' => "\xe2\x88\xbc",
    'spades;' => "\xe2\x99\xa0",
    'sub;' => "\xe2\x8a\x82",
    'sube;' => "\xe2\x8a\x86",
    'sum;' => "\xe2\x88\x91",
    'sup1' => "\xc2\xb9",
    'sup1;' => "\xc2\xb9",
    'sup2' => "\xc2\xb2",
    'sup2;' => "\xc2\xb2",
    'sup3' => "\xc2\xb3",
    'sup3;' => "\xc2\xb3",
    'sup;' => "\xe2\x8a\x83",
    'supe;' => "\xe2\x8a\x87",
    'szlig' => "\xc3\x9f",
    'szlig;' => "\xc3\x9f",
    'tau;' => "\xcf\x84",
    'there4;' => "\xe2\x88\xb4",
    'theta;' => "\xce\xb8",
    'thetasym;' => "\xcf\x91",
    'thinsp;' => "\xe2\x80\x89",
    'thorn' => "\xc3\xbe",
    'thorn;' => "\xc3\xbe",
    'tilde;' => "\xcb\x9c",
    'times' => "\xc3\x97",
    'times;' => "\xc3\x97",
    'trade;' => "\xe2\x84\xa2",
    'uArr;' => "\xe2\x87\x91",
    'uacute' => "\xc3\xba",
    'uacute;' => "\xc3\xba",
    'uarr;' => "\xe2\x86\x91",
    'ucirc' => "\xc3\xbb",
    'ucirc;' => "\xc3\xbb",
    'ugrave' => "\xc3\xb9",
    'ugrave;' => "\xc3\xb9",
    'uml' => "\xc2\xa8",
    'uml;' => "\xc2\xa8",
    'upsih;' => "\xcf\x92",
    'upsilon;' => "\xcf\x85",
    'uuml' => "\xc3\xbc",
    'uuml;' => "\xc3\xbc",
    'weierp;' => "\xe2\x84\x98",
    'xi;' => "\xce\xbe",
    'yacute' => "\xc3\xbd",
    'yacute;' => "\xc3\xbd",
    'yen' => "\xc2\xa5",
    'yen;' => "\xc2\xa5",
    'yuml' => "\xc3\xbf",
    'yuml;' => "\xc3\xbf",
    'zeta;' => "\xce\xb6",
    'zwj;' => "\xe2\x80\x8d",
    'zwnj;' => "\xe2\x80\x8c"
  }

  # Lower-case encoding labels. Entries keep their original order; each line
  # below groups labels that appear to belong to the same character set.
  ENCODINGS = %w[
    ansi_x3.4-1968 iso-ir-6 ansi_x3.4-1986 iso_646.irv:1991 ascii iso646-us us-ascii us ibm367 cp367 csascii
    ks_c_5601-1987 korean
    iso-2022-kr csiso2022kr
    euc-kr
    iso-2022-jp csiso2022jp
    iso-2022-jp-2
    iso-ir-58 chinese csiso58gb231280
    iso_8859-1:1987 iso-ir-100 iso_8859-1 iso-8859-1 latin1 l1 ibm819 cp819 csisolatin1
    iso_8859-2:1987 iso-ir-101 iso_8859-2 iso-8859-2 latin2 l2 csisolatin2
    iso_8859-3:1988 iso-ir-109 iso_8859-3 iso-8859-3 latin3 l3 csisolatin3
    iso_8859-4:1988 iso-ir-110 iso_8859-4 iso-8859-4 latin4 l4 csisolatin4
    iso_8859-6:1987 iso-ir-127 iso_8859-6 iso-8859-6 ecma-114 asmo-708 arabic csisolatinarabic
    iso_8859-7:1987 iso-ir-126 iso_8859-7 iso-8859-7 elot_928 ecma-118 greek greek8 csisolatingreek
    iso_8859-8:1988 iso-ir-138 iso_8859-8 iso-8859-8 hebrew csisolatinhebrew
    iso_8859-5:1988 iso-ir-144 iso_8859-5 iso-8859-5 cyrillic csisolatincyrillic
    iso_8859-9:1989 iso-ir-148 iso_8859-9 iso-8859-9 latin5 l5 csisolatin5
    iso-8859-10 iso-ir-157 l6 iso_8859-10:1992 csisolatin6 latin6
    hp-roman8 roman8 r8
    ibm037 cp037 csibm037
    ibm424 cp424 csibm424
    ibm437 cp437 437 cspc8codepage437
    ibm500 cp500 csibm500
    ibm775 cp775 cspc775baltic
    ibm850 cp850 850 cspc850multilingual
    ibm852 cp852 852 cspcp852
    ibm855 cp855 855 csibm855
    ibm857 cp857 857 csibm857
    ibm860 cp860 860 csibm860
    ibm861 cp861 861 cp-is csibm861
    ibm862 cp862 862 cspc862latinhebrew
    ibm863 cp863 863 csibm863
    ibm864 cp864 csibm864
    ibm865 cp865 865 csibm865
    ibm866 cp866 866 csibm866
    ibm869 cp869 869 cp-gr csibm869
    ibm1026 cp1026 csibm1026
    koi8-r cskoi8r
    koi8-u
    big5-hkscs
    ptcp154 csptcp154 pt154 cp154
    utf-7 utf-16be utf-16le utf-16 utf-8
    iso-8859-13
    iso-8859-14 iso-ir-199 iso_8859-14:1998 iso_8859-14 latin8 iso-celtic l8
    iso-8859-15 iso_8859-15
    iso-8859-16 iso-ir-226 iso_8859-16:2001 iso_8859-16 latin10 l10
    gbk cp936 ms936
    gb18030
    shift_jis ms_kanji csshiftjis
    euc-jp
    gb2312
    big5 csbig5
    windows-1250 windows-1251 windows-1252 windows-1253 windows-1254
    windows-1255 windows-1256 windows-1257 windows-1258
    tis-620
    hz-gb-2312
  ]

end