html5 0.1.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98) hide show
  1. data/History.txt +9 -2
  2. data/Manifest.txt +61 -2
  3. data/README +41 -5
  4. data/Rakefile.rb +22 -6
  5. data/{parse.rb → bin/html5} +11 -11
  6. data/lib/core_ext/string.rb +17 -0
  7. data/lib/html5/constants.rb +228 -0
  8. data/lib/html5/filters/iso639codes.rb +752 -0
  9. data/lib/html5/filters/rfc2046.rb +30 -0
  10. data/lib/html5/filters/rfc3987.rb +89 -0
  11. data/lib/html5/filters/validator.rb +830 -0
  12. data/lib/html5/html5parser.rb +25 -25
  13. data/lib/html5/html5parser/after_body_phase.rb +3 -3
  14. data/lib/html5/html5parser/after_frameset_phase.rb +3 -4
  15. data/lib/html5/html5parser/after_head_phase.rb +6 -6
  16. data/lib/html5/html5parser/before_head_phase.rb +1 -1
  17. data/lib/html5/html5parser/in_body_phase.rb +54 -48
  18. data/lib/html5/html5parser/in_caption_phase.rb +7 -6
  19. data/lib/html5/html5parser/in_cell_phase.rb +3 -3
  20. data/lib/html5/html5parser/in_column_group_phase.rb +1 -1
  21. data/lib/html5/html5parser/in_frameset_phase.rb +5 -5
  22. data/lib/html5/html5parser/in_head_phase.rb +10 -10
  23. data/lib/html5/html5parser/in_row_phase.rb +4 -2
  24. data/lib/html5/html5parser/in_select_phase.rb +7 -6
  25. data/lib/html5/html5parser/in_table_body_phase.rb +8 -5
  26. data/lib/html5/html5parser/in_table_phase.rb +12 -7
  27. data/lib/html5/html5parser/initial_phase.rb +5 -6
  28. data/lib/html5/html5parser/phase.rb +5 -9
  29. data/lib/html5/html5parser/root_element_phase.rb +1 -2
  30. data/lib/html5/html5parser/trailing_end_phase.rb +3 -3
  31. data/lib/html5/inputstream.rb +25 -31
  32. data/lib/html5/liberalxmlparser.rb +2 -2
  33. data/lib/html5/sanitizer.rb +6 -6
  34. data/lib/html5/serializer/htmlserializer.rb +2 -3
  35. data/lib/html5/sniffer.rb +45 -0
  36. data/lib/html5/tokenizer.rb +57 -59
  37. data/lib/html5/treebuilders/rexml.rb +7 -6
  38. data/lib/html5/treebuilders/simpletree.rb +1 -1
  39. data/lib/html5/treewalkers/base.rb +8 -0
  40. data/lib/html5/version.rb +3 -0
  41. data/testdata/encoding/chardet/test_big5.txt +51 -0
  42. data/testdata/encoding/test-yahoo-jp.dat +10 -0
  43. data/testdata/encoding/tests1.dat +394 -0
  44. data/testdata/encoding/tests2.dat +81 -0
  45. data/testdata/sanitizer/tests1.dat +416 -0
  46. data/testdata/serializer/core.test +104 -0
  47. data/testdata/serializer/injectmeta.test +65 -0
  48. data/testdata/serializer/optionaltags.test +900 -0
  49. data/testdata/serializer/options.test +60 -0
  50. data/testdata/serializer/whitespace.test +51 -0
  51. data/testdata/sites/google-results.htm +1 -0
  52. data/testdata/sites/python-ref-import.htm +1 -0
  53. data/testdata/sites/web-apps-old.htm +1 -0
  54. data/testdata/sites/web-apps.htm +34275 -0
  55. data/testdata/sniffer/htmlOrFeed.json +43 -0
  56. data/testdata/tokenizer/contentModelFlags.test +48 -0
  57. data/testdata/tokenizer/entities.test +2339 -0
  58. data/testdata/tokenizer/escapeFlag.test +21 -0
  59. data/testdata/tokenizer/test1.test +172 -0
  60. data/testdata/tokenizer/test2.test +129 -0
  61. data/testdata/tokenizer/test3.test +367 -0
  62. data/testdata/tokenizer/test4.test +198 -0
  63. data/testdata/tree-construction/tests1.dat +1950 -0
  64. data/testdata/tree-construction/tests2.dat +773 -0
  65. data/testdata/tree-construction/tests3.dat +270 -0
  66. data/testdata/tree-construction/tests4.dat +60 -0
  67. data/testdata/tree-construction/tests5.dat +175 -0
  68. data/testdata/tree-construction/tests6.dat +196 -0
  69. data/testdata/validator/attributes.test +1035 -0
  70. data/testdata/validator/base-href-attribute.test +787 -0
  71. data/testdata/validator/base-target-attribute.test +35 -0
  72. data/testdata/validator/blockquote-cite-attribute.test +7 -0
  73. data/testdata/validator/classattribute.test +152 -0
  74. data/testdata/validator/contenteditableattribute.test +59 -0
  75. data/testdata/validator/contextmenuattribute.test +115 -0
  76. data/testdata/validator/dirattribute.test +59 -0
  77. data/testdata/validator/draggableattribute.test +63 -0
  78. data/testdata/validator/html-xmlns-attribute.test +23 -0
  79. data/testdata/validator/idattribute.test +115 -0
  80. data/testdata/validator/inputattributes.test +2795 -0
  81. data/testdata/validator/irrelevantattribute.test +63 -0
  82. data/testdata/validator/langattribute.test +5579 -0
  83. data/testdata/validator/li-value-attribute.test +7 -0
  84. data/testdata/validator/link-href-attribute.test +7 -0
  85. data/testdata/validator/link-hreflang-attribute.test +7 -0
  86. data/testdata/validator/link-rel-attribute.test +271 -0
  87. data/testdata/validator/ol-start-attribute.test +7 -0
  88. data/testdata/validator/starttags.test +375 -0
  89. data/testdata/validator/style-scoped-attribute.test +7 -0
  90. data/testdata/validator/tabindexattribute.test +79 -0
  91. data/tests/preamble.rb +7 -17
  92. data/tests/test_encoding.rb +1 -1
  93. data/tests/test_lxp.rb +16 -0
  94. data/tests/test_parser.rb +2 -2
  95. data/tests/test_sniffer.rb +27 -0
  96. data/tests/test_treewalkers.rb +41 -22
  97. data/tests/test_validator.rb +31 -0
  98. metadata +65 -6
@@ -0,0 +1,752 @@
1
+ # borrowed from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
# Lookup table of known language codes, mapping a lower-case code to an
# English language name.
#
# The first group holds two-letter codes plus the user-defined markers
# 'x' and 'xx'; the second group holds three-letter codes, listed roughly
# alphabetically by language name.
#
# Several three-letter codes have more than one name. A Ruby hash literal
# keeps only the LAST value given for a repeated key (and warns about the
# duplicate), so each such code appears exactly once here, at the position
# of its first mention and with the value that previously won; the other
# names are kept as trailing "also:" comments.
ISO_LANG = {
  'aa' => 'Afar',
  'ab' => 'Abkhazian',
  'ae' => 'Avestan',
  'af' => 'Afrikaans',
  'ak' => 'Akan',
  'am' => 'Amharic',
  'an' => 'Aragonese',
  'ar' => 'Arabic',
  'as' => 'Assamese',
  'av' => 'Avaric',
  'ay' => 'Aymara',
  'az' => 'Azerbaijani',
  'ba' => 'Bashkir',
  'be' => 'Byelorussian',
  'bg' => 'Bulgarian',
  'bh' => 'Bihari',
  'bi' => 'Bislama',
  'bm' => 'Bambara',
  'bn' => 'Bengali;Bangla',
  'bo' => 'Tibetan',
  'br' => 'Breton',
  'bs' => 'Bosnian',
  'ca' => 'Catalan',
  'ce' => 'Chechen',
  'ch' => 'Chamorro',
  'co' => 'Corsican',
  'cr' => 'Cree',
  'cs' => 'Czech',
  'cu' => 'Church Slavic',
  'cv' => 'Chuvash',
  'cy' => 'Welsh',
  'da' => 'Danish',
  'de' => 'German',
  'dv' => 'Divehi',
  'dz' => 'Dzongkha',
  'ee' => 'Ewe',
  'el' => 'Greek',
  'en' => 'English',
  'eo' => 'Esperanto',
  'es' => 'Spanish',
  'et' => 'Estonian',
  'eu' => 'Basque',
  'fa' => 'Persian (Farsi)',
  'ff' => 'Fulah',
  'fi' => 'Finnish',
  'fj' => 'Fiji',
  'fo' => 'Faroese',
  'fr' => 'French',
  'fy' => 'Frisian, Western',
  'ga' => 'Irish',
  'gd' => 'Scots Gaelic',
  'gl' => 'Galician',
  'gn' => 'Guarani',
  'gu' => 'Gujarati',
  'gv' => 'Manx',
  'ha' => 'Hausa',
  'he' => 'Hebrew',
  'hi' => 'Hindi',
  'ho' => 'Hiri Motu',
  'hr' => 'Croatian',
  'ht' => 'Haitian',
  'hu' => 'Hungarian',
  'hy' => 'Armenian',
  'hz' => 'Herero',
  'ia' => 'Interlingua',
  'id' => 'Indonesian',
  'ie' => 'Interlingue',
  'ig' => 'Igbo',
  'ii' => 'Sichuan Yi',
  'ik' => 'Inupiak',
  'io' => 'Ido',
  'is' => 'Icelandic',
  'it' => 'Italian',
  'iu' => 'Inuktitut',
  'ja' => 'Japanese',
  'jv' => 'Javanese',
  'ka' => 'Georgian',
  'kg' => 'Kongo',
  'ki' => 'Kikuyu; Gikuyu',
  'kj' => 'Kuanyama; Kwanyama',
  'kk' => 'Kazakh',
  'kl' => 'Greenlandic',
  'km' => 'Cambodian',
  'kn' => 'Kannada',
  'ko' => 'Korean',
  'kr' => 'Kanuri',
  'ks' => 'Kashmiri',
  'ku' => 'Kurdish',
  'kv' => 'Komi',
  'kw' => 'Cornish',
  'ky' => 'Kirghiz',
  'la' => 'Latin',
  'lb' => 'Letzeburgesch; Luxembourgish',
  'lg' => 'Ganda',
  'li' => 'Limburgan; Limburger, Limburgish',
  'ln' => 'Lingala',
  'lo' => 'Lao',
  'lt' => 'Lithuanian',
  'lu' => 'Luba-Katanga',
  'lv' => 'Latvian',
  'mg' => 'Malagasy',
  'mh' => 'Marshallese',
  'mi' => 'Maori',
  'mk' => 'Macedonian',
  'ml' => 'Malayalam',
  'mn' => 'Mongolian',
  'mo' => 'Moldavian',
  'mr' => 'Marathi',
  'ms' => 'Malay',
  'mt' => 'Maltese',
  'my' => 'Burmese',
  'na' => 'Nauru',
  'nb' => 'Norwegian Bokmal',
  'nd' => 'Ndebele, North',
  'ne' => 'Nepali',
  'ng' => 'Ndonga',
  'nl' => 'Dutch',
  'nn' => 'Norwegian Nynorsk',
  'no' => 'Norwegian',
  'nr' => 'Ndebele, South',
  'nv' => 'Navaho; Navajo',
  'ny' => 'Chewa; Chichewa; Nyanha',
  'oc' => 'Occitan',
  'oj' => 'Ojibwa',
  'om' => 'Afan (Oromo)',
  'or' => 'Oriya',
  'os' => 'Ossetian; Ossetic',
  'pa' => 'Punjabi',
  'pi' => 'Pali',
  'pl' => 'Polish',
  'ps' => 'Pushto',
  'pt' => 'Portuguese',
  'qu' => 'Quechua',
  'rm' => 'Rhaeto-Romance',
  'rn' => 'Kurundi',
  'ro' => 'Romanian',
  'ru' => 'Russian',
  'rw' => 'Kinyarwanda',
  'sa' => 'Sanskrit',
  'sc' => 'Sardinian',
  'sd' => 'Sindhi',
  'se' => 'Northern Sami',
  'sg' => 'Sangho',
  'sh' => 'Serbo-Croatian',
  'si' => 'Singhalese',
  'sk' => 'Slovak',
  'sl' => 'Slovenian',
  'sm' => 'Samoan',
  'sn' => 'Shona',
  'so' => 'Somali',
  'sq' => 'Albanian',
  'sr' => 'Serbian',
  'ss' => 'Swati',
  'st' => 'Sotho, Southern',
  'su' => 'Sundanese',
  'sv' => 'Swedish',
  'sw' => 'Swahili',
  'ta' => 'Tamil',
  'te' => 'Telugu',
  'tg' => 'Tajik',
  'th' => 'Thai',
  'ti' => 'Tigrinya',
  'tk' => 'Turkmen',
  'tl' => 'Tagalog',
  'tn' => 'Tswana',
  'to' => 'Tonga',
  'tr' => 'Turkish',
  'ts' => 'Tsonga',
  'tt' => 'Tatar',
  'tw' => 'Twi',
  'ty' => 'Tahitian',
  'ug' => 'Uigur',
  'uk' => 'Ukrainian',
  'ur' => 'Urdu',
  'uz' => 'Uzbek',
  've' => 'Venda',
  'vi' => 'Vietnamese',
  'vo' => 'Volapuk',
  'wa' => 'Walloon',
  'wo' => 'Wolof',
  'xh' => 'Xhosa',
  'yi' => 'Yiddish',
  'yo' => 'Yoruba',
  'za' => 'Zhuang',
  'zh' => 'Chinese',
  'zu' => 'Zulu',
  'x' => 'a user-defined language',
  'xx' => 'a user-defined language',

  'abk' => 'Abkhazian',
  'ace' => 'Achinese',
  'ach' => 'Acoli',
  'ada' => 'Adangme',
  'ady' => 'Adyghe', # also: Adygei
  'aar' => 'Afar',
  'afh' => 'Afrihili',
  'afr' => 'Afrikaans',
  'afa' => 'Afro-Asiatic (Other)',
  'ain' => 'Ainu',
  'aka' => 'Akan',
  'akk' => 'Akkadian',
  'alb' => 'Albanian',
  'sqi' => 'Albanian',
  # NOTE(review): 'gws' looks like a transposition of 'gsw' (Swiss German;
  # Alemanic, listed further down). Kept so previously accepted input is
  # still accepted -- confirm against the ISO 639-2 list before removing.
  'gws' => 'Alemanic',
  'ale' => 'Aleut',
  'alg' => 'Algonquian languages',
  'tut' => 'Altaic (Other)',
  'amh' => 'Amharic',
  'anp' => 'Angika',
  'apa' => 'Apache languages',
  'ara' => 'Arabic',
  'arg' => 'Aragonese',
  'arc' => 'Aramaic',
  'arp' => 'Arapaho',
  'arn' => 'Araucanian',
  'arw' => 'Arawak',
  'arm' => 'Armenian',
  'hye' => 'Armenian',
  'rup' => 'Aromanian',
  'art' => 'Artificial (Other)',
  'asm' => 'Assamese',
  'ast' => 'Bable', # also: Asturian
  'ath' => 'Athapascan languages',
  'aus' => 'Australian languages',
  'map' => 'Austronesian (Other)',
  'ava' => 'Avaric',
  'ave' => 'Avestan',
  'awa' => 'Awadhi',
  'aym' => 'Aymara',
  'aze' => 'Azerbaijani',
  'ban' => 'Balinese',
  'bat' => 'Baltic (Other)',
  'bal' => 'Baluchi',
  'bam' => 'Bambara',
  'bai' => 'Bamileke languages',
  'bad' => 'Banda',
  'bnt' => 'Bantu (Other)',
  'bas' => 'Basa',
  'bak' => 'Bashkir',
  'baq' => 'Basque',
  'eus' => 'Basque',
  'btk' => 'Batak (Indonesia)',
  'bej' => 'Beja',
  'bel' => 'Belarusian',
  'bem' => 'Bemba',
  'ben' => 'Bengali',
  'ber' => 'Berber (Other)',
  'bho' => 'Bhojpuri',
  'bih' => 'Bihari',
  'bik' => 'Bikol',
  'byn' => 'Blin', # also: Bilin
  'bin' => 'Bini',
  'bis' => 'Bislama',
  'nob' => 'Norwegian Bokmal', # also: Bokmal, Norwegian
  'bos' => 'Bosnian',
  'bra' => 'Braj',
  'bre' => 'Breton',
  'bug' => 'Buginese',
  'bul' => 'Bulgarian',
  'bua' => 'Buriat',
  'bur' => 'Burmese',
  'mya' => 'Burmese',
  'cad' => 'Caddo',
  'car' => 'Carib',
  'spa' => 'Spanish', # also: Castilian
  'cat' => 'Valencian', # also: Catalan
  'cau' => 'Caucasian (Other)',
  'ceb' => 'Cebuano',
  'cel' => 'Celtic (Other)',
  'cai' => 'Central American Indian (Other)',
  'chg' => 'Chagatai',
  'cmc' => 'Chamic languages',
  'cha' => 'Chamorro',
  'che' => 'Chechen',
  'chr' => 'Cherokee',
  'nya' => 'Nyanja', # also: Chewa, Chichewa
  'chy' => 'Cheyenne',
  'chb' => 'Chibcha',
  'chi' => 'Chinese',
  'zho' => 'Chinese',
  'chn' => 'Chinook jargon',
  'chp' => 'Chipewyan',
  'cho' => 'Choctaw',
  'zha' => 'Zhuang', # also: Chuang
  'chu' => 'Church Slavic; Church Slavonic; Old Church Slavonic; Old Church Slavic; Old Bulgarian',
  'chk' => 'Chuukese',
  'chv' => 'Chuvash',
  'nwc' => 'Classical Nepal Bhasa; Classical Newari; Old Newari',
  'cop' => 'Coptic',
  'cor' => 'Cornish',
  'cos' => 'Corsican',
  'cre' => 'Cree',
  'mus' => 'Creek',
  'crp' => 'Creoles and pidgins(Other)',
  'cpe' => 'Creoles and pidgins, English-based (Other)',
  'cpf' => 'Creoles and pidgins, French-based (Other)',
  'cpp' => 'Creoles and pidgins, Portuguese-based (Other)',
  'crh' => 'Crimean Tatar; Crimean Turkish',
  'scr' => 'Croatian',
  'hrv' => 'Croatian',
  'cus' => 'Cushitic (Other)',
  'cze' => 'Czech',
  'ces' => 'Czech',
  'dak' => 'Dakota',
  'dan' => 'Danish',
  'dar' => 'Dargwa',
  'day' => 'Dayak',
  'del' => 'Delaware',
  'din' => 'Dinka',
  'div' => 'Divehi',
  'doi' => 'Dogri',
  'dgr' => 'Dogrib',
  'dra' => 'Dravidian (Other)',
  'dua' => 'Duala',
  'dut' => 'Dutch',
  'nld' => 'Dutch',
  'dum' => 'Dutch, Middle (ca. 1050-1350)',
  'dyu' => 'Dyula',
  'dzo' => 'Dzongkha',
  'efi' => 'Efik',
  'egy' => 'Egyptian (Ancient)',
  'eka' => 'Ekajuk',
  'elx' => 'Elamite',
  'eng' => 'English',
  'enm' => 'English, Middle (1100-1500)',
  'ang' => 'English, Old (ca.450-1100)',
  'myv' => 'Erzya',
  'epo' => 'Esperanto',
  'est' => 'Estonian',
  'ewe' => 'Ewe',
  'ewo' => 'Ewondo',
  'fan' => 'Fang',
  'fat' => 'Fanti',
  'fao' => 'Faroese',
  'fij' => 'Fijian',
  'fil' => 'Filipino; Pilipino',
  'fin' => 'Finnish',
  'fiu' => 'Finno-Ugrian (Other)',
  'fon' => 'Fon',
  'fre' => 'French',
  'fra' => 'French',
  'frm' => 'French, Middle (ca.1400-1600)',
  'fro' => 'French, Old (842-ca.1400)',
  'frs' => 'Frisian, Eastern',
  'fry' => 'Western Frisian', # also: Frisian, Western (was misspelled 'Wester Frisian')
  'fur' => 'Friulian',
  'ful' => 'Fulah',
  'gaa' => 'Ga',
  'gla' => 'Scottish Gaelic', # also: Gaelic
  'glg' => 'Gallegan',
  'lug' => 'Ganda',
  'gay' => 'Gayo',
  'gba' => 'Gbaya',
  'gez' => 'Geez',
  'geo' => 'Georgian',
  'kat' => 'Georgian',
  'ger' => 'German',
  'deu' => 'German',
  'nds' => 'Saxon, Low', # also: German, Low; Low German
  'gmh' => 'German, Middle High (ca.1050-1500)',
  'goh' => 'German, Old High (ca.750-1050)',
  'gem' => 'Germanic (Other)',
  'kik' => 'Kikuyu', # also: Gikuyu
  'gil' => 'Gilbertese',
  'gon' => 'Gondi',
  'gor' => 'Gorontalo',
  'got' => 'Gothic',
  'grb' => 'Grebo',
  'grc' => 'Greek, Ancient (to 1453)',
  'gre' => 'Greek, Modern (1453-)',
  'ell' => 'Greek, Modern (1453-)',
  'kal' => 'Kalaallisut', # also: Greenlandic; Kalaallisut
  'grn' => 'Guarani',
  'guj' => 'Gujarati',
  'gwi' => 'Gwich\'in',
  'hai' => 'Haida',
  'hat' => 'Haitian',
  'hau' => 'Hausa',
  'haw' => 'Hawaiian',
  'heb' => 'Hebrew',
  'her' => 'Herero',
  'hil' => 'Hiligaynon',
  'him' => 'Himachali',
  'hin' => 'Hindi',
  'hmo' => 'Hiri Motu',
  'hit' => 'Hittite',
  'hmn' => 'Hmong',
  'hun' => 'Hungarian',
  'hup' => 'Hupa',
  'iba' => 'Iban',
  'ice' => 'Icelandic',
  'isl' => 'Icelandic',
  'ido' => 'Ido',
  'ibo' => 'Igbo',
  'ijo' => 'Ijo',
  'ilo' => 'Iloko',
  'smn' => 'Inari Sami',
  'inc' => 'Indic (Other)',
  'ine' => 'Indo-European (Other)',
  'ind' => 'Indonesian',
  'inh' => 'Ingush',
  'ina' => 'Interlingua (International Auxiliary Language Association)',
  'ile' => 'Interlingue',
  'iku' => 'Inuktitut',
  'ipk' => 'Inupiaq',
  'ira' => 'Iranian (Other)',
  'gle' => 'Irish',
  'mga' => 'Irish, Middle (900-1200)',
  'sga' => 'Irish, Old (to 900)',
  'iro' => 'Iroquoian languages',
  'ita' => 'Italian',
  'jpn' => 'Japanese',
  'jav' => 'Javanese',
  'jrb' => 'Judeo-Arabic',
  'jpr' => 'Judeo-Persian',
  'kbd' => 'Kabardian',
  'kab' => 'Kabyle',
  'kac' => 'Kachin',
  'xal' => 'Kalmyk',
  'kam' => 'Kamba',
  'kan' => 'Kannada',
  'kau' => 'Kanuri',
  'krc' => 'Karachay-Balkar',
  'kaa' => 'Kara-Kalpak',
  'krl' => 'Karelian',
  'kar' => 'Karen',
  'kas' => 'Kashmiri',
  'csb' => 'Kashubian',
  'kaw' => 'Kawi',
  'kaz' => 'Kazakh',
  'kha' => 'Khasi',
  'khm' => 'Khmer',
  'khi' => 'Khoisan (Other)',
  'kho' => 'Khotanese',
  'kmb' => 'Kimbundu',
  'kin' => 'Kinyarwanda',
  'kir' => 'Kirghiz',
  'tlh' => 'tlhIngan-Hol; Klingon', # also: Klingon; tlhIngan-Hol
  'kom' => 'Komi',
  'kon' => 'Kongo',
  'kok' => 'Konkani',
  'kor' => 'Korean',
  'kos' => 'Kosraean',
  'kpe' => 'Kpelle',
  'kro' => 'Kru',
  'kua' => 'Kwanyama', # also: Kuanyama
  'kum' => 'Kumyk',
  'kur' => 'Kurdish',
  'kru' => 'Kurukh',
  'kut' => 'Kutenai',
  'lad' => 'Ladino',
  'lah' => 'Lahnda',
  'lam' => 'Lamba',
  'lao' => 'Lao',
  'lat' => 'Latin',
  'lav' => 'Latvian',
  'ltz' => 'Luxembourgish', # also: Letzeburgesch
  'lez' => 'Lezghian',
  'lim' => 'Limburgan',
  'lin' => 'Lingala',
  'lit' => 'Lithuanian',
  'jbo' => 'Lojban',
  'dsb' => 'Lower Sorbian',
  'loz' => 'Lozi',
  'lub' => 'Luba-Katanga',
  'lua' => 'Luba-Lulua',
  'lui' => 'Luiseno',
  'smj' => 'Lule Sami',
  'lun' => 'Lunda',
  'luo' => 'Luo (Kenya and Tanzania)',
  'lus' => 'Lushai',
  'mac' => 'Macedonian',
  'mkd' => 'Macedonian',
  'mad' => 'Madurese',
  'mag' => 'Magahi',
  'mai' => 'Maithili',
  'mak' => 'Makasar',
  'mlg' => 'Malagasy',
  'may' => 'Malay',
  'msa' => 'Malay',
  'mal' => 'Malayalam',
  'mlt' => 'Maltese',
  'mnc' => 'Manchu',
  'mdr' => 'Mandar',
  'man' => 'Mandingo',
  'mni' => 'Manipuri',
  'mno' => 'Manobo languages',
  'glv' => 'Manx',
  'mao' => 'Maori',
  'mri' => 'Maori',
  'mar' => 'Marathi',
  'chm' => 'Mari',
  'mah' => 'Marshallese',
  'mwr' => 'Marwari',
  'mas' => 'Masai',
  'myn' => 'Mayan languages',
  'men' => 'Mende',
  'mic' => 'Micmac',
  'min' => 'Minangkabau',
  'mwl' => 'Mirandese',
  'mis' => 'Miscellaneous languages',
  'moh' => 'Mohawk',
  'mdf' => 'Moksha',
  'mol' => 'Moldavian',
  'mkh' => 'Mon-Khmer (Other)',
  'lol' => 'Mongo',
  'mon' => 'Mongolian',
  'mos' => 'Mossi',
  'mul' => 'Multiple languages',
  'mun' => 'Munda languages',
  'nah' => 'Nahuatl',
  'nau' => 'Nauru',
  'nav' => 'Navaho; Navajo',
  'nde' => 'North Ndebele', # also: Ndebele, North
  'nbl' => 'South Ndebele', # also: Ndebele, South
  'ndo' => 'Ndonga',
  'nap' => 'Neapolitan',
  'nep' => 'Nepali',
  'new' => 'Newari',
  'nia' => 'Nias',
  'nic' => 'Niger-Kordofanian (Other)',
  'ssa' => 'Nilo-Saharan (Other)',
  'niu' => 'Niuean',
  'nog' => 'Nogai',
  'non' => 'Norse, Old',
  'nai' => 'North American Indian (Other)',
  'frr' => 'Northern Frisian',
  'sme' => 'Northern Sami',
  'nso' => 'Sotho, Northern', # also: Northern Sotho; Pedi; Sepedi
  'nor' => 'Norwegian',
  'nno' => 'Nynorsk, Norwegian', # also: Norwegian Nynorsk
  'nub' => 'Nubian languages',
  'nym' => 'Nyamwezi',
  'nyn' => 'Nyankole',
  'nyo' => 'Nyoro',
  'nzi' => 'Nzima',
  'oci' => 'Provencal', # also: Occitan (post 1500)
  'oji' => 'Ojibwa',
  'ori' => 'Oriya',
  'orm' => 'Oromo',
  'osa' => 'Osage',
  'oss' => 'Ossetian; Ossetic',
  'oto' => 'Otomian languages',
  'pal' => 'Pahlavi',
  'pau' => 'Palauan',
  'pli' => 'Pali',
  'pam' => 'Pampanga',
  'pag' => 'Pangasinan',
  'pan' => 'Punjabi', # also: Panjabi
  'pap' => 'Papiamento',
  'paa' => 'Papuan (Other)',
  'per' => 'Persian',
  'fas' => 'Persian',
  'peo' => 'Persian, Old (ca.600-400)',
  'phi' => 'Philippine (Other)',
  'phn' => 'Phoenician',
  'pon' => 'Pohnpeian',
  'pol' => 'Polish',
  'por' => 'Portuguese',
  'pra' => 'Prakrit languages',
  'pro' => 'Provencal, Old (to 1500)',
  'pus' => 'Pushto',
  'que' => 'Quechua',
  'roh' => 'Raeto-Romance',
  'raj' => 'Rajasthani',
  'rap' => 'Rapanui',
  'rar' => 'Rarotongan',
  'qaa' => 'Reserved for local use',
  'qtz' => 'Reserved for local use',
  'roa' => 'Romance (Other)',
  'rum' => 'Romanian',
  'ron' => 'Romanian',
  'rom' => 'Romany',
  'run' => 'Rundi',
  'rus' => 'Russian',
  'sal' => 'Salishan languages',
  'sam' => 'Samaritan Aramaic',
  'smi' => 'Sami languages (Other)',
  'smo' => 'Samoan',
  'sad' => 'Sandawe',
  'sag' => 'Sango',
  'san' => 'Sanskrit',
  'sat' => 'Santali',
  'srd' => 'Sardinian',
  'sas' => 'Sasak',
  'sco' => 'Scots',
  'sel' => 'Selkup',
  'sem' => 'Semitic (Other)',
  'scc' => 'Serbian',
  'srp' => 'Serbian',
  'srr' => 'Serer',
  'shn' => 'Shan',
  'sna' => 'Shona',
  'iii' => 'Sichuan Yi',
  'scn' => 'Sicilian',
  'sid' => 'Sidamo',
  'sgn' => 'Sign languages',
  'bla' => 'Siksika',
  'snd' => 'Sindhi',
  'sin' => 'Sinhalese',
  'sit' => 'Sino-Tibetan (Other)',
  'sio' => 'Siouan languages',
  'sms' => 'Skolt Sami',
  'den' => 'Slave (Athapascan)',
  'sla' => 'Slavic (Other)',
  'slo' => 'Slovak',
  'slk' => 'Slovak',
  'slv' => 'Slovenian',
  'sog' => 'Sogdian',
  'som' => 'Somali',
  'son' => 'Songhai',
  'snk' => 'Soninke',
  'wen' => 'Sorbian languages',
  'sot' => 'Sotho, Southern',
  'sai' => 'South American Indian (Other)',
  'alt' => 'Southern Altai',
  'sma' => 'Southern Sami',
  'srn' => 'Sranan Tongo',
  'suk' => 'Sukuma',
  'sux' => 'Sumerian',
  'sun' => 'Sundanese',
  'sus' => 'Susu',
  'swa' => 'Swahili',
  'ssw' => 'Swati',
  'swe' => 'Swedish',
  'gsw' => 'Swiss German; Alemanic',
  'syr' => 'Syriac',
  'tgl' => 'Tagalog',
  'tah' => 'Tahitian',
  'tai' => 'Tai (Other)',
  'tgk' => 'Tajik',
  'tmh' => 'Tamashek',
  'tam' => 'Tamil',
  'tat' => 'Tatar',
  'tel' => 'Telugu',
  'ter' => 'Tereno',
  'tet' => 'Tetum',
  'tha' => 'Thai',
  'tib' => 'Tibetan',
  'bod' => 'Tibetan',
  'tig' => 'Tigre',
  'tir' => 'Tigrinya',
  'tem' => 'Timne',
  'tiv' => 'Tiv',
  'tli' => 'Tlingit',
  'tpi' => 'Tok Pisin',
  'tkl' => 'Tokelau',
  'tog' => 'Tonga (Nyasa)',
  'ton' => 'Tonga (Tonga Islands)',
  'tsi' => 'Tsimshian',
  'tso' => 'Tsonga',
  'tsn' => 'Tswana',
  'tum' => 'Tumbuka',
  'tup' => 'Tupi languages',
  'tur' => 'Turkish',
  'ota' => 'Turkish, Ottoman (1500-1928)',
  'tuk' => 'Turkmen',
  'tvl' => 'Tuvalu',
  'tyv' => 'Tuvinian',
  'twi' => 'Twi',
  'udm' => 'Udmurt',
  'uga' => 'Ugaritic',
  'uig' => 'Uighur',
  'ukr' => 'Ukrainian',
  'umb' => 'Umbundu',
  'und' => 'Undetermined',
  'hsb' => 'Upper Sorbian',
  'urd' => 'Urdu',
  'uzb' => 'Uzbek',
  'vai' => 'Vai',
  'ven' => 'Venda',
  'vie' => 'Vietnamese',
  'vol' => 'Volapuk',
  'vot' => 'Votic',
  'wak' => 'Wakashan languages',
  'wal' => 'Walamo',
  'wln' => 'Walloon',
  'war' => 'Waray',
  'was' => 'Washo',
  'wel' => 'Welsh',
  'cym' => 'Welsh',
  'wol' => 'Wolof',
  'xho' => 'Xhosa',
  'sah' => 'Yakut',
  'yao' => 'Yao',
  'yap' => 'Yapese',
  'yid' => 'Yiddish',
  'yor' => 'Yoruba',
  'ypk' => 'Yupik languages',
  'znd' => 'Zande',
  'zap' => 'Zapotec',
  'zen' => 'Zenaga',
  'zul' => 'Zulu',
  'zun' => 'Zuni'
}
744
+
745
# Reports whether the primary subtag of a language tag is a known code.
#
# Only the text before the first '-' is examined (so "en-US" is checked as
# "en"), and the comparison is case-insensitive. Anything after the first
# '-' is ignored.
#
# value - the language tag to check, e.g. "en", "en-US" or "FR". Non-String
#         input (including nil) is converted with #to_s instead of raising.
#
# Returns true if the primary subtag is a key of ISO_LANG, false otherwise.
def is_valid_lang_code(value)
  # split yields [] for an empty string, hence the trailing to_s: "" and
  # "-US" both end up with an empty primary subtag, which is never a key.
  primary = value.to_s.split('-', 2).first.to_s
  ISO_LANG.key?(primary.downcase)
end