spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,10 @@
1
+ require 'delegate'
2
+ require 'enumerator'
3
+
4
module HTML5
  module Filters
    # Common superclass for token-stream filters.
    #
    # Inheriting from SimpleDelegator means a filter wraps a source object
    # (reachable through +__getobj__+) and forwards to it every message the
    # filter does not define itself. Mixing in Enumerable gives subclasses
    # that define +each+ the usual collection helpers (+map+, +select+,
    # +to_a+, ...) on top of their filtered stream.
    class Base < SimpleDelegator
      include Enumerable
    end
  end
end
@@ -0,0 +1,82 @@
1
+ require 'html5/filters/base'
2
+
3
module HTML5
  module Filters
    # Token-stream filter that makes the <head> of a document declare a given
    # character encoding.
    #
    # While tokens pass through:
    # * every +charset+ attribute (matched case-insensitively) on an empty
    #   <meta> tag is rewritten to <tt>['charset', encoding]</tt>;
    # * if no declaration has been seen so far, the +content+ attribute of a
    #   <meta> carrying <tt>http-equiv="content-type"</tt> is rewritten to
    #   <tt>"text/html; charset=<encoding>"</tt>;
    # * if the head holds no declaration at all, a <meta charset> token is
    #   inserted right after the <head> start tag.
    #
    # Tokens are hashes read through +:type+, +:name+ and +:data+; for tags,
    # +:data+ is treated as an array of [name, value] pairs. The attribute
    # arrays of incoming <meta> tokens are modified in place.
    #
    # With a nil encoding nothing is ever inserted and +content+ attributes are
    # left alone, but existing +charset+ attributes are still overwritten
    # (with nil).
    class InjectMetaCharset < Base
      # source::   the token stream to wrap; it must respond to +each+.
      # encoding:: the encoding name to declare, or nil to disable injection.
      def initialize(source, encoding)
        super(source)
        @encoding = encoding
      end

      # Yields every token of the wrapped stream, adjusted as described on the
      # class. Tokens between <head> and </head> are buffered so that a <meta>
      # can still be inserted at the front of the head once its end is known.
      #
      # Without a block an enumerator over the filtered tokens is returned.
      # With a block, returns whatever the wrapped stream's +each+ returns.
      def each(&block)
        return enum_for(:each) unless block_given?

        state = :pre_head
        # A nil encoding counts as "already declared": nothing gets inserted.
        meta_found = @encoding.nil?
        pending = []

        result = __getobj__.each do |token|
          case token[:type]
          when :StartTag
            state = :in_head if token[:name].downcase == 'head'

          when :EmptyTag
            tag_name = token[:name].downcase
            if tag_name == 'meta'
              meta_found = true if declare_encoding(token[:data], meta_found)
            elsif tag_name == 'head' && !meta_found
              # Empty head: expand it into start tag, meta and end tag.
              yield(:type => :StartTag, :name => 'head', :data => token[:data])
              yield charset_meta_token
              yield(:type => :EndTag, :name => 'head')
              meta_found = true
              next
            end

          when :EndTag
            if token[:name].downcase == 'head' && !pending.empty?
              flush_head(pending, meta_found, &block)
              meta_found = true
              state = :post_head
            end
          end

          if state == :in_head
            pending << token
          else
            yield token
          end
        end

        # The stream ended while still inside <head>: emit the buffered tokens
        # instead of silently dropping them.
        flush_head(pending, meta_found, &block) unless pending.empty?
        result
      end

      private

      # Builds the token for <meta charset="..."> with the target encoding.
      def charset_meta_token
        { :type => :EmptyTag, :name => 'meta', :data => [['charset', @encoding]] }
      end

      # Emits the buffered head tokens: the first buffered token (the <head>
      # start tag), then a <meta charset> unless one was already seen, then
      # the remaining tokens. Leaves +pending+ empty.
      def flush_head(pending, meta_found)
        yield pending.shift
        yield charset_meta_token unless meta_found
        yield pending.shift until pending.empty?
      end

      # Rewrites the attribute pairs of one <meta> token so that they declare
      # @encoding. Returns true when the token now carries the declaration.
      #
      # attributes::       array of [name, value] pairs, modified in place.
      # already_declared:: whether an earlier token already declared the
      #                    encoding; if so a content-type +content+ attribute
      #                    is left untouched.
      def declare_encoding(attributes, already_declared)
        charset_seen = false
        http_equiv_content_type = false
        content_index = nil

        attributes.each_with_index do |(name, value), index|
          if name.downcase == 'charset'
            attributes[index] = ['charset', @encoding]
            charset_seen = true
          elsif name == 'http-equiv' && value.to_s.downcase == 'content-type'
            http_equiv_content_type = true
          elsif name == 'content'
            content_index = index
          end
        end

        return true if charset_seen
        return false if already_declared
        return false unless http_equiv_content_type && content_index

        attributes[content_index][1] = 'text/html; charset=%s' % @encoding
        true
      end
    end
  end
end
@@ -0,0 +1,755 @@
1
+ # borrowed from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ module ISO639Codes
24
+
25
+ ISO_LANG = {
26
+ 'aa' => 'Afar',
27
+ 'ab' => 'Abkhazian',
28
+ 'ae' => 'Avestan',
29
+ 'af' => 'Afrikaans',
30
+ 'ak' => 'Akan',
31
+ 'am' => 'Amharic',
32
+ 'an' => 'Aragonese',
33
+ 'ar' => 'Arabic',
34
+ 'as' => 'Assamese',
35
+ 'av' => 'Avaric',
36
+ 'ay' => 'Aymara',
37
+ 'az' => 'Azerbaijani',
38
+ 'ba' => 'Bashkir',
39
+ 'be' => 'Byelorussian',
40
+ 'bg' => 'Bulgarian',
41
+ 'bh' => 'Bihari',
42
+ 'bi' => 'Bislama',
43
+ 'bm' => 'Bambara',
44
+ 'bn' => 'Bengali;Bangla',
45
+ 'bo' => 'Tibetan',
46
+ 'br' => 'Breton',
47
+ 'bs' => 'Bosnian',
48
+ 'ca' => 'Catalan',
49
+ 'ce' => 'Chechen',
50
+ 'ch' => 'Chamorro',
51
+ 'co' => 'Corsican',
52
+ 'cr' => 'Cree',
53
+ 'cs' => 'Czech',
54
+ 'cu' => 'Church Slavic',
55
+ 'cv' => 'Chuvash',
56
+ 'cy' => 'Welsh',
57
+ 'da' => 'Danish',
58
+ 'de' => 'German',
59
+ 'dv' => 'Divehi',
60
+ 'dz' => 'Dzongkha',
61
+ 'ee' => 'Ewe',
62
+ 'el' => 'Greek',
63
+ 'en' => 'English',
64
+ 'eo' => 'Esperanto',
65
+ 'es' => 'Spanish',
66
+ 'et' => 'Estonian',
67
+ 'eu' => 'Basque',
68
+ 'fa' => 'Persian (Farsi)',
69
+ 'ff' => 'Fulah',
70
+ 'fi' => 'Finnish',
71
+ 'fj' => 'Fiji',
72
+ 'fo' => 'Faroese',
73
+ 'fr' => 'French',
74
+ 'fy' => 'Frisian, Western',
75
+ 'ga' => 'Irish',
76
+ 'gd' => 'Scots Gaelic',
77
+ 'gl' => 'Galician',
78
+ 'gn' => 'Guarani',
79
+ 'gu' => 'Gujarati',
80
+ 'gv' => 'Manx',
81
+ 'ha' => 'Hausa',
82
+ 'he' => 'Hebrew',
83
+ 'hi' => 'Hindi',
84
+ 'ho' => 'Hiri Motu',
85
+ 'hr' => 'Croatian',
86
+ 'ht' => 'Haitian',
87
+ 'hu' => 'Hungarian',
88
+ 'hy' => 'Armenian',
89
+ 'hz' => 'Herero',
90
+ 'ia' => 'Interlingua',
91
+ 'id' => 'Indonesian',
92
+ 'ie' => 'Interlingue',
93
+ 'ig' => 'Igbo',
94
+ 'ii' => 'Sichuan Yi',
95
+ 'ik' => 'Inupiak',
96
+ 'io' => 'Ido',
97
+ 'is' => 'Icelandic',
98
+ 'it' => 'Italian',
99
+ 'iu' => 'Inuktitut',
100
+ 'ja' => 'Japanese',
101
+ 'jv' => 'Javanese',
102
+ 'ka' => 'Georgian',
103
+ 'kg' => 'Kongo',
104
+ 'ki' => 'Kikuyu; Gikuyu',
105
+ 'kj' => 'Kuanyama; Kwanyama',
106
+ 'kk' => 'Kazakh',
107
+ 'kl' => 'Greenlandic',
108
+ 'km' => 'Cambodian',
109
+ 'kn' => 'Kannada',
110
+ 'ko' => 'Korean',
111
+ 'kr' => 'Kanuri',
112
+ 'ks' => 'Kashmiri',
113
+ 'ku' => 'Kurdish',
114
+ 'kv' => 'Komi',
115
+ 'kw' => 'Cornish',
116
+ 'ky' => 'Kirghiz',
117
+ 'la' => 'Latin',
118
+ 'lb' => 'Letzeburgesch; Luxembourgish',
119
+ 'lg' => 'Ganda',
120
+ 'li' => 'Limburgan; Limburger, Limburgish',
121
+ 'ln' => 'Lingala',
122
+ 'lo' => 'Lao',
123
+ 'lt' => 'Lithuanian',
124
+ 'lu' => 'Luba-Katanga',
125
+ 'lv' => 'Latvian',
126
+ 'mg' => 'Malagasy',
127
+ 'mh' => 'Marshallese',
128
+ 'mi' => 'Maori',
129
+ 'mk' => 'Macedonian',
130
+ 'ml' => 'Malayalam',
131
+ 'mn' => 'Mongolian',
132
+ 'mo' => 'Moldavian',
133
+ 'mr' => 'Marathi',
134
+ 'ms' => 'Malay',
135
+ 'mt' => 'Maltese',
136
+ 'my' => 'Burmese',
137
+ 'na' => 'Nauru',
138
+ 'nb' => 'Norwegian Bokmal',
139
+ 'nd' => 'Ndebele, North',
140
+ 'ne' => 'Nepali',
141
+ 'ng' => 'Ndonga',
142
+ 'nl' => 'Dutch',
143
+ 'nn' => 'Norwegian Nynorsk',
144
+ 'no' => 'Norwegian',
145
+ 'nr' => 'Ndebele, South',
146
+ 'nv' => 'Navaho; Navajo',
147
+ 'ny' => 'Chewa; Chichewa; Nyanha',
148
+ 'oc' => 'Occitan',
149
+ 'oj' => 'Ojibwa',
150
+ 'om' => 'Afan (Oromo)',
151
+ 'or' => 'Oriya',
152
+ 'os' => 'Ossetian; Ossetic',
153
+ 'pa' => 'Punjabi',
154
+ 'pi' => 'Pali',
155
+ 'pl' => 'Polish',
156
+ 'ps' => 'Pushto',
157
+ 'pt' => 'Portuguese',
158
+ 'qu' => 'Quechua',
159
+ 'rm' => 'Rhaeto-Romance',
160
+ 'rn' => 'Kurundi',
161
+ 'ro' => 'Romanian',
162
+ 'ru' => 'Russian',
163
+ 'rw' => 'Kinyarwanda',
164
+ 'sa' => 'Sanskrit',
165
+ 'sc' => 'Sardinian',
166
+ 'sd' => 'Sindhi',
167
+ 'se' => 'Northern Sami',
168
+ 'sg' => 'Sangho',
169
+ 'sh' => 'Serbo-Croatian',
170
+ 'si' => 'Singhalese',
171
+ 'sk' => 'Slovak',
172
+ 'sl' => 'Slovenian',
173
+ 'sm' => 'Samoan',
174
+ 'sn' => 'Shona',
175
+ 'so' => 'Somali',
176
+ 'sq' => 'Albanian',
177
+ 'sr' => 'Serbian',
178
+ 'ss' => 'Swati',
179
+ 'st' => 'Sotho, Southern',
180
+ 'su' => 'Sundanese',
181
+ 'sv' => 'Swedish',
182
+ 'sw' => 'Swahili',
183
+ 'ta' => 'Tamil',
184
+ 'te' => 'Telugu',
185
+ 'tg' => 'Tajik',
186
+ 'th' => 'Thai',
187
+ 'ti' => 'Tigrinya',
188
+ 'tk' => 'Turkmen',
189
+ 'tl' => 'Tagalog',
190
+ 'tn' => 'Tswana',
191
+ 'to' => 'Tonga',
192
+ 'tr' => 'Turkish',
193
+ 'ts' => 'Tsonga',
194
+ 'tt' => 'Tatar',
195
+ 'tw' => 'Twi',
196
+ 'ty' => 'Tahitian',
197
+ 'ug' => 'Uigur',
198
+ 'uk' => 'Ukrainian',
199
+ 'ur' => 'Urdu',
200
+ 'uz' => 'Uzbek',
201
+ 've' => 'Venda',
202
+ 'vi' => 'Vietnamese',
203
+ 'vo' => 'Volapuk',
204
+ 'wa' => 'Walloon',
205
+ 'wo' => 'Wolof',
206
+ 'xh' => 'Xhosa',
207
+ 'yi' => 'Yiddish',
208
+ 'yo' => 'Yoruba',
209
+ 'za' => 'Zhuang',
210
+ 'zh' => 'Chinese',
211
+ 'zu' => 'Zulu',
212
+ 'x' => 'a user-defined language',
213
+ 'xx' => 'a user-defined language',
214
+
215
+ 'abk' => 'Abkhazian',
216
+ 'ace' => 'Achinese',
217
+ 'ach' => 'Acoli',
218
+ 'ada' => 'Adangme',
219
+ 'ady' => 'Adygei',
220
+ 'ady' => 'Adyghe',
221
+ 'aar' => 'Afar',
222
+ 'afh' => 'Afrihili',
223
+ 'afr' => 'Afrikaans',
224
+ 'afa' => 'Afro-Asiatic (Other)',
225
+ 'ain' => 'Ainu',
226
+ 'aka' => 'Akan',
227
+ 'akk' => 'Akkadian',
228
+ 'alb' => 'Albanian',
229
+ 'sqi' => 'Albanian',
230
+ 'gws' => 'Alemanic',
231
+ 'ale' => 'Aleut',
232
+ 'alg' => 'Algonquian languages',
233
+ 'tut' => 'Altaic (Other)',
234
+ 'amh' => 'Amharic',
235
+ 'anp' => 'Angika',
236
+ 'apa' => 'Apache languages',
237
+ 'ara' => 'Arabic',
238
+ 'arg' => 'Aragonese',
239
+ 'arc' => 'Aramaic',
240
+ 'arp' => 'Arapaho',
241
+ 'arn' => 'Araucanian',
242
+ 'arw' => 'Arawak',
243
+ 'arm' => 'Armenian',
244
+ 'hye' => 'Armenian',
245
+ 'rup' => 'Aromanian',
246
+ 'art' => 'Artificial (Other)',
247
+ 'asm' => 'Assamese',
248
+ 'ast' => 'Asturian',
249
+ 'ath' => 'Athapascan languages',
250
+ 'aus' => 'Australian languages',
251
+ 'map' => 'Austronesian (Other)',
252
+ 'ava' => 'Avaric',
253
+ 'ave' => 'Avestan',
254
+ 'awa' => 'Awadhi',
255
+ 'aym' => 'Aymara',
256
+ 'aze' => 'Azerbaijani',
257
+ 'ast' => 'Bable',
258
+ 'ban' => 'Balinese',
259
+ 'bat' => 'Baltic (Other)',
260
+ 'bal' => 'Baluchi',
261
+ 'bam' => 'Bambara',
262
+ 'bai' => 'Bamileke languages',
263
+ 'bad' => 'Banda',
264
+ 'bnt' => 'Bantu (Other)',
265
+ 'bas' => 'Basa',
266
+ 'bak' => 'Bashkir',
267
+ 'baq' => 'Basque',
268
+ 'eus' => 'Basque',
269
+ 'btk' => 'Batak (Indonesia)',
270
+ 'bej' => 'Beja',
271
+ 'bel' => 'Belarusian',
272
+ 'bem' => 'Bemba',
273
+ 'ben' => 'Bengali',
274
+ 'ber' => 'Berber (Other)',
275
+ 'bho' => 'Bhojpuri',
276
+ 'bih' => 'Bihari',
277
+ 'bik' => 'Bikol',
278
+ 'byn' => 'Bilin',
279
+ 'bin' => 'Bini',
280
+ 'bis' => 'Bislama',
281
+ 'byn' => 'Blin',
282
+ 'nob' => 'Bokmal, Norwegian',
283
+ 'bos' => 'Bosnian',
284
+ 'bra' => 'Braj',
285
+ 'bre' => 'Breton',
286
+ 'bug' => 'Buginese',
287
+ 'bul' => 'Bulgarian',
288
+ 'bua' => 'Buriat',
289
+ 'bur' => 'Burmese',
290
+ 'mya' => 'Burmese',
291
+ 'cad' => 'Caddo',
292
+ 'car' => 'Carib',
293
+ 'spa' => 'Castilian',
294
+ 'cat' => 'Catalan',
295
+ 'cau' => 'Caucasian (Other)',
296
+ 'ceb' => 'Cebuano',
297
+ 'cel' => 'Celtic (Other)',
298
+ 'cai' => 'Central American Indian (Other)',
299
+ 'chg' => 'Chagatai',
300
+ 'cmc' => 'Chamic languages',
301
+ 'cha' => 'Chamorro',
302
+ 'che' => 'Chechen',
303
+ 'chr' => 'Cherokee',
304
+ 'nya' => 'Chewa',
305
+ 'chy' => 'Cheyenne',
306
+ 'chb' => 'Chibcha',
307
+ 'nya' => 'Chichewa',
308
+ 'chi' => 'Chinese',
309
+ 'zho' => 'Chinese',
310
+ 'chn' => 'Chinook jargon',
311
+ 'chp' => 'Chipewyan',
312
+ 'cho' => 'Choctaw',
313
+ 'zha' => 'Chuang',
314
+ 'chu' => 'Church Slavic; Church Slavonic; Old Church Slavonic; Old Church Slavic; Old Bulgarian',
315
+ 'chk' => 'Chuukese',
316
+ 'chv' => 'Chuvash',
317
+ 'nwc' => 'Classical Nepal Bhasa; Classical Newari; Old Newari',
318
+ 'cop' => 'Coptic',
319
+ 'cor' => 'Cornish',
320
+ 'cos' => 'Corsican',
321
+ 'cre' => 'Cree',
322
+ 'mus' => 'Creek',
323
+ 'crp' => 'Creoles and pidgins(Other)',
324
+ 'cpe' => 'Creoles and pidgins, English-based (Other)',
325
+ 'cpf' => 'Creoles and pidgins, French-based (Other)',
326
+ 'cpp' => 'Creoles and pidgins, Portuguese-based (Other)',
327
+ 'crh' => 'Crimean Tatar; Crimean Turkish',
328
+ 'scr' => 'Croatian',
329
+ 'hrv' => 'Croatian',
330
+ 'cus' => 'Cushitic (Other)',
331
+ 'cze' => 'Czech',
332
+ 'ces' => 'Czech',
333
+ 'dak' => 'Dakota',
334
+ 'dan' => 'Danish',
335
+ 'dar' => 'Dargwa',
336
+ 'day' => 'Dayak',
337
+ 'del' => 'Delaware',
338
+ 'din' => 'Dinka',
339
+ 'div' => 'Divehi',
340
+ 'doi' => 'Dogri',
341
+ 'dgr' => 'Dogrib',
342
+ 'dra' => 'Dravidian (Other)',
343
+ 'dua' => 'Duala',
344
+ 'dut' => 'Dutch',
345
+ 'nld' => 'Dutch',
346
+ 'dum' => 'Dutch, Middle (ca. 1050-1350)',
347
+ 'dyu' => 'Dyula',
348
+ 'dzo' => 'Dzongkha',
349
+ 'efi' => 'Efik',
350
+ 'egy' => 'Egyptian (Ancient)',
351
+ 'eka' => 'Ekajuk',
352
+ 'elx' => 'Elamite',
353
+ 'eng' => 'English',
354
+ 'enm' => 'English, Middle (1100-1500)',
355
+ 'ang' => 'English, Old (ca.450-1100)',
356
+ 'myv' => 'Erzya',
357
+ 'epo' => 'Esperanto',
358
+ 'est' => 'Estonian',
359
+ 'ewe' => 'Ewe',
360
+ 'ewo' => 'Ewondo',
361
+ 'fan' => 'Fang',
362
+ 'fat' => 'Fanti',
363
+ 'fao' => 'Faroese',
364
+ 'fij' => 'Fijian',
365
+ 'fil' => 'Filipino; Pilipino',
366
+ 'fin' => 'Finnish',
367
+ 'fiu' => 'Finno-Ugrian (Other)',
368
+ 'fon' => 'Fon',
369
+ 'fre' => 'French',
370
+ 'fra' => 'French',
371
+ 'frm' => 'French, Middle (ca.1400-1600)',
372
+ 'fro' => 'French, Old (842-ca.1400)',
373
+ 'frs' => 'Frisian, Eastern',
374
+ 'fry' => 'Frisian, Western',
375
+ 'fur' => 'Friulian',
376
+ 'ful' => 'Fulah',
377
+ 'gaa' => 'Ga',
378
+ 'gla' => 'Gaelic',
379
+ 'glg' => 'Gallegan',
380
+ 'lug' => 'Ganda',
381
+ 'gay' => 'Gayo',
382
+ 'gba' => 'Gbaya',
383
+ 'gez' => 'Geez',
384
+ 'geo' => 'Georgian',
385
+ 'kat' => 'Georgian',
386
+ 'ger' => 'German',
387
+ 'deu' => 'German',
388
+ 'nds' => 'German, Low',
389
+ 'gmh' => 'German, Middle High (ca.1050-1500)',
390
+ 'goh' => 'German, Old High (ca.750-1050)',
391
+ 'gem' => 'Germanic (Other)',
392
+ 'kik' => 'Gikuyu',
393
+ 'gil' => 'Gilbertese',
394
+ 'gon' => 'Gondi',
395
+ 'gor' => 'Gorontalo',
396
+ 'got' => 'Gothic',
397
+ 'grb' => 'Grebo',
398
+ 'grc' => 'Greek, Ancient (to 1453)',
399
+ 'gre' => 'Greek, Modern (1453-)',
400
+ 'ell' => 'Greek, Modern (1453-)',
401
+ 'kal' => 'Greenlandic; Kalaallisut',
402
+ 'grn' => 'Guarani',
403
+ 'guj' => 'Gujarati',
404
+ 'gwi' => 'Gwich\'in',
405
+ 'hai' => 'Haida',
406
+ 'hat' => 'Haitian',
407
+ 'hau' => 'Hausa',
408
+ 'haw' => 'Hawaiian',
409
+ 'heb' => 'Hebrew',
410
+ 'her' => 'Herero',
411
+ 'hil' => 'Hiligaynon',
412
+ 'him' => 'Himachali',
413
+ 'hin' => 'Hindi',
414
+ 'hmo' => 'Hiri Motu',
415
+ 'hit' => 'Hittite',
416
+ 'hmn' => 'Hmong',
417
+ 'hun' => 'Hungarian',
418
+ 'hup' => 'Hupa',
419
+ 'iba' => 'Iban',
420
+ 'ice' => 'Icelandic',
421
+ 'isl' => 'Icelandic',
422
+ 'ido' => 'Ido',
423
+ 'ibo' => 'Igbo',
424
+ 'ijo' => 'Ijo',
425
+ 'ilo' => 'Iloko',
426
+ 'smn' => 'Inari Sami',
427
+ 'inc' => 'Indic (Other)',
428
+ 'ine' => 'Indo-European (Other)',
429
+ 'ind' => 'Indonesian',
430
+ 'inh' => 'Ingush',
431
+ 'ina' => 'Interlingua (International Auxiliary Language Association)',
432
+ 'ile' => 'Interlingue',
433
+ 'iku' => 'Inuktitut',
434
+ 'ipk' => 'Inupiaq',
435
+ 'ira' => 'Iranian (Other)',
436
+ 'gle' => 'Irish',
437
+ 'mga' => 'Irish, Middle (900-1200)',
438
+ 'sga' => 'Irish, Old (to 900)',
439
+ 'iro' => 'Iroquoian languages',
440
+ 'ita' => 'Italian',
441
+ 'jpn' => 'Japanese',
442
+ 'jav' => 'Javanese',
443
+ 'jrb' => 'Judeo-Arabic',
444
+ 'jpr' => 'Judeo-Persian',
445
+ 'kbd' => 'Kabardian',
446
+ 'kab' => 'Kabyle',
447
+ 'kac' => 'Kachin',
448
+ 'kal' => 'Kalaallisut',
449
+ 'xal' => 'Kalmyk',
450
+ 'kam' => 'Kamba',
451
+ 'kan' => 'Kannada',
452
+ 'kau' => 'Kanuri',
453
+ 'krc' => 'Karachay-Balkar',
454
+ 'kaa' => 'Kara-Kalpak',
455
+ 'krl' => 'Karelian',
456
+ 'kar' => 'Karen',
457
+ 'kas' => 'Kashmiri',
458
+ 'csb' => 'Kashubian',
459
+ 'kaw' => 'Kawi',
460
+ 'kaz' => 'Kazakh',
461
+ 'kha' => 'Khasi',
462
+ 'khm' => 'Khmer',
463
+ 'khi' => 'Khoisan (Other)',
464
+ 'kho' => 'Khotanese',
465
+ 'kik' => 'Kikuyu',
466
+ 'kmb' => 'Kimbundu',
467
+ 'kin' => 'Kinyarwanda',
468
+ 'kir' => 'Kirghiz',
469
+ 'tlh' => 'Klingon; tlhIngan-Hol',
470
+ 'kom' => 'Komi',
471
+ 'kon' => 'Kongo',
472
+ 'kok' => 'Konkani',
473
+ 'kor' => 'Korean',
474
+ 'kos' => 'Kosraean',
475
+ 'kpe' => 'Kpelle',
476
+ 'kro' => 'Kru',
477
+ 'kua' => 'Kuanyama',
478
+ 'kum' => 'Kumyk',
479
+ 'kur' => 'Kurdish',
480
+ 'kru' => 'Kurukh',
481
+ 'kut' => 'Kutenai',
482
+ 'kua' => 'Kwanyama',
483
+ 'lad' => 'Ladino',
484
+ 'lah' => 'Lahnda',
485
+ 'lam' => 'Lamba',
486
+ 'lao' => 'Lao',
487
+ 'lat' => 'Latin',
488
+ 'lav' => 'Latvian',
489
+ 'ltz' => 'Letzeburgesch',
490
+ 'lez' => 'Lezghian',
491
+ 'lim' => 'Limburgan',
492
+ 'lin' => 'Lingala',
493
+ 'lit' => 'Lithuanian',
494
+ 'jbo' => 'Lojban',
495
+ 'nds' => 'Low German',
496
+ 'dsb' => 'Lower Sorbian',
497
+ 'loz' => 'Lozi',
498
+ 'lub' => 'Luba-Katanga',
499
+ 'lua' => 'Luba-Lulua',
500
+ 'lui' => 'Luiseno',
501
+ 'smj' => 'Lule Sami',
502
+ 'lun' => 'Lunda',
503
+ 'luo' => 'Luo (Kenya and Tanzania)',
504
+ 'lus' => 'Lushai',
505
+ 'ltz' => 'Luxembourgish',
506
+ 'mac' => 'Macedonian',
507
+ 'mkd' => 'Macedonian',
508
+ 'mad' => 'Madurese',
509
+ 'mag' => 'Magahi',
510
+ 'mai' => 'Maithili',
511
+ 'mak' => 'Makasar',
512
+ 'mlg' => 'Malagasy',
513
+ 'may' => 'Malay',
514
+ 'msa' => 'Malay',
515
+ 'mal' => 'Malayalam',
516
+ 'mlt' => 'Maltese',
517
+ 'mnc' => 'Manchu',
518
+ 'mdr' => 'Mandar',
519
+ 'man' => 'Mandingo',
520
+ 'mni' => 'Manipuri',
521
+ 'mno' => 'Manobo languages',
522
+ 'glv' => 'Manx',
523
+ 'mao' => 'Maori',
524
+ 'mri' => 'Maori',
525
+ 'mar' => 'Marathi',
526
+ 'chm' => 'Mari',
527
+ 'mah' => 'Marshallese',
528
+ 'mwr' => 'Marwari',
529
+ 'mas' => 'Masai',
530
+ 'myn' => 'Mayan languages',
531
+ 'men' => 'Mende',
532
+ 'mic' => 'Micmac',
533
+ 'min' => 'Minangkabau',
534
+ 'mwl' => 'Mirandese',
535
+ 'mis' => 'Miscellaneous languages',
536
+ 'moh' => 'Mohawk',
537
+ 'mdf' => 'Moksha',
538
+ 'mol' => 'Moldavian',
539
+ 'mkh' => 'Mon-Khmer (Other)',
540
+ 'lol' => 'Mongo',
541
+ 'mon' => 'Mongolian',
542
+ 'mos' => 'Mossi',
543
+ 'mul' => 'Multiple languages',
544
+ 'mun' => 'Munda languages',
545
+ 'nah' => 'Nahuatl',
546
+ 'nau' => 'Nauru',
547
+ 'nav' => 'Navaho; Navajo',
548
+ 'nde' => 'Ndebele, North',
549
+ 'nbl' => 'Ndebele, South',
550
+ 'ndo' => 'Ndonga',
551
+ 'nap' => 'Neapolitan',
552
+ 'nep' => 'Nepali',
553
+ 'new' => 'Newari',
554
+ 'nia' => 'Nias',
555
+ 'nic' => 'Niger-Kordofanian (Other)',
556
+ 'ssa' => 'Nilo-Saharan (Other)',
557
+ 'niu' => 'Niuean',
558
+ 'nog' => 'Nogai',
559
+ 'non' => 'Norse, Old',
560
+ 'nai' => 'North American Indian (Other)',
561
+ 'frr' => 'Northern Frisian',
562
+ 'sme' => 'Northern Sami',
563
+ 'nso' => 'Northern Sotho; Pedi; Sepedi',
564
+ 'nde' => 'North Ndebele',
565
+ 'nor' => 'Norwegian',
566
+ 'nob' => 'Norwegian Bokmal',
567
+ 'nno' => 'Norwegian Nynorsk',
568
+ 'nub' => 'Nubian languages',
569
+ 'nym' => 'Nyamwezi',
570
+ 'nya' => 'Nyanja',
571
+ 'nyn' => 'Nyankole',
572
+ 'nno' => 'Nynorsk, Norwegian',
573
+ 'nyo' => 'Nyoro',
574
+ 'nzi' => 'Nzima',
575
+ 'oci' => 'Occitan (post 1500)',
576
+ 'oji' => 'Ojibwa',
577
+ 'ori' => 'Oriya',
578
+ 'orm' => 'Oromo',
579
+ 'osa' => 'Osage',
580
+ 'oss' => 'Ossetian; Ossetic',
581
+ 'oto' => 'Otomian languages',
582
+ 'pal' => 'Pahlavi',
583
+ 'pau' => 'Palauan',
584
+ 'pli' => 'Pali',
585
+ 'pam' => 'Pampanga',
586
+ 'pag' => 'Pangasinan',
587
+ 'pan' => 'Panjabi',
588
+ 'pap' => 'Papiamento',
589
+ 'paa' => 'Papuan (Other)',
590
+ 'per' => 'Persian',
591
+ 'fas' => 'Persian',
592
+ 'peo' => 'Persian, Old (ca.600-400)',
593
+ 'phi' => 'Philippine (Other)',
594
+ 'phn' => 'Phoenician',
595
+ 'pon' => 'Pohnpeian',
596
+ 'pol' => 'Polish',
597
+ 'por' => 'Portuguese',
598
+ 'pra' => 'Prakrit languages',
599
+ 'oci' => 'Provencal',
600
+ 'pro' => 'Provencal, Old (to 1500)',
601
+ 'pan' => 'Punjabi',
602
+ 'pus' => 'Pushto',
603
+ 'que' => 'Quechua',
604
+ 'roh' => 'Raeto-Romance',
605
+ 'raj' => 'Rajasthani',
606
+ 'rap' => 'Rapanui',
607
+ 'rar' => 'Rarotongan',
608
+ 'qaa' => 'Reserved for local use',
609
+ 'qtz' => 'Reserved for local use',
610
+ 'roa' => 'Romance (Other)',
611
+ 'rum' => 'Romanian',
612
+ 'ron' => 'Romanian',
613
+ 'rom' => 'Romany',
614
+ 'run' => 'Rundi',
615
+ 'rus' => 'Russian',
616
+ 'sal' => 'Salishan languages',
617
+ 'sam' => 'Samaritan Aramaic',
618
+ 'smi' => 'Sami languages (Other)',
619
+ 'smo' => 'Samoan',
620
+ 'sad' => 'Sandawe',
621
+ 'sag' => 'Sango',
622
+ 'san' => 'Sanskrit',
623
+ 'sat' => 'Santali',
624
+ 'srd' => 'Sardinian',
625
+ 'sas' => 'Sasak',
626
+ 'nds' => 'Saxon, Low',
627
+ 'sco' => 'Scots',
628
+ 'gla' => 'Scottish Gaelic',
629
+ 'sel' => 'Selkup',
630
+ 'sem' => 'Semitic (Other)',
631
+ 'nso' => 'Sepedi; Northern Sotho; Pedi',
632
+ 'scc' => 'Serbian',
633
+ 'srp' => 'Serbian',
634
+ 'srr' => 'Serer',
635
+ 'shn' => 'Shan',
636
+ 'sna' => 'Shona',
637
+ 'iii' => 'Sichuan Yi',
638
+ 'scn' => 'Sicilian',
639
+ 'sid' => 'Sidamo',
640
+ 'sgn' => 'Sign languages',
641
+ 'bla' => 'Siksika',
642
+ 'snd' => 'Sindhi',
643
+ 'sin' => 'Sinhalese',
644
+ 'sit' => 'Sino-Tibetan (Other)',
645
+ 'sio' => 'Siouan languages',
646
+ 'sms' => 'Skolt Sami',
647
+ 'den' => 'Slave (Athapascan)',
648
+ 'sla' => 'Slavic (Other)',
649
+ 'slo' => 'Slovak',
650
+ 'slk' => 'Slovak',
651
+ 'slv' => 'Slovenian',
652
+ 'sog' => 'Sogdian',
653
+ 'som' => 'Somali',
654
+ 'son' => 'Songhai',
655
+ 'snk' => 'Soninke',
656
+ 'wen' => 'Sorbian languages',
657
+ 'nso' => 'Sotho, Northern',
658
+ 'sot' => 'Sotho, Southern',
659
+ 'sai' => 'South American Indian (Other)',
660
+ 'alt' => 'Southern Altai',
661
+ 'sma' => 'Southern Sami',
662
+ 'nbl' => 'South Ndebele',
663
+ 'spa' => 'Spanish',
664
+ 'srn' => 'Sranan Tongo',
665
+ 'suk' => 'Sukuma',
666
+ 'sux' => 'Sumerian',
667
+ 'sun' => 'Sundanese',
668
+ 'sus' => 'Susu',
669
+ 'swa' => 'Swahili',
670
+ 'ssw' => 'Swati',
671
+ 'swe' => 'Swedish',
672
+ 'gsw' => 'Swiss German; Alemanic',
673
+ 'syr' => 'Syriac',
674
+ 'tgl' => 'Tagalog',
675
+ 'tah' => 'Tahitian',
676
+ 'tai' => 'Tai (Other)',
677
+ 'tgk' => 'Tajik',
678
+ 'tmh' => 'Tamashek',
679
+ 'tam' => 'Tamil',
680
+ 'tat' => 'Tatar',
681
+ 'tel' => 'Telugu',
682
+ 'ter' => 'Tereno',
683
+ 'tet' => 'Tetum',
684
+ 'tha' => 'Thai',
685
+ 'tib' => 'Tibetan',
686
+ 'bod' => 'Tibetan',
687
+ 'tig' => 'Tigre',
688
+ 'tir' => 'Tigrinya',
689
+ 'tem' => 'Timne',
690
+ 'tiv' => 'Tiv',
691
+ 'tlh' => 'tlhIngan-Hol; Klingon',
692
+ 'tli' => 'Tlingit',
693
+ 'tpi' => 'Tok Pisin',
694
+ 'tkl' => 'Tokelau',
695
+ 'tog' => 'Tonga (Nyasa)',
696
+ 'ton' => 'Tonga (Tonga Islands)',
697
+ 'tsi' => 'Tsimshian',
698
+ 'tso' => 'Tsonga',
699
+ 'tsn' => 'Tswana',
700
+ 'tum' => 'Tumbuka',
701
+ 'tup' => 'Tupi languages',
702
+ 'tur' => 'Turkish',
703
+ 'ota' => 'Turkish, Ottoman (1500-1928)',
704
+ 'tuk' => 'Turkmen',
705
+ 'tvl' => 'Tuvalu',
706
+ 'tyv' => 'Tuvinian',
707
+ 'twi' => 'Twi',
708
+ 'udm' => 'Udmurt',
709
+ 'uga' => 'Ugaritic',
710
+ 'uig' => 'Uighur',
711
+ 'ukr' => 'Ukrainian',
712
+ 'umb' => 'Umbundu',
713
+ 'und' => 'Undetermined',
714
+ 'hsb' => 'Upper Sorbian',
715
+ 'urd' => 'Urdu',
716
+ 'uzb' => 'Uzbek',
717
+ 'vai' => 'Vai',
718
+ 'cat' => 'Valencian',
719
+ 'ven' => 'Venda',
720
+ 'vie' => 'Vietnamese',
721
+ 'vol' => 'Volapuk',
722
+ 'vot' => 'Votic',
723
+ 'wak' => 'Wakashan languages',
724
+ 'wal' => 'Walamo',
725
+ 'wln' => 'Walloon',
726
+ 'war' => 'Waray',
727
+ 'was' => 'Washo',
728
+ 'wel' => 'Welsh',
729
+ 'cym' => 'Welsh',
730
+ 'fry' => 'Wester Frisian',
731
+ 'wol' => 'Wolof',
732
+ 'xho' => 'Xhosa',
733
+ 'sah' => 'Yakut',
734
+ 'yao' => 'Yao',
735
+ 'yap' => 'Yapese',
736
+ 'yid' => 'Yiddish',
737
+ 'yor' => 'Yoruba',
738
+ 'ypk' => 'Yupik languages',
739
+ 'znd' => 'Zande',
740
+ 'zap' => 'Zapotec',
741
+ 'zen' => 'Zenaga',
742
+ 'zha' => 'Zhuang',
743
+ 'zul' => 'Zulu',
744
+ 'zun' => 'Zuni'
745
+ }
746
+
747
# Reports whether +value+ starts with a language code listed in ISO_LANG.
#
# Only the primary subtag (the text before the first '-') is looked up, and
# the lookup is case-insensitive, so "en", "EN" and "en-US" are all accepted
# when 'en' is in the table. Whatever follows the first '-' is not checked.
#
# value:: the language tag to check; nil and the empty string yield false.
#
# Returns true or false.
def is_valid_lang_code(value)
  # split yields [] for an empty string, hence the extra to_s on the subtag.
  primary = value.to_s.split('-', 2).first
  ISO_LANG.key?(primary.to_s.downcase)
end
755
+ end