spk-html5 0.10.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +10 -0
- data/Manifest.txt +73 -0
- data/README +45 -0
- data/Rakefile.rb +33 -0
- data/bin/html5 +7 -0
- data/lib/html5.rb +13 -0
- data/lib/html5/cli.rb +248 -0
- data/lib/html5/constants.rb +1061 -0
- data/lib/html5/filters/base.rb +10 -0
- data/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/html5/filters/iso639codes.rb +755 -0
- data/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/html5/filters/rfc2046.rb +31 -0
- data/lib/html5/filters/rfc3987.rb +91 -0
- data/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/html5/filters/validator.rb +834 -0
- data/lib/html5/filters/whitespace.rb +36 -0
- data/lib/html5/html5parser.rb +247 -0
- data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
- data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
- data/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
- data/lib/html5/html5parser/after_head_phase.rb +55 -0
- data/lib/html5/html5parser/before_head_phase.rb +44 -0
- data/lib/html5/html5parser/before_html_phase.rb +41 -0
- data/lib/html5/html5parser/in_body_phase.rb +636 -0
- data/lib/html5/html5parser/in_caption_phase.rb +69 -0
- data/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
- data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
- data/lib/html5/html5parser/in_head_phase.rb +143 -0
- data/lib/html5/html5parser/in_row_phase.rb +96 -0
- data/lib/html5/html5parser/in_select_phase.rb +90 -0
- data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
- data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
- data/lib/html5/html5parser/in_table_phase.rb +177 -0
- data/lib/html5/html5parser/initial_phase.rb +133 -0
- data/lib/html5/html5parser/phase.rb +171 -0
- data/lib/html5/inputstream.rb +735 -0
- data/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/html5/sanitizer.rb +209 -0
- data/lib/html5/serializer.rb +2 -0
- data/lib/html5/serializer/htmlserializer.rb +179 -0
- data/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/html5/sniffer.rb +45 -0
- data/lib/html5/tokenizer.rb +1059 -0
- data/lib/html5/treebuilders.rb +24 -0
- data/lib/html5/treebuilders/base.rb +339 -0
- data/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/html5/treebuilders/rexml.rb +215 -0
- data/lib/html5/treebuilders/simpletree.rb +191 -0
- data/lib/html5/treewalkers.rb +26 -0
- data/lib/html5/treewalkers/base.rb +162 -0
- data/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/html5/version.rb +3 -0
- data/test/preamble.rb +69 -0
- data/test/test_cli.rb +16 -0
- data/test/test_encoding.rb +35 -0
- data/test/test_input_stream.rb +26 -0
- data/test/test_lxp.rb +283 -0
- data/test/test_parser.rb +63 -0
- data/test/test_sanitizer.rb +173 -0
- data/test/test_serializer.rb +67 -0
- data/test/test_sniffer.rb +27 -0
- data/test/test_stream.rb +71 -0
- data/test/test_tokenizer.rb +95 -0
- data/test/test_treewalkers.rb +135 -0
- data/test/test_validator.rb +31 -0
- data/test/tokenizer_test_parser.rb +67 -0
- data/test19.rb +38 -0
- metadata +198 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'html5/filters/base'
|
2
|
+
|
3
|
+
module HTML5
|
4
|
+
module Filters
|
5
|
+
class InjectMetaCharset < Base
|
6
|
+
# Builds the filter around an upstream token source.
#
# source   - object handed to the superclass constructor; #each later reads
#            the tokens back through __getobj__.
# encoding - value written into the document's <meta> charset declaration
#            by #each; nil makes #each treat a charset meta as already
#            present.
def initialize(source, encoding)
  # The superclass constructor runs first, exactly as before, so that any
  # state it sets up exists before this filter records its own.
  super(source)
  @encoding = encoding
end
|
10
|
+
|
11
|
+
# Yields every token of the wrapped stream while making sure the document's
# <head> declares @encoding.
#
# * Every <meta> carrying a +charset+ attribute has that attribute replaced
#   by ["charset", @encoding].
# * Every <meta http-equiv="content-type"> without a +charset+ attribute has
#   its +content+ attribute rewritten to "text/html; charset=<encoding>".
#   This happens for each such element, regardless of whether an earlier
#   <meta> already declared the charset.
# * If the head contains neither, a <meta charset> EmptyTag is emitted right
#   after the <head> start tag. An empty head (a "head" EmptyTag) is expanded
#   into start tag, meta and end tag.
#
# Tokens seen between <head> and </head> are buffered, because whether a meta
# must be injected is only known once the whole head has been read. If the
# stream ends before </head>, the buffer is flushed instead of being dropped.
#
# When @encoding is nil the tokens are passed through untouched.
#
# Yields Hash tokens (:type, :name, :data). Returns an Enumerator when called
# without a block.
def each
  return to_enum(:each) unless block_given?

  # Nothing to declare: never write a nil charset into the document.
  if @encoding.nil?
    __getobj__.each { |token| yield token }
    return
  end

  state = :pre_head
  meta_found = false
  pending = []

  # Emits the buffered head: the <head> start tag first, then the injected
  # meta (only if the head did not declare a charset itself), then the rest.
  flush_pending = lambda do
    yield pending.shift
    unless meta_found
      yield :type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]]
    end
    yield pending.shift until pending.empty?
    meta_found = true
  end

  __getobj__.each do |token|
    case token[:type]
    when :StartTag
      state = :in_head if token[:name].downcase == "head"

    when :EmptyTag
      tag_name = token[:name].downcase

      if tag_name == "meta"
        attrs = token[:data] || []
        has_charset = false
        has_http_equiv_content_type = false
        content_index = nil

        # Single pass over the attributes; names are compared
        # case-insensitively and nil values are tolerated.
        attrs.each_with_index do |(attr_name, attr_value), i|
          case attr_name.to_s.downcase
          when "charset"
            attrs[i] = ["charset", @encoding]
            has_charset = true
          when "http-equiv"
            has_http_equiv_content_type = true if attr_value.to_s.downcase == "content-type"
          when "content"
            content_index = i
          end
        end

        if has_charset
          meta_found = true
        elsif has_http_equiv_content_type && content_index
          attrs[content_index][1] = 'text/html; charset=%s' % @encoding
          meta_found = true
        end

      elsif tag_name == "head" && !meta_found
        # insert meta into empty head
        yield :type => :StartTag, :name => "head", :data => token[:data]
        yield :type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]]
        yield :type => :EndTag, :name => "head"
        meta_found = true
        next
      end

    when :EndTag
      if token[:name].downcase == "head" && !pending.empty?
        # insert meta into head (if necessary) and flush pending queue
        flush_pending.call
        state = :post_head
      end
    end

    if state == :in_head
      pending << token
    else
      yield token
    end
  end

  # The stream ended inside <head>: emit what was buffered rather than
  # silently losing it.
  flush_pending.call unless pending.empty?
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
@@ -0,0 +1,755 @@
|
|
1
|
+
# borrowed from feedvalidator, original copyright license is
|
2
|
+
#
|
3
|
+
# Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
# of this software and associated documentation files (the "Software"), to deal
|
7
|
+
# in the Software without restriction, including without limitation the rights
|
8
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
# copies of the Software, and to permit persons to whom the Software is
|
10
|
+
# furnished to do so, subject to the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be included in all
|
13
|
+
# copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
# SOFTWARE.
|
22
|
+
|
23
|
+
# Lookup table of ISO 639 language codes and a validity check for language
# tags, intended to be mixed into classes that need to validate such tags.
module ISO639Codes

  # Maps lower-case ISO 639 two- and three-letter codes (plus the private
  # 'x' / 'xx' markers) to an English language name.
  #
  # The three-letter section is ordered by language name. Where the original
  # listing gave several names for one code as separate entries, only the
  # entry that was effective at runtime (the last one) is kept, so each key
  # appears exactly once and Ruby no longer warns about duplicated keys.
  # The table is frozen because it is shared, read-only data.
  ISO_LANG = {
    'aa' => 'Afar',
    'ab' => 'Abkhazian',
    'ae' => 'Avestan',
    'af' => 'Afrikaans',
    'ak' => 'Akan',
    'am' => 'Amharic',
    'an' => 'Aragonese',
    'ar' => 'Arabic',
    'as' => 'Assamese',
    'av' => 'Avaric',
    'ay' => 'Aymara',
    'az' => 'Azerbaijani',
    'ba' => 'Bashkir',
    'be' => 'Byelorussian',
    'bg' => 'Bulgarian',
    'bh' => 'Bihari',
    'bi' => 'Bislama',
    'bm' => 'Bambara',
    'bn' => 'Bengali;Bangla',
    'bo' => 'Tibetan',
    'br' => 'Breton',
    'bs' => 'Bosnian',
    'ca' => 'Catalan',
    'ce' => 'Chechen',
    'ch' => 'Chamorro',
    'co' => 'Corsican',
    'cr' => 'Cree',
    'cs' => 'Czech',
    'cu' => 'Church Slavic',
    'cv' => 'Chuvash',
    'cy' => 'Welsh',
    'da' => 'Danish',
    'de' => 'German',
    'dv' => 'Divehi',
    'dz' => 'Dzongkha',
    'ee' => 'Ewe',
    'el' => 'Greek',
    'en' => 'English',
    'eo' => 'Esperanto',
    'es' => 'Spanish',
    'et' => 'Estonian',
    'eu' => 'Basque',
    'fa' => 'Persian (Farsi)',
    'ff' => 'Fulah',
    'fi' => 'Finnish',
    'fj' => 'Fiji',
    'fo' => 'Faroese',
    'fr' => 'French',
    'fy' => 'Frisian, Western',
    'ga' => 'Irish',
    'gd' => 'Scots Gaelic',
    'gl' => 'Galician',
    'gn' => 'Guarani',
    'gu' => 'Gujarati',
    'gv' => 'Manx',
    'ha' => 'Hausa',
    'he' => 'Hebrew',
    'hi' => 'Hindi',
    'ho' => 'Hiri Motu',
    'hr' => 'Croatian',
    'ht' => 'Haitian',
    'hu' => 'Hungarian',
    'hy' => 'Armenian',
    'hz' => 'Herero',
    'ia' => 'Interlingua',
    'id' => 'Indonesian',
    'ie' => 'Interlingue',
    'ig' => 'Igbo',
    'ii' => 'Sichuan Yi',
    'ik' => 'Inupiak',
    'io' => 'Ido',
    'is' => 'Icelandic',
    'it' => 'Italian',
    'iu' => 'Inuktitut',
    'ja' => 'Japanese',
    'jv' => 'Javanese',
    'ka' => 'Georgian',
    'kg' => 'Kongo',
    'ki' => 'Kikuyu; Gikuyu',
    'kj' => 'Kuanyama; Kwanyama',
    'kk' => 'Kazakh',
    'kl' => 'Greenlandic',
    'km' => 'Cambodian',
    'kn' => 'Kannada',
    'ko' => 'Korean',
    'kr' => 'Kanuri',
    'ks' => 'Kashmiri',
    'ku' => 'Kurdish',
    'kv' => 'Komi',
    'kw' => 'Cornish',
    'ky' => 'Kirghiz',
    'la' => 'Latin',
    'lb' => 'Letzeburgesch; Luxembourgish',
    'lg' => 'Ganda',
    'li' => 'Limburgan; Limburger, Limburgish',
    'ln' => 'Lingala',
    'lo' => 'Lao',
    'lt' => 'Lithuanian',
    'lu' => 'Luba-Katanga',
    'lv' => 'Latvian',
    'mg' => 'Malagasy',
    'mh' => 'Marshallese',
    'mi' => 'Maori',
    'mk' => 'Macedonian',
    'ml' => 'Malayalam',
    'mn' => 'Mongolian',
    'mo' => 'Moldavian',
    'mr' => 'Marathi',
    'ms' => 'Malay',
    'mt' => 'Maltese',
    'my' => 'Burmese',
    'na' => 'Nauru',
    'nb' => 'Norwegian Bokmal',
    'nd' => 'Ndebele, North',
    'ne' => 'Nepali',
    'ng' => 'Ndonga',
    'nl' => 'Dutch',
    'nn' => 'Norwegian Nynorsk',
    'no' => 'Norwegian',
    'nr' => 'Ndebele, South',
    'nv' => 'Navaho; Navajo',
    'ny' => 'Chewa; Chichewa; Nyanha',
    'oc' => 'Occitan',
    'oj' => 'Ojibwa',
    'om' => 'Afan (Oromo)',
    'or' => 'Oriya',
    'os' => 'Ossetian; Ossetic',
    'pa' => 'Punjabi',
    'pi' => 'Pali',
    'pl' => 'Polish',
    'ps' => 'Pushto',
    'pt' => 'Portuguese',
    'qu' => 'Quechua',
    'rm' => 'Rhaeto-Romance',
    'rn' => 'Kurundi',
    'ro' => 'Romanian',
    'ru' => 'Russian',
    'rw' => 'Kinyarwanda',
    'sa' => 'Sanskrit',
    'sc' => 'Sardinian',
    'sd' => 'Sindhi',
    'se' => 'Northern Sami',
    'sg' => 'Sangho',
    'sh' => 'Serbo-Croatian',
    'si' => 'Singhalese',
    'sk' => 'Slovak',
    'sl' => 'Slovenian',
    'sm' => 'Samoan',
    'sn' => 'Shona',
    'so' => 'Somali',
    'sq' => 'Albanian',
    'sr' => 'Serbian',
    'ss' => 'Swati',
    'st' => 'Sotho, Southern',
    'su' => 'Sundanese',
    'sv' => 'Swedish',
    'sw' => 'Swahili',
    'ta' => 'Tamil',
    'te' => 'Telugu',
    'tg' => 'Tajik',
    'th' => 'Thai',
    'ti' => 'Tigrinya',
    'tk' => 'Turkmen',
    'tl' => 'Tagalog',
    'tn' => 'Tswana',
    'to' => 'Tonga',
    'tr' => 'Turkish',
    'ts' => 'Tsonga',
    'tt' => 'Tatar',
    'tw' => 'Twi',
    'ty' => 'Tahitian',
    'ug' => 'Uigur',
    'uk' => 'Ukrainian',
    'ur' => 'Urdu',
    'uz' => 'Uzbek',
    've' => 'Venda',
    'vi' => 'Vietnamese',
    'vo' => 'Volapuk',
    'wa' => 'Walloon',
    'wo' => 'Wolof',
    'xh' => 'Xhosa',
    'yi' => 'Yiddish',
    'yo' => 'Yoruba',
    'za' => 'Zhuang',
    'zh' => 'Chinese',
    'zu' => 'Zulu',
    'x' => 'a user-defined language',
    'xx' => 'a user-defined language',

    'abk' => 'Abkhazian',
    'ace' => 'Achinese',
    'ach' => 'Acoli',
    'ada' => 'Adangme',
    'ady' => 'Adyghe',
    'aar' => 'Afar',
    'afh' => 'Afrihili',
    'afr' => 'Afrikaans',
    'afa' => 'Afro-Asiatic (Other)',
    'ain' => 'Ainu',
    'aka' => 'Akan',
    'akk' => 'Akkadian',
    'alb' => 'Albanian',
    'sqi' => 'Albanian',
    'gws' => 'Alemanic',
    'ale' => 'Aleut',
    'alg' => 'Algonquian languages',
    'tut' => 'Altaic (Other)',
    'amh' => 'Amharic',
    'anp' => 'Angika',
    'apa' => 'Apache languages',
    'ara' => 'Arabic',
    'arg' => 'Aragonese',
    'arc' => 'Aramaic',
    'arp' => 'Arapaho',
    'arn' => 'Araucanian',
    'arw' => 'Arawak',
    'arm' => 'Armenian',
    'hye' => 'Armenian',
    'rup' => 'Aromanian',
    'art' => 'Artificial (Other)',
    'asm' => 'Assamese',
    'ath' => 'Athapascan languages',
    'aus' => 'Australian languages',
    'map' => 'Austronesian (Other)',
    'ava' => 'Avaric',
    'ave' => 'Avestan',
    'awa' => 'Awadhi',
    'aym' => 'Aymara',
    'aze' => 'Azerbaijani',
    'ast' => 'Bable',
    'ban' => 'Balinese',
    'bat' => 'Baltic (Other)',
    'bal' => 'Baluchi',
    'bam' => 'Bambara',
    'bai' => 'Bamileke languages',
    'bad' => 'Banda',
    'bnt' => 'Bantu (Other)',
    'bas' => 'Basa',
    'bak' => 'Bashkir',
    'baq' => 'Basque',
    'eus' => 'Basque',
    'btk' => 'Batak (Indonesia)',
    'bej' => 'Beja',
    'bel' => 'Belarusian',
    'bem' => 'Bemba',
    'ben' => 'Bengali',
    'ber' => 'Berber (Other)',
    'bho' => 'Bhojpuri',
    'bih' => 'Bihari',
    'bik' => 'Bikol',
    'bin' => 'Bini',
    'bis' => 'Bislama',
    'byn' => 'Blin',
    'bos' => 'Bosnian',
    'bra' => 'Braj',
    'bre' => 'Breton',
    'bug' => 'Buginese',
    'bul' => 'Bulgarian',
    'bua' => 'Buriat',
    'bur' => 'Burmese',
    'mya' => 'Burmese',
    'cad' => 'Caddo',
    'car' => 'Carib',
    'cau' => 'Caucasian (Other)',
    'ceb' => 'Cebuano',
    'cel' => 'Celtic (Other)',
    'cai' => 'Central American Indian (Other)',
    'chg' => 'Chagatai',
    'cmc' => 'Chamic languages',
    'cha' => 'Chamorro',
    'che' => 'Chechen',
    'chr' => 'Cherokee',
    'chy' => 'Cheyenne',
    'chb' => 'Chibcha',
    'chi' => 'Chinese',
    'zho' => 'Chinese',
    'chn' => 'Chinook jargon',
    'chp' => 'Chipewyan',
    'cho' => 'Choctaw',
    'chu' => 'Church Slavic; Church Slavonic; Old Church Slavonic; Old Church Slavic; Old Bulgarian',
    'chk' => 'Chuukese',
    'chv' => 'Chuvash',
    'nwc' => 'Classical Nepal Bhasa; Classical Newari; Old Newari',
    'cop' => 'Coptic',
    'cor' => 'Cornish',
    'cos' => 'Corsican',
    'cre' => 'Cree',
    'mus' => 'Creek',
    'crp' => 'Creoles and pidgins(Other)',
    'cpe' => 'Creoles and pidgins, English-based (Other)',
    'cpf' => 'Creoles and pidgins, French-based (Other)',
    'cpp' => 'Creoles and pidgins, Portuguese-based (Other)',
    'crh' => 'Crimean Tatar; Crimean Turkish',
    'scr' => 'Croatian',
    'hrv' => 'Croatian',
    'cus' => 'Cushitic (Other)',
    'cze' => 'Czech',
    'ces' => 'Czech',
    'dak' => 'Dakota',
    'dan' => 'Danish',
    'dar' => 'Dargwa',
    'day' => 'Dayak',
    'del' => 'Delaware',
    'din' => 'Dinka',
    'div' => 'Divehi',
    'doi' => 'Dogri',
    'dgr' => 'Dogrib',
    'dra' => 'Dravidian (Other)',
    'dua' => 'Duala',
    'dut' => 'Dutch',
    'nld' => 'Dutch',
    'dum' => 'Dutch, Middle (ca. 1050-1350)',
    'dyu' => 'Dyula',
    'dzo' => 'Dzongkha',
    'efi' => 'Efik',
    'egy' => 'Egyptian (Ancient)',
    'eka' => 'Ekajuk',
    'elx' => 'Elamite',
    'eng' => 'English',
    'enm' => 'English, Middle (1100-1500)',
    'ang' => 'English, Old (ca.450-1100)',
    'myv' => 'Erzya',
    'epo' => 'Esperanto',
    'est' => 'Estonian',
    'ewe' => 'Ewe',
    'ewo' => 'Ewondo',
    'fan' => 'Fang',
    'fat' => 'Fanti',
    'fao' => 'Faroese',
    'fij' => 'Fijian',
    'fil' => 'Filipino; Pilipino',
    'fin' => 'Finnish',
    'fiu' => 'Finno-Ugrian (Other)',
    'fon' => 'Fon',
    'fre' => 'French',
    'fra' => 'French',
    'frm' => 'French, Middle (ca.1400-1600)',
    'fro' => 'French, Old (842-ca.1400)',
    'frs' => 'Frisian, Eastern',
    'fur' => 'Friulian',
    'ful' => 'Fulah',
    'gaa' => 'Ga',
    'glg' => 'Gallegan',
    'lug' => 'Ganda',
    'gay' => 'Gayo',
    'gba' => 'Gbaya',
    'gez' => 'Geez',
    'geo' => 'Georgian',
    'kat' => 'Georgian',
    'ger' => 'German',
    'deu' => 'German',
    'gmh' => 'German, Middle High (ca.1050-1500)',
    'goh' => 'German, Old High (ca.750-1050)',
    'gem' => 'Germanic (Other)',
    'gil' => 'Gilbertese',
    'gon' => 'Gondi',
    'gor' => 'Gorontalo',
    'got' => 'Gothic',
    'grb' => 'Grebo',
    'grc' => 'Greek, Ancient (to 1453)',
    'gre' => 'Greek, Modern (1453-)',
    'ell' => 'Greek, Modern (1453-)',
    'grn' => 'Guarani',
    'guj' => 'Gujarati',
    'gwi' => 'Gwich\'in',
    'hai' => 'Haida',
    'hat' => 'Haitian',
    'hau' => 'Hausa',
    'haw' => 'Hawaiian',
    'heb' => 'Hebrew',
    'her' => 'Herero',
    'hil' => 'Hiligaynon',
    'him' => 'Himachali',
    'hin' => 'Hindi',
    'hmo' => 'Hiri Motu',
    'hit' => 'Hittite',
    'hmn' => 'Hmong',
    'hun' => 'Hungarian',
    'hup' => 'Hupa',
    'iba' => 'Iban',
    'ice' => 'Icelandic',
    'isl' => 'Icelandic',
    'ido' => 'Ido',
    'ibo' => 'Igbo',
    'ijo' => 'Ijo',
    'ilo' => 'Iloko',
    'smn' => 'Inari Sami',
    'inc' => 'Indic (Other)',
    'ine' => 'Indo-European (Other)',
    'ind' => 'Indonesian',
    'inh' => 'Ingush',
    'ina' => 'Interlingua (International Auxiliary Language Association)',
    'ile' => 'Interlingue',
    'iku' => 'Inuktitut',
    'ipk' => 'Inupiaq',
    'ira' => 'Iranian (Other)',
    'gle' => 'Irish',
    'mga' => 'Irish, Middle (900-1200)',
    'sga' => 'Irish, Old (to 900)',
    'iro' => 'Iroquoian languages',
    'ita' => 'Italian',
    'jpn' => 'Japanese',
    'jav' => 'Javanese',
    'jrb' => 'Judeo-Arabic',
    'jpr' => 'Judeo-Persian',
    'kbd' => 'Kabardian',
    'kab' => 'Kabyle',
    'kac' => 'Kachin',
    'kal' => 'Kalaallisut',
    'xal' => 'Kalmyk',
    'kam' => 'Kamba',
    'kan' => 'Kannada',
    'kau' => 'Kanuri',
    'krc' => 'Karachay-Balkar',
    'kaa' => 'Kara-Kalpak',
    'krl' => 'Karelian',
    'kar' => 'Karen',
    'kas' => 'Kashmiri',
    'csb' => 'Kashubian',
    'kaw' => 'Kawi',
    'kaz' => 'Kazakh',
    'kha' => 'Khasi',
    'khm' => 'Khmer',
    'khi' => 'Khoisan (Other)',
    'kho' => 'Khotanese',
    'kik' => 'Kikuyu',
    'kmb' => 'Kimbundu',
    'kin' => 'Kinyarwanda',
    'kir' => 'Kirghiz',
    'kom' => 'Komi',
    'kon' => 'Kongo',
    'kok' => 'Konkani',
    'kor' => 'Korean',
    'kos' => 'Kosraean',
    'kpe' => 'Kpelle',
    'kro' => 'Kru',
    'kum' => 'Kumyk',
    'kur' => 'Kurdish',
    'kru' => 'Kurukh',
    'kut' => 'Kutenai',
    'kua' => 'Kwanyama',
    'lad' => 'Ladino',
    'lah' => 'Lahnda',
    'lam' => 'Lamba',
    'lao' => 'Lao',
    'lat' => 'Latin',
    'lav' => 'Latvian',
    'lez' => 'Lezghian',
    'lim' => 'Limburgan',
    'lin' => 'Lingala',
    'lit' => 'Lithuanian',
    'jbo' => 'Lojban',
    'dsb' => 'Lower Sorbian',
    'loz' => 'Lozi',
    'lub' => 'Luba-Katanga',
    'lua' => 'Luba-Lulua',
    'lui' => 'Luiseno',
    'smj' => 'Lule Sami',
    'lun' => 'Lunda',
    'luo' => 'Luo (Kenya and Tanzania)',
    'lus' => 'Lushai',
    'ltz' => 'Luxembourgish',
    'mac' => 'Macedonian',
    'mkd' => 'Macedonian',
    'mad' => 'Madurese',
    'mag' => 'Magahi',
    'mai' => 'Maithili',
    'mak' => 'Makasar',
    'mlg' => 'Malagasy',
    'may' => 'Malay',
    'msa' => 'Malay',
    'mal' => 'Malayalam',
    'mlt' => 'Maltese',
    'mnc' => 'Manchu',
    'mdr' => 'Mandar',
    'man' => 'Mandingo',
    'mni' => 'Manipuri',
    'mno' => 'Manobo languages',
    'glv' => 'Manx',
    'mao' => 'Maori',
    'mri' => 'Maori',
    'mar' => 'Marathi',
    'chm' => 'Mari',
    'mah' => 'Marshallese',
    'mwr' => 'Marwari',
    'mas' => 'Masai',
    'myn' => 'Mayan languages',
    'men' => 'Mende',
    'mic' => 'Micmac',
    'min' => 'Minangkabau',
    'mwl' => 'Mirandese',
    'mis' => 'Miscellaneous languages',
    'moh' => 'Mohawk',
    'mdf' => 'Moksha',
    'mol' => 'Moldavian',
    'mkh' => 'Mon-Khmer (Other)',
    'lol' => 'Mongo',
    'mon' => 'Mongolian',
    'mos' => 'Mossi',
    'mul' => 'Multiple languages',
    'mun' => 'Munda languages',
    'nah' => 'Nahuatl',
    'nau' => 'Nauru',
    'nav' => 'Navaho; Navajo',
    'ndo' => 'Ndonga',
    'nap' => 'Neapolitan',
    'nep' => 'Nepali',
    'new' => 'Newari',
    'nia' => 'Nias',
    'nic' => 'Niger-Kordofanian (Other)',
    'ssa' => 'Nilo-Saharan (Other)',
    'niu' => 'Niuean',
    'nog' => 'Nogai',
    'non' => 'Norse, Old',
    'nai' => 'North American Indian (Other)',
    'frr' => 'Northern Frisian',
    'sme' => 'Northern Sami',
    'nde' => 'North Ndebele',
    'nor' => 'Norwegian',
    'nob' => 'Norwegian Bokmal',
    'nub' => 'Nubian languages',
    'nym' => 'Nyamwezi',
    'nya' => 'Nyanja',
    'nyn' => 'Nyankole',
    'nno' => 'Nynorsk, Norwegian',
    'nyo' => 'Nyoro',
    'nzi' => 'Nzima',
    'oji' => 'Ojibwa',
    'ori' => 'Oriya',
    'orm' => 'Oromo',
    'osa' => 'Osage',
    'oss' => 'Ossetian; Ossetic',
    'oto' => 'Otomian languages',
    'pal' => 'Pahlavi',
    'pau' => 'Palauan',
    'pli' => 'Pali',
    'pam' => 'Pampanga',
    'pag' => 'Pangasinan',
    'pap' => 'Papiamento',
    'paa' => 'Papuan (Other)',
    'per' => 'Persian',
    'fas' => 'Persian',
    'peo' => 'Persian, Old (ca.600-400)',
    'phi' => 'Philippine (Other)',
    'phn' => 'Phoenician',
    'pon' => 'Pohnpeian',
    'pol' => 'Polish',
    'por' => 'Portuguese',
    'pra' => 'Prakrit languages',
    'oci' => 'Provencal',
    'pro' => 'Provencal, Old (to 1500)',
    'pan' => 'Punjabi',
    'pus' => 'Pushto',
    'que' => 'Quechua',
    'roh' => 'Raeto-Romance',
    'raj' => 'Rajasthani',
    'rap' => 'Rapanui',
    'rar' => 'Rarotongan',
    'qaa' => 'Reserved for local use',
    'qtz' => 'Reserved for local use',
    'roa' => 'Romance (Other)',
    'rum' => 'Romanian',
    'ron' => 'Romanian',
    'rom' => 'Romany',
    'run' => 'Rundi',
    'rus' => 'Russian',
    'sal' => 'Salishan languages',
    'sam' => 'Samaritan Aramaic',
    'smi' => 'Sami languages (Other)',
    'smo' => 'Samoan',
    'sad' => 'Sandawe',
    'sag' => 'Sango',
    'san' => 'Sanskrit',
    'sat' => 'Santali',
    'srd' => 'Sardinian',
    'sas' => 'Sasak',
    'nds' => 'Saxon, Low',
    'sco' => 'Scots',
    'gla' => 'Scottish Gaelic',
    'sel' => 'Selkup',
    'sem' => 'Semitic (Other)',
    'scc' => 'Serbian',
    'srp' => 'Serbian',
    'srr' => 'Serer',
    'shn' => 'Shan',
    'sna' => 'Shona',
    'iii' => 'Sichuan Yi',
    'scn' => 'Sicilian',
    'sid' => 'Sidamo',
    'sgn' => 'Sign languages',
    'bla' => 'Siksika',
    'snd' => 'Sindhi',
    'sin' => 'Sinhalese',
    'sit' => 'Sino-Tibetan (Other)',
    'sio' => 'Siouan languages',
    'sms' => 'Skolt Sami',
    'den' => 'Slave (Athapascan)',
    'sla' => 'Slavic (Other)',
    'slo' => 'Slovak',
    'slk' => 'Slovak',
    'slv' => 'Slovenian',
    'sog' => 'Sogdian',
    'som' => 'Somali',
    'son' => 'Songhai',
    'snk' => 'Soninke',
    'wen' => 'Sorbian languages',
    'nso' => 'Sotho, Northern',
    'sot' => 'Sotho, Southern',
    'sai' => 'South American Indian (Other)',
    'alt' => 'Southern Altai',
    'sma' => 'Southern Sami',
    'nbl' => 'South Ndebele',
    'spa' => 'Spanish',
    'srn' => 'Sranan Tongo',
    'suk' => 'Sukuma',
    'sux' => 'Sumerian',
    'sun' => 'Sundanese',
    'sus' => 'Susu',
    'swa' => 'Swahili',
    'ssw' => 'Swati',
    'swe' => 'Swedish',
    'gsw' => 'Swiss German; Alemanic',
    'syr' => 'Syriac',
    'tgl' => 'Tagalog',
    'tah' => 'Tahitian',
    'tai' => 'Tai (Other)',
    'tgk' => 'Tajik',
    'tmh' => 'Tamashek',
    'tam' => 'Tamil',
    'tat' => 'Tatar',
    'tel' => 'Telugu',
    'ter' => 'Tereno',
    'tet' => 'Tetum',
    'tha' => 'Thai',
    'tib' => 'Tibetan',
    'bod' => 'Tibetan',
    'tig' => 'Tigre',
    'tir' => 'Tigrinya',
    'tem' => 'Timne',
    'tiv' => 'Tiv',
    'tlh' => 'tlhIngan-Hol; Klingon',
    'tli' => 'Tlingit',
    'tpi' => 'Tok Pisin',
    'tkl' => 'Tokelau',
    'tog' => 'Tonga (Nyasa)',
    'ton' => 'Tonga (Tonga Islands)',
    'tsi' => 'Tsimshian',
    'tso' => 'Tsonga',
    'tsn' => 'Tswana',
    'tum' => 'Tumbuka',
    'tup' => 'Tupi languages',
    'tur' => 'Turkish',
    'ota' => 'Turkish, Ottoman (1500-1928)',
    'tuk' => 'Turkmen',
    'tvl' => 'Tuvalu',
    'tyv' => 'Tuvinian',
    'twi' => 'Twi',
    'udm' => 'Udmurt',
    'uga' => 'Ugaritic',
    'uig' => 'Uighur',
    'ukr' => 'Ukrainian',
    'umb' => 'Umbundu',
    'und' => 'Undetermined',
    'hsb' => 'Upper Sorbian',
    'urd' => 'Urdu',
    'uzb' => 'Uzbek',
    'vai' => 'Vai',
    'cat' => 'Valencian',
    'ven' => 'Venda',
    'vie' => 'Vietnamese',
    'vol' => 'Volapuk',
    'vot' => 'Votic',
    'wak' => 'Wakashan languages',
    'wal' => 'Walamo',
    'wln' => 'Walloon',
    'war' => 'Waray',
    'was' => 'Washo',
    'wel' => 'Welsh',
    'cym' => 'Welsh',
    'fry' => 'Western Frisian',
    'wol' => 'Wolof',
    'xho' => 'Xhosa',
    'sah' => 'Yakut',
    'yao' => 'Yao',
    'yap' => 'Yapese',
    'yid' => 'Yiddish',
    'yor' => 'Yoruba',
    'ypk' => 'Yupik languages',
    'znd' => 'Zande',
    'zap' => 'Zapotec',
    'zen' => 'Zenaga',
    'zul' => 'Zulu',
    'zha' => 'Zhuang',
    'zun' => 'Zuni'
  }.freeze

  # Reports whether the primary subtag of a language tag is a known code.
  #
  # value - a language tag such as "en", "en-US" or "FR-ca". Only the part
  #         before the first '-' is checked, case-insensitively; anything
  #         after it is ignored. nil and other non-String values are
  #         converted with #to_s instead of raising.
  #
  # Returns true when that primary subtag is a key of ISO_LANG, false
  # otherwise (including for an empty tag or one that starts with '-').
  def is_valid_lang_code(value)
    # split returns [] for an empty string, hence the trailing to_s.
    primary = value.to_s.split('-', 2).first.to_s
    ISO_LANG.key?(primary.downcase)
  end
end
|