lang 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,9 @@
1
module Lang #:nodoc:
  module Subtags
    # Entry subclass that holds data about region subtags. It defines
    # nothing of its own; all behaviour is inherited from Entry.
    class Region < Entry; end
  end
end
8
+
9
+ # EOF
@@ -0,0 +1,9 @@
1
module Lang #:nodoc:
  module Subtags
    # Entry subclass that holds data about script subtags. It defines
    # nothing of its own; all behaviour is inherited from Entry.
    class Script < Entry; end
  end
end
8
+
9
+ # EOF
@@ -0,0 +1,17 @@
1
module Lang #:nodoc:
  module Subtags
    # Entry subclass that holds data about variant subtags, including the
    # prefixes collected for the variant.
    class Variant < Entry
      # Prefixes added so far, in insertion order; +nil+ until the first
      # call to #add_prefix.
      attr_reader :prefixes

      # Appends +prefix+ to the list of prefixes, creating the list on
      # first use. Returns the list.
      def add_prefix(prefix)
        (@prefixes ||= []) << prefix
      end
    end
  end
end
16
+
17
+ # EOF
data/lib/lang/tag.rb ADDED
@@ -0,0 +1,141 @@
1
+ require 'lang/tag/pattern'
2
+ require 'lang/tag/composition'
3
+ require 'lang/tag/langtag'
4
+ require 'lang/tag/grandfathered'
5
+ require 'lang/tag/privateuse'
6
+
7
+ module Lang
8
+
9
# Converts +thing+ into a tag object by trying the three constructors in
# order: Tag::Grandfathered, then Tag::Langtag, then Tag::Privateuse. The
# result of the first one that does not raise a StandardError is returned.
#
# Raises ArgumentError when all three fail.
def self.Tag(thing)
  #return thing if Tag::Composition === thing
  Tag::Grandfathered(thing)
rescue StandardError
  begin
    Tag::Langtag(thing)
  rescue StandardError
    begin
      Tag::Privateuse(thing)
    rescue StandardError
      raise ArgumentError, "#{thing.inspect} is not a language tag."
    end
  end
end
17
+
18
+ module Tag
19
+
20
# Base class of the exceptions declared in this namespace.
class Error < StandardError; end

# Signals an invalid tag component, e.g. an omitted language.
class InvalidComponentError < Error; end
25
+
26
+ #--
27
+ # Grandfathered tags that do not match the 'langtag' production in the
28
+ # ABNF and would otherwise be invalid are considered 'irregular'
29
+ # grandfathered tags. With the exception of "en-GB-oed", which is a
30
+ # variant of "en-GB", each of them, in its entirety, represents a
31
+ # language.
32
+ #++
33
+
34
# Irregular grandfathered tags, lower-cased, mapped to the value stored
# for each of them (+nil+ where none is recorded).
IRREGULAR = {
  'en-gb-oed' => nil,
  'i-ami' => 'ami',
  'i-bnn' => 'bnn',
  'i-default' => nil,
  'i-enochian' => nil,
  'i-hak' => 'hak',
  'i-klingon' => 'tlh',
  'i-lux' => 'lb',
  'i-mingo' => nil,
  'i-navajo' => 'nv',
  'i-pwn' => 'pwn',
  'i-tao' => 'tao',
  'i-tay' => 'tay',
  'i-tsu' => 'tsu',
  'sgn-be-fr' => 'sfb',
  'sgn-be-nl' => 'vgt',
  'sgn-ch-de' => 'sgg'
}.freeze

#--
# Grandfathered tags that (appear to) match the 'langtag' production in
# Figure 1 are considered 'regular' grandfathered tags. These tags
# contain one or more subtags that either do not individually appear in
# the registry or appear but with a different semantic meaning: each
# tag, in its entirety, represents a language or collection of
# languages.
#++

# Every grandfathered tag: the irregular ones above followed by the
# regular ones listed here, in the same key => value layout.
GRANDFATHERED = IRREGULAR.merge({
  'art-lojban' => 'jbo',
  'cel-gaulish' => nil,
  'no-bok' => 'nb',
  'no-nyn' => 'nn',
  'zh-guoyu' => 'cmn',
  'zh-hakka' => 'hak',
  'zh-min' => nil,
  'zh-min-nan' => 'nan',
  'zh-xiang' => 'hsn'
}).freeze
74
+
75
HYPHEN = '-'.freeze
# A Regexp on interpreters older than 1.9.1, the plain HYPHEN string
# otherwise (presumably the separator given to String#split -- confirm
# against the callers).
HYPHEN_SPLITTER = RUBY_VERSION < '1.9.1' ? /-/.freeze : HYPHEN
PRIVATEUSE = 'x'.freeze

# Whole-string validators for single components. They are anchored with
# \A and \z rather than ^ and $: the line anchors would accept a
# multi-line string as soon as one of its lines matched, and would also
# tolerate a trailing newline.
LANGUAGE_REGEX = /\A(?:#{PATTERN::LANGUAGE})\z/io.freeze
SCRIPT_REGEX = /\A(?:#{PATTERN::SCRIPT})\z/io.freeze
REGION_REGEX = /\A(?:#{PATTERN::REGION})\z/io.freeze
VARIANTS_SEQUENCE_REGEX = /\A(?:#{PATTERN::VARIANT_SEQUENCE}+)\z/io.freeze
EXTENSIONS_SEQUENCE_REGEX = /\A#{PATTERN::EXTENSION_SEQUENCE}+\z/io.freeze
# Not a validator: marks the boundary in front of every singleton so an
# extension sequence can be cut into its individual extensions. Left
# exactly as it was.
EXTENSIONS_SEQUENCE_SPLITTER = /(?:^|-)(?=#{PATTERN::SINGLETON}-)/io.freeze
PRIVATEUSE_REGEX = /\A#{PATTERN::PRIVATEUSE}\z/io.freeze

# Decomposes a langtag. Captures: 1 = language, 2 = script, 3 = region,
# 4 = variant sequence, 5 = extension sequence. The privateuse part is
# only looked ahead at, not consumed.
LANGTAG_REGEX = /\A
  (#{PATTERN::LANGUAGE})              (?# shortest ISO 639 code plus extlang or reserved for future use or registered language subtag)
  (?:-(#{PATTERN::SCRIPT}))?          (?# ISO 15924 code)
  (?:-(#{PATTERN::REGION}))?          (?# ISO 3166-1 code or UN M.49 code)
  (#{PATTERN::VARIANT_SEQUENCE}*)?    (?# registered variants)
  (#{PATTERN::EXTENSION_SEQUENCE}*)?  (?# extensions)
  (?=(?:-#{PATTERN::PRIVATEUSE})?\z)  (?# privateuse)
/iox.freeze

# Well-formedness check only: built from the loose language pattern and
# without any capture groups.
LANGTAG_WELLFORMEDNESS_REGEX = /\A
  (?:#{PATTERN::LOOSE_LANGUAGE})      (?# shortest ISO 639 code plus at most 3 extlangs or reserved for future use or registered language subtag)
  (?:-(?:#{PATTERN::SCRIPT}))?        (?# ISO 15924 code)
  (?:-(?:#{PATTERN::REGION}))?        (?# ISO 3166-1 code or UN M.49 code)
  (?=#{PATTERN::VARIANT_SEQUENCE}*    (?# registered variants)
  #{PATTERN::EXTENSION_SEQUENCE}*     (?# extensions)
  (?:-#{PATTERN::PRIVATEUSE})?\z)     (?# privateuse)
/iox.freeze
103
+
104
class << self

  # Checks if the +String+ passed represents a 'privateuse' language tag.
  # Works case-insensitively.
  #
  def privateuse?(snippet)
    PRIVATEUSE_REGEX === snippet
  end

  # Checks if the +String+ passed represents a 'grandfathered' language tag.
  # Works case-insensitively. Anything that cannot be lower-cased (such as
  # +nil+) is simply not a grandfathered tag, so +false+ is returned for it
  # instead of failing with NoMethodError.
  #
  def grandfathered?(snippet)
    return false unless snippet.respond_to?(:downcase)
    # Every key of GRANDFATHERED is lower-case, so one lookup is enough.
    GRANDFATHERED.key?(snippet.downcase)
  end

  #--
  # RFC 5646, Section 2.2.9:
  # A tag is considered "well-formed" if it conforms to the ABNF
  # (Section 2.1). Language tags may be well-formed in terms of syntax
  # but not valid in terms of content. However, many operations
  # involving language tags work well without knowing anything about the
  # meaning or validity of the subtags.
  #++

  # Checks if the +String+ passed represents a well-formed language tag,
  # i.e. a privateuse tag, a grandfathered tag or something matching the
  # langtag well-formedness pattern. Works case-insensitively.
  #
  def wellformed?(snippet)
    privateuse?(snippet) || grandfathered?(snippet) || LANGTAG_WELLFORMEDNESS_REGEX === snippet
  end

end
137
+
138
+ end
139
+ end
140
+
141
+ # EOF
@@ -0,0 +1,376 @@
1
+ require 'lang/tag'
2
+ require 'lang/subtags'
3
+
4
+ module Lang #:nodoc:
5
+ module Tag
6
+
7
+ module Canonicalization
8
+
9
# Raised for problems that come up while a tag is being canonicalized
# or validated. Inherits from the Error that is visible from the
# enclosing scope at the time this class is first defined.
#
class Error < Error; end
14
+
15
+ #--
16
+ # RFC 5646, Section 2.2.1
17
+ # The subtags in the range 'qaa' through 'qtz' are reserved for
18
+ # private use in language tags. These subtags correspond to codes
19
+ # reserved by ISO 639-2 for private use. These codes MAY be used
20
+ # for non-registered primary language subtags (instead of using
21
+ # private use subtags following 'x-').
22
+ #++
23
+
24
# Matches the private-use primary language subtags 'qaa' through 'qtz',
# case-insensitively. Anchored with \A and \z so that only the whole
# string can match; ^ and $ would also accept a matching line inside a
# multi-line string, or a trailing newline.
PRIVATE_LANGUAGE_REGEX = /\Aq[a-t][a-z]\z/i.freeze
25
+
26
+ #--
27
+ # RFC 5646, Section 3.1.7
28
+ # Extended language subtags always have a mapping to their
29
+ # identical primary language subtag. For example, the extended
30
+ # language subtag 'yue' (Cantonese) can be used to form the tag
31
+ # "zh-yue". It has a 'Preferred-Value' mapping to the primary
32
+ # language subtag 'yue', meaning that a tag such as
33
+ # "zh-yue-Hant-HK" can be canonicalized to "yue-Hant-HK".
34
+ #++
35
+
36
# Canonicalizes the language component, applying the rules that are
# described in RFC 5646, sections 2.2.1, 2.2.2 and 4.5. Also validates
# the language sequence using the 'Prefix' field-value of the extlang.
#
# Returns +nil+. Raises InvalidComponentError when no language is set,
# and Error when the extlang or the primary language is not registered
# or when the extlang is combined with a primary language other than the
# prefix named by its record.
#
def canonicalize_language
  raise InvalidComponentError, "Language can not be omitted." unless @language
  decompose_language unless @primary

  if @extlang
    record = Subtags::Extlang(@extlang)
    raise Error, "Extlang #{@extlang.inspect} is not registered." unless record

    # RFC 5646, Section 3.4: an extlang record has exactly one 'Prefix'.
    # The primary language is accepted as written or lower-cased.
    required = record.prefix
    if required != @primary && required != @primary.downcase
      raise Error, "Extlang #{@extlang.inspect} requires prefix #{required.inspect}."
    end

    # RFC 5646, Sections 2.2.2 and 4.5: the 'Preferred-Value' of the
    # extlang replaces the whole language-extlang sequence.
    @language = record.preferred_value
    @primary = nil
    @extlang = nil
    dirty

  elsif PRIVATE_LANGUAGE_REGEX !~ @primary
    # Private-use primary languages are skipped; anything else has to be
    # registered and is swapped for its 'Preferred-Value' when it has one.
    record = Subtags::Language(@primary)
    raise Error, "Language #{@primary.inspect} is not registered." unless record

    preferred = record.preferred_value
    if preferred
      @language = preferred
      @primary = nil
      dirty
    end
  end

  nil
end

protected :canonicalize_language
92
+
93
+ #--
94
+ # RFC 5646, Section 2.2.3
95
+ # The script subtags 'Qaaa' through 'Qabx' are reserved for private
96
+ # use in language tags. These subtags correspond to codes reserved
97
+ # by ISO 15924 for private use. These codes MAY be used for non-
98
+ # registered script values. Please refer to Section 4.6 for more
99
+ # information on private use subtags.
100
+ #++
101
+
102
# Matches the private-use script subtags 'Qaaa' through 'Qabx',
# case-insensitively. That range is all of 'Qaaa'..'Qaaz' plus
# 'Qaba'..'Qabx', so the last letter may only be limited to a-x when the
# third letter is 'b' ('Qaay' and 'Qaaz' are inside the range).
# Anchored with \A and \z so that only the whole string can match.
PRIVATE_SCRIPT_REGEX = /\AQa(?:a[a-z]|b[a-x])\z/i.freeze
103
+
104
# Replaces the script by the 'Preferred-Value' of its record, if there
# is one. Does nothing when no script is set or when the script is a
# private-use one.
#
# Returns +nil+. Raises Error when the script is not registered.
#
def canonicalize_script
  return unless @script
  return if PRIVATE_SCRIPT_REGEX === @script

  record = Subtags::Script(@script)
  raise Error, "Script #{@script.inspect} is not registered." unless record

  preferred = record.preferred_value
  if preferred
    @script = preferred
    dirty
  end
  nil
end

protected :canonicalize_script
116
+
117
+ #--
118
+ # RFC 5646, Section 2.2.4
119
+ # The region subtags 'AA', 'QM'-'QZ', 'XA'-'XZ', and 'ZZ' are
120
+ # reserved for private use in language tags. These subtags
121
+ # correspond to codes reserved by ISO 3166 for private use. These
122
+ # codes MAY be used for private use region subtags (instead of
123
+ # using a private use subtag sequence). Please refer to
124
+ # Section 4.6 for more information on private use subtags.
125
+ #++
126
+
127
# Matches the private-use region subtags 'AA', 'QM'-'QZ', 'XA'-'XZ' and
# 'ZZ', case-insensitively. Anchored with \A and \z so that only the
# whole string can match; ^ and $ would also accept a matching line
# inside a multi-line string, or a trailing newline.
PRIVATE_REGION_REGEX = /\A(?:AA|Q[M-Z]|X[A-Z]|ZZ)\z/i.freeze
128
+
129
+ #--
130
+ # RFC 5646, Section 4.5
131
+ # Example: Although the tag "en-BU" (English as used in Burma)
132
+ # maintains its validity, the language tag "en-BU" is not in canonical
133
+ # form because the 'BU' subtag has a canonical mapping to 'MM'
134
+ # (Myanmar).
135
+ #++
136
+
137
# Replaces the region by the 'Preferred-Value' of its record, if there
# is one. Does nothing when no region is set or when the region is a
# private-use one.
#
# Returns +nil+. Raises Error when the region is not registered.
#
def canonicalize_region
  return unless @region
  return if PRIVATE_REGION_REGEX === @region

  record = Subtags::Region(@region)
  raise Error, "Region #{@region.inspect} is not registered." unless record

  preferred = record.preferred_value
  if preferred
    @region = preferred
    dirty
  end
  nil
end

protected :canonicalize_region
149
+
150
+ #--
151
+ # RFC 5646, Section 3.1.8
152
+ # The 'Prefix' also indicates when variant subtags make sense when used
153
+ # together (many that otherwise share a 'Prefix' are mutually
154
+ # exclusive) and what the relative ordering of variants is supposed to
155
+ # be. For example, the variant '1994' (Standardized Resian
156
+ # orthography) has several 'Prefix' fields in the registry ("sl-rozaj",
157
+ # "sl-rozaj-biske", "sl-rozaj-njiva", "sl-rozaj-osojs", and "sl-rozaj-
158
+ # solba"). This indicates not only that '1994' is appropriate to use
159
+ # with each of these five Resian variant subtags ('rozaj', 'biske',
160
+ # 'njiva', 'osojs', and 'solba'), but also that it SHOULD appear
161
+ # following any of these variants in a tag. Thus, the language tag
162
+ # ought to take the form "sl-rozaj-biske-1994", rather than "sl-1994-
163
+ # rozaj-biske" or "sl-rozaj-1994-biske".
164
+ #++
165
+
166
+ PREFIX_REGEX = /^(#{PATTERN::LANGUAGE})(?:-(#{PATTERN::SCRIPT}))?(?:-(#{PATTERN::REGION}))?(?:-(.+))?$/io.freeze
167
+
168
# Canonicalizes variants, applying the rules that are described in
# RFC 5646, sections 2.2.5 and 4.5. Also validates the sequence of
# variants using 'Prefix' field-values (see RFC 5646, Section 3.1.8).
#
# Returns +nil+. Raises Error when a variant is not registered or when
# none of its prefixes fits the tag built up so far.
#
def canonicalize_variants
  return unless @variants_sequence

  accepted = nil    # hyphen-joined names of the variants accepted so far
  replaced = false  # set once any variant is swapped for its preferred value

  @variants = variants.map do |variant|
    record = Subtags::Variant(variant)
    raise Error, "Variant #{variant.inspect} is not registered." unless record

    allowed = record.prefixes

    # A variant without prefixes fits anywhere. Otherwise one prefix must
    # agree with the language, with the script and region it names (if
    # any), and its variant part (if any) with the variants seen so far.
    fits = !allowed || allowed.any? { |prefix|
      PREFIX_REGEX === prefix
      found = Regexp.last_match
      language, script, region, tail = found ? found.captures : []

      (tail.nil? || tail == accepted) &&
        (region.nil? || @region && (region == @region || region == @region.upcase)) &&
        (script.nil? || @script && (script == @script || script == @script.capitalize)) &&
        (language == @language || language == @language.downcase)
    }

    unless fits
      raise Error,
        "Variant #{variant.inspect} requires " \
        "one of following prefixes: " \
        "#{allowed.map{ |p| p.inspect }.join(", ")}."
    end

    if accepted
      accepted << HYPHEN
    else
      accepted = ""
    end
    accepted << record.name

    preferred = record.preferred_value
    replaced = true if preferred
    preferred || variant
  end

  if replaced
    @variants_sequence = @variants.join(HYPHEN)
    dirty
  end

  nil
end

protected :canonicalize_variants
217
+
218
+ #--
219
+ # RFC 5646, Section 4.5
220
+ # Example: The language tag "en-a-aaa-b-ccc-bbb-x-xyz" is in canonical
221
+ # form, while "en-b-ccc-bbb-a-aaa-X-xyz" is well-formed and potentially
222
+ # valid (extensions 'a' and 'b' are not defined as of the publication
223
+ # of this document) but not in canonical form (the extensions are not
224
+ # in alphabetical order).
225
+ #++
226
+
227
# Puts the extensions into case-insensitive order. The sequence is cut
# in front of every singleton, the pieces are sorted by their
# lower-cased text and joined with hyphens again; the tag is only marked
# dirty when that changes the stored sequence.
#
# Returns +nil+.
#
def canonicalize_extensions
  return unless @extensions_sequence

  pieces = @extensions_sequence.split(EXTENSIONS_SEQUENCE_SPLITTER)
  pieces.sort! { |left, right| left.downcase <=> right.downcase }
  sorted = pieces.join(HYPHEN)

  if @extensions_sequence != sorted
    @extensions_sequence = sorted
    dirty
  end
  nil
end

protected :canonicalize_extensions
241
+
242
+ #--
243
+ # RFC 5646, Section 3.1.7
244
+ # For example, the tags "zh-yue-Hant-HK" and "yue-Hant-HK"
245
+ # are semantically equivalent and ought to be treated as
246
+ # if they were the same tag.
247
+ #++
248
+
249
# Tells whether this tag and +other+ are equal once both have been
# brought into canonical form. Neither of the two is modified.
def same?(other)
  mine, theirs = canonicalize, other.canonicalize
  mine == theirs
end
252
+
253
# Returns a canonicalized copy of the tag; the receiver stays as it is.
def canonicalize
  dup.tap { |copy| copy.canonicalize! }
end
258
+
259
# Brings the tag into canonical form in place.
#
# Returns +nil+, except when the tag as a whole is a redundant
# registration with a 'Preferred-Value': then the tag is recomposed from
# that value and the result of +recompose+ is returned.
#
def canonicalize!
  # 1. Extension sequences are ordered into case-insensitive ASCII order
  #    by singleton subtag.
  canonicalize_extensions

  # 2. Redundant or grandfathered tags are replaced by their
  #    'Preferred-Value', if there is one. (A redundant tag is a
  #    registration of a complete tag, such as "zh-Hant", whose subtags
  #    also appear individually in the registry.)
  redundant = Subtags::Redundant(composition)
  if redundant
    preferred = redundant.preferred_value
    return recompose(preferred) if preferred
  end

  # 3. Subtags are replaced by their 'Preferred-Value', if there is one.
  #    For extlangs, the original primary language subtag is also
  #    replaced if there is a primary language subtag in the
  #    'Preferred-Value'.
  canonicalize_language
  canonicalize_script
  canonicalize_region
  canonicalize_variants

  nil
end
293
+
294
# Longer, descriptive names for canonicalize! and canonicalize.
alias to_canonical_form! canonicalize!
alias to_canonical_form canonicalize
296
+
297
+ #--
298
+ # RFC 5646, Section 4.5
299
+ # For example, "hak-CN" (Hakka, China) has the primary language
300
+ # subtag 'hak', which in turn has an 'extlang' record with a
301
+ # 'Prefix' 'zh' (Chinese). The extlang form is "zh-hak-CN"
302
+ # (Chinese, Hakka, China).
303
+ #++
304
+
305
# Rewrites the tag into its extlang form in place: the tag is
# canonicalized first, then the language is replaced by
# "<prefix>-<language>", where the prefix comes from the extlang record
# of the canonical language.
#
# Returns +nil+. Raises Error when the canonical language has no extlang
# record (previously this surfaced as a NoMethodError on +nil+), on top
# of whatever canonicalize! raises.
#
def to_extlang_form!
  canonicalize!
  subtag = Subtags::Extlang(@language)
  raise Error, "Language #{@language.inspect} has no extlang record." unless subtag
  @primary = subtag.prefix
  @extlang = @language
  @language = "#{@primary}#{HYPHEN}#{@extlang}"
  dirty
  nil
end
314
+
315
# Returns a copy of the tag in extlang form; the receiver stays as it is.
def to_extlang_form
  dup.tap { |copy| copy.to_extlang_form! }
end
320
+
321
+ #--
322
+ # RFC 5646, Section 4.1
323
+ # The script subtag SHOULD NOT be used to form language tags unless
324
+ # the script adds some distinguishing information to the tag.
325
+ # ...
326
+ # The field 'Suppress-Script' in the primary or extended language
327
+ # record in the registry indicates script subtags that do not add
328
+ # distinguishing information for most applications; this field
329
+ # defines when users SHOULD NOT include a script subtag with a
330
+ # particular primary language subtag.
331
+ #
332
+ # For example, if an implementation selects content using Basic
333
+ # Filtering [RFC4647] (originally described in Section 14.4 of
334
+ # [RFC2616]) and the user requested the language range "en-US",
335
+ # content labeled "en-Latn-US" will not match the request and thus
336
+ # not be selected. Therefore, it is important to know when script
337
+ # subtags will customarily be used and when they ought not be used.
338
+ #++
339
+
340
# Drops the script in place when the record of the primary language
# names it as 'Suppress-Script'. The script is compared as written and
# capitalized, so "latn" is suppressed for a 'Suppress-Script' of "Latn"
# just like "Latn" is.
#
# Nothing happens when the tag has no script or no language, or when the
# primary language is a private-use one. The 'Suppress-Script' of an
# extlang record is not consulted.
#
# Returns +nil+. Raises Error when the primary language is not registered.
#
def suppress_script!
  return unless @script && @language
  decompose_language unless @primary

  return if PRIVATE_LANGUAGE_REGEX === @primary

  subtag = Subtags::Language(@primary)
  raise Error, "Language #{@primary.inspect} is not registered." unless subtag

  suppressed = subtag.suppress_script
  if suppressed && (@script == suppressed || @script.capitalize == suppressed)
    @script = nil
    dirty
  end
  nil
end
360
+
361
# Returns a copy of the tag with the script suppressed where applicable;
# the receiver stays as it is.
def suppress_script
  dup.tap { |copy| copy.suppress_script! }
end
366
+
367
+ end
368
+
369
# Mixes the methods of the Canonicalization module into Langtag.
class Langtag; include Canonicalization; end
372
+
373
+ end
374
+ end
375
+
376
+ # EOF