lang 0.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
module Lang #:nodoc:
  module Subtags
    # Registry entry for a region subtag. The class adds nothing to
    # Entry; it only gives region records a type of their own.
    class Region < Entry; end
  end
end
8
+
9
+ # EOF
@@ -0,0 +1,9 @@
1
module Lang #:nodoc:
  module Subtags
    # Registry entry for a script subtag. The class adds nothing to
    # Entry; it only gives script records a type of their own.
    class Script < Entry; end
  end
end
8
+
9
+ # EOF
@@ -0,0 +1,17 @@
1
module Lang #:nodoc:
  module Subtags
    # Registry entry for a variant subtag, extended with the list of
    # prefixes recorded for the variant.
    class Variant < Entry

      # Prefixes added so far, in insertion order. Stays +nil+ until
      # #add_prefix has been called at least once.
      attr_reader :prefixes

      # Appends +prefix+ to the list, creating the list on first use.
      # Returns the updated list.
      def add_prefix(prefix)
        (@prefixes ||= []) << prefix
      end

    end
  end
end
16
+
17
+ # EOF
data/lib/lang/tag.rb ADDED
@@ -0,0 +1,141 @@
1
+ require 'lang/tag/pattern'
2
+ require 'lang/tag/composition'
3
+ require 'lang/tag/langtag'
4
+ require 'lang/tag/grandfathered'
5
+ require 'lang/tag/privateuse'
6
+
7
+ module Lang
8
+
9
# Builds a tag object from +thing+. The grandfathered, langtag and
# privateuse constructors are tried in that order; the result of the
# first one that does not raise is returned. Raises ArgumentError when
# all three fail.
def self.Tag(thing)
  #return thing if Tag::Composition === thing
  begin
    Tag::Grandfathered(thing)
  rescue
    begin
      Tag::Langtag(thing)
    rescue
      Tag::Privateuse(thing)
    end
  end
rescue
  raise ArgumentError, "#{thing.inspect} is not a language tag."
end
17
+
18
+ module Tag
19
+
20
# Root of the exception hierarchy declared in this module.
class Error < StandardError; end

# Error subclass for a problem with a single tag component
# (for example a missing language).
class InvalidComponentError < Error; end
25
+
26
+ #--
27
+ # Grandfathered tags that do not match the 'langtag' production in the
28
+ # ABNF and would otherwise be invalid are considered 'irregular'
29
+ # grandfathered tags. With the exception of "en-GB-oed", which is a
30
+ # variant of "en-GB", each of them, in its entirety, represents a
31
+ # language.
32
+ #++
33
+
34
# Irregular grandfathered tags, keyed in lower case. Each maps to a
# replacement subtag (presumably the registry 'Preferred-Value' --
# confirm against the registry), or to +nil+ where none is listed here.
IRREGULAR = {
  'en-gb-oed'  => nil,
  'i-ami'      => 'ami',
  'i-bnn'      => 'bnn',
  'i-default'  => nil,
  'i-enochian' => nil,
  'i-hak'      => 'hak',
  'i-klingon'  => 'tlh',
  'i-lux'      => 'lb',
  'i-mingo'    => nil,
  'i-navajo'   => 'nv',
  'i-pwn'      => 'pwn',
  'i-tao'      => 'tao',
  'i-tay'      => 'tay',
  'i-tsu'      => 'tsu',
  'sgn-be-fr'  => 'sfb',
  'sgn-be-nl'  => 'vgt',
  'sgn-ch-de'  => 'sgg'
}.freeze
53
+
54
+ #--
55
+ # Grandfathered tags that (appear to) match the 'langtag' production in
56
+ # Figure 1 are considered 'regular' grandfathered tags. These tags
57
+ # contain one or more subtags that either do not individually appear in
58
+ # the registry or appear but with a different semantic meaning: each
59
+ # tag, in its entirety, represents a language or collection of
60
+ # languages.
61
+ #++
62
+
63
# Every grandfathered tag: the irregular ones from IRREGULAR plus the
# regular ones listed here, keyed in lower case. Values follow the
# same convention as in IRREGULAR.
GRANDFATHERED = IRREGULAR.merge({
  'art-lojban'  => 'jbo',
  'cel-gaulish' => nil,
  'no-bok'      => 'nb',
  'no-nyn'      => 'nn',
  'zh-guoyu'    => 'cmn',
  'zh-hakka'    => 'hak',
  'zh-min'      => nil,
  'zh-min-nan'  => 'nan',
  'zh-xiang'    => 'hsn'
}).freeze
74
+
75
# Subtag separator.
HYPHEN = '-'.freeze
# Argument for String#split: a Regexp on Ruby older than 1.9.1,
# the plain HYPHEN string otherwise.
HYPHEN_SPLITTER = RUBY_VERSION < '1.9.1' ? /-/.freeze : HYPHEN
# Singleton that introduces a private use sequence.
PRIVATEUSE = 'x'.freeze

# Whole-string matchers for the individual components, all
# case-insensitive. They are anchored with \A and \z rather than ^ and
# $: the latter match at every line boundary, so a string that merely
# contains one well-formed line (or ends in a newline) would pass.
LANGUAGE_REGEX = /\A(?:#{PATTERN::LANGUAGE})\z/io.freeze
SCRIPT_REGEX = /\A(?:#{PATTERN::SCRIPT})\z/io.freeze
REGION_REGEX = /\A(?:#{PATTERN::REGION})\z/io.freeze
VARIANTS_SEQUENCE_REGEX = /\A(?:#{PATTERN::VARIANT_SEQUENCE}+)\z/io.freeze
EXTENSIONS_SEQUENCE_REGEX = /\A#{PATTERN::EXTENSION_SEQUENCE}+\z/io.freeze
# Splits an extensions sequence in front of every singleton.
EXTENSIONS_SEQUENCE_SPLITTER = /(?:\A|-)(?=#{PATTERN::SINGLETON}-)/io.freeze
PRIVATEUSE_REGEX = /\A#{PATTERN::PRIVATEUSE}\z/io.freeze

# Decomposes a langtag. Captures: 1 language, 2 script, 3 region,
# 4 variants sequence, 5 extensions sequence. The private use part is
# only looked ahead at, not captured.
LANGTAG_REGEX = /\A
  (#{PATTERN::LANGUAGE})               (?# shortest ISO 639 code plus extlang or reserved for future use or registered language subtag)
  (?:-(#{PATTERN::SCRIPT}))?           (?# ISO 15924 code)
  (?:-(#{PATTERN::REGION}))?           (?# ISO 3166-1 code or UN M.49 code)
  (#{PATTERN::VARIANT_SEQUENCE}*)?     (?# registered variants)
  (#{PATTERN::EXTENSION_SEQUENCE}*)?   (?# extensions)
  (?=(?:-#{PATTERN::PRIVATEUSE})?\z)   (?# privateuse)
/iox.freeze

# Checks well-formedness only; nothing is captured. Uses the looser
# language pattern (see the inline comment on the first line).
LANGTAG_WELLFORMEDNESS_REGEX = /\A
  (?:#{PATTERN::LOOSE_LANGUAGE})       (?# shortest ISO 639 code plus at most 3 extlangs or reserved for future use or registered language subtag)
  (?:-(?:#{PATTERN::SCRIPT}))?         (?# ISO 15924 code)
  (?:-(?:#{PATTERN::REGION}))?         (?# ISO 3166-1 code or UN M.49 code)
  (?=#{PATTERN::VARIANT_SEQUENCE}*     (?# registered variants)
  #{PATTERN::EXTENSION_SEQUENCE}*      (?# extensions)
  (?:-#{PATTERN::PRIVATEUSE})?\z)      (?# privateuse)
/iox.freeze
103
+
104
+ class << self
105
+
106
# Tells whether +snippet+ matches PRIVATEUSE_REGEX, i.e. is written as
# a 'privateuse' language tag. Letter case is ignored. Returns +true+
# or +false+.
#
def privateuse?(snippet)
  case snippet
  when PRIVATEUSE_REGEX then true
  else false
  end
end
112
+
113
# Tells whether +snippet+ is one of the keys of GRANDFATHERED, i.e.
# represents a 'grandfathered' language tag. Letter case is ignored.
#
# Values that cannot be downcased (such as +nil+) yield +false+
# instead of raising, which is how #privateuse? treats them too.
#
def grandfathered?(snippet)
  # Exact hit first: avoids allocating a downcased copy.
  return true if GRANDFATHERED.key?(snippet)
  snippet.respond_to?(:downcase) && GRANDFATHERED.key?(snippet.downcase)
end
119
+
120
+ #--
121
+ # RFC 5646, Section 2.2.9:
122
+ # A tag is considered "well-formed" if it conforms to the ABNF
123
+ # (Section 2.1). Language tags may be well-formed in terms of syntax
124
+ # but not valid in terms of content. However, many operations
125
+ # involving language tags work well without knowing anything about the
126
+ # meaning or validity of the subtags.
127
+ #++
128
+
129
# Tells whether +snippet+ is a well-formed language tag: a privateuse
# tag, a grandfathered tag, or anything LANGTAG_WELLFORMEDNESS_REGEX
# accepts. The checks run in that order and stop at the first hit.
# Letter case is ignored.
#
def wellformed?(snippet)
  verdict = privateuse?(snippet)
  verdict ||= grandfathered?(snippet)
  verdict ||= LANGTAG_WELLFORMEDNESS_REGEX === snippet
  verdict
end
135
+
136
+ end
137
+
138
+ end
139
+ end
140
+
141
+ # EOF
@@ -0,0 +1,376 @@
1
+ require 'lang/tag'
2
+ require 'lang/subtags'
3
+
4
+ module Lang #:nodoc:
5
+ module Tag
6
+
7
+ module Canonicalization
8
+
9
# Raised when canonicalization or validation of a tag fails.
#
# The superclass is spelled out in full. A bare +Error+ resolves to
# Lang::Tag::Error only while this class does not exist yet, so
# evaluating this file a second time would fail with
# "superclass mismatch for class Error".
#
class Error < Lang::Tag::Error
end
14
+
15
+ #--
16
+ # RFC 5646, Section 2.2.1
17
+ # The subtags in the range 'qaa' through 'qtz' are reserved for
18
+ # private use in language tags. These subtags correspond to codes
19
+ # reserved by ISO 639-2 for private use. These codes MAY be used
20
+ # for non-registered primary language subtags (instead of using
21
+ # private use subtags following 'x-').
22
+ #++
23
+
24
# Matches the private use primary language subtags 'qaa' through
# 'qtz', ignoring case. Anchored with \A and \z so that only the whole
# string can match (^ and $ would also match a single line of
# multi-line input).
PRIVATE_LANGUAGE_REGEX = /\Aq[a-t][a-z]\z/i.freeze
25
+
26
+ #--
27
+ # RFC 5646, Section 3.1.7
28
+ # Extended language subtags always have a mapping to their
29
+ # identical primary language subtag. For example, the extended
30
+ # language subtag 'yue' (Cantonese) can be used to form the tag
31
+ # "zh-yue". It has a 'Preferred-Value' mapping to the primary
32
+ # language subtag 'yue', meaning that a tag such as
33
+ # "zh-yue-Hant-HK" can be canonicalized to "yue-Hant-HK".
34
+ #++
35
+
36
# Canonicalizes the language component, applying the rules described
# in RFC 5646, sections 2.2.1, 2.2.2 and 4.5. Also validates the
# language sequence using the 'Prefix' field-value of the extlang.
#
# Raises InvalidComponentError when no language is set, and Error when
# a subtag is not registered or the extlang does not follow its
# prefix. Always returns +nil+.
#
def canonicalize_language
  raise InvalidComponentError, "Language can not be omitted." unless @language
  decompose_language unless @primary

  if @extlang
    record = Subtags::Extlang(@extlang)
    raise Error, "Extlang #{@extlang.inspect} is not registered." unless record

    # RFC 5646, Section 3.4: an extlang record carries exactly one
    # 'Prefix'; the primary language has to be that prefix (compared
    # as given and downcased).
    required = record.prefix
    if required != @primary && required != @primary.downcase
      raise Error, "Extlang #{@extlang.inspect} requires prefix #{required.inspect}."
    end

    # RFC 5646, Sections 2.2.2 and 4.5: the 'Preferred-Value' of the
    # extlang replaces the whole language-extlang sequence.
    @language = record.preferred_value
    @primary = @extlang = nil
    dirty
    return nil
  end

  # Private use languages ('qaa'..'qtz') are not looked up.
  return nil if PRIVATE_LANGUAGE_REGEX =~ @primary

  record = Subtags::Language(@primary)
  raise Error, "Language #{@primary.inspect} is not registered." unless record

  preferred = record.preferred_value
  if preferred
    @language = preferred
    @primary = nil
    dirty
  end

  nil
end
90
+
91
+ protected :canonicalize_language
92
+
93
+ #--
94
+ # RFC 5646, Section 2.2.3
95
+ # The script subtags 'Qaaa' through 'Qabx' are reserved for private
96
+ # use in language tags. These subtags correspond to codes reserved
97
+ # by ISO 15924 for private use. These codes MAY be used for non-
98
+ # registered script values. Please refer to Section 4.6 for more
99
+ # information on private use subtags.
100
+ #++
101
+
102
# Matches the private use script subtags 'Qaaa' through 'Qabx',
# ignoring case: every 'Qaa?' code plus 'Qaba'..'Qabx'. (A single
# class [a-x] for the last letter would leave out 'Qaay' and 'Qaaz',
# which are inside the reserved range.) Anchored with \A and \z so
# that only the whole string can match.
PRIVATE_SCRIPT_REGEX = /\AQa(?:a[a-z]|b[a-x])\z/i.freeze
103
+
104
# Replaces the script by the 'preferred_value' of its registry record,
# if the record has one. Nothing happens when no script is set or the
# script is a private use one. Raises Error for an unregistered
# script. Always returns +nil+.
#
def canonicalize_script
  return nil unless @script
  return nil if PRIVATE_SCRIPT_REGEX === @script

  record = Subtags::Script(@script)
  raise Error, "Script #{@script.inspect} is not registered." unless record

  replacement = record.preferred_value
  return nil unless replacement

  @script = replacement
  dirty
  nil
end
114
+
115
+ protected :canonicalize_script
116
+
117
+ #--
118
+ # RFC 5646, Section 2.2.4
119
+ # The region subtags 'AA', 'QM'-'QZ', 'XA'-'XZ', and 'ZZ' are
120
+ # reserved for private use in language tags. These subtags
121
+ # correspond to codes reserved by ISO 3166 for private use. These
122
+ # codes MAY be used for private use region subtags (instead of
123
+ # using a private use subtag sequence). Please refer to
124
+ # Section 4.6 for more information on private use subtags.
125
+ #++
126
+
127
# Matches the private use region subtags 'AA', 'QM'..'QZ', 'XA'..'XZ'
# and 'ZZ', ignoring case. Anchored with \A and \z so that only the
# whole string can match (^ and $ would also match a single line of
# multi-line input).
PRIVATE_REGION_REGEX = /\A(?:AA|Q[M-Z]|X[A-Z]|ZZ)\z/i.freeze
128
+
129
+ #--
130
+ # RFC 5646, Section 4.5
131
+ # Example: Although the tag "en-BU" (English as used in Burma)
132
+ # maintains its validity, the language tag "en-BU" is not in canonical
133
+ # form because the 'BU' subtag has a canonical mapping to 'MM'
134
+ # (Myanmar).
135
+ #++
136
+
137
# Replaces the region by the 'preferred_value' of its registry record,
# if the record has one. Nothing happens when no region is set or the
# region is a private use one. Raises Error for an unregistered
# region. Always returns +nil+.
#
def canonicalize_region
  return nil unless @region
  return nil if PRIVATE_REGION_REGEX === @region

  record = Subtags::Region(@region)
  raise Error, "Region #{@region.inspect} is not registered." unless record

  replacement = record.preferred_value
  return nil unless replacement

  @region = replacement
  dirty
  nil
end
147
+
148
+ protected :canonicalize_region
149
+
150
+ #--
151
+ # RFC 5646, Section 3.1.8
152
+ # The 'Prefix' also indicates when variant subtags make sense when used
153
+ # together (many that otherwise share a 'Prefix' are mutually
154
+ # exclusive) and what the relative ordering of variants is supposed to
155
+ # be. For example, the variant '1994' (Standardized Resian
156
+ # orthography) has several 'Prefix' fields in the registry ("sl-rozaj",
157
+ # "sl-rozaj-biske", "sl-rozaj-njiva", "sl-rozaj-osojs", and "sl-rozaj-
158
+ # solba"). This indicates not only that '1994' is appropriate to use
159
+ # with each of these five Resian variant subtags ('rozaj', 'biske',
160
+ # 'njiva', 'osojs', and 'solba'), but also that it SHOULD appear
161
+ # following any of these variants in a tag. Thus, the language tag
162
+ # ought to take the form "sl-rozaj-biske-1994", rather than "sl-1994-
163
+ # rozaj-biske" or "sl-rozaj-1994-biske".
164
+ #++
165
+
166
# Takes a 'Prefix' field-value apart. Captures, in order: language,
# script (optional), region (optional) and whatever follows the next
# hyphen (optional) -- for a variant prefix, the variants that have to
# precede it.
PREFIX_REGEX = /^
  (#{PATTERN::LANGUAGE})
  (?:-(#{PATTERN::SCRIPT}))?
  (?:-(#{PATTERN::REGION}))?
  (?:-(.+))?
$/iox.freeze
167
+
168
# Canonicalizes variants, applying the rules described in RFC 5646,
# sections 2.2.5 and 4.5. Also validates the sequence of variants
# using 'Prefix' field-values (see RFC 5646, Section 3.1.8).
#
# Does nothing when there is no variants sequence. Raises Error for an
# unregistered variant and for one none of whose prefixes fits the
# tag. Always returns +nil+.
#
def canonicalize_variants
  return unless @variants_sequence

  accepted = nil    # hyphen-joined names of the variants accepted so far
  replaced = false  # true once a variant was swapped for its preferred value

  @variants = variants.map do |variant|
    record = Subtags::Variant(variant)
    raise Error, "Variant #{variant.inspect} is not registered." unless record

    # A variant without prefixes fits anywhere. Otherwise at least one
    # prefix has to agree with the preceding variants, the region, the
    # script and the language of this tag.
    fits = !record.prefixes || record.prefixes.any? { |prefix|
      matched = PREFIX_REGEX === prefix
      language, script, region, preceding = matched ? Regexp.last_match.captures : []
      (preceding.nil? || preceding == accepted) &&
      (region.nil? || @region && (region == @region || region == @region.upcase)) &&
      (script.nil? || @script && (script == @script || script == @script.capitalize)) &&
      (language == @language || language == @language.downcase)
    }

    unless fits
      raise Error,
        "Variant #{variant.inspect} requires " \
        "one of following prefixes: " \
        "#{record.prefixes.map{ |p| p.inspect }.join(", ")}."
    end

    if accepted
      accepted << HYPHEN
    else
      accepted = ""
    end
    accepted << record.name

    preferred = record.preferred_value
    next variant unless preferred

    replaced = true
    preferred
  end

  if replaced
    @variants_sequence = @variants.join(HYPHEN)
    dirty
  end

  nil
end
215
+
216
+ protected :canonicalize_variants
217
+
218
+ #--
219
+ # RFC 5646, Section 4.5
220
+ # Example: The language tag "en-a-aaa-b-ccc-bbb-x-xyz" is in canonical
221
+ # form, while "en-b-ccc-bbb-a-aaa-X-xyz" is well-formed and potentially
222
+ # valid (extensions 'a' and 'b' are not defined as of the publication
223
+ # of this document) but not in canonical form (the extensions are not
224
+ # in alphabetical order).
225
+ #++
226
+
227
# Reorders the extensions so that they are sorted case-insensitively,
# singleton first. The sequence is only replaced (and the tag marked
# dirty) when the order actually changes. Does nothing without an
# extensions sequence. Always returns +nil+.
#
def canonicalize_extensions
  return unless @extensions_sequence

  extensions = @extensions_sequence.split(EXTENSIONS_SEQUENCE_SPLITTER)
  extensions.sort! { |left, right| left.downcase <=> right.downcase }
  ordered = extensions.join(HYPHEN)

  if @extensions_sequence != ordered
    @extensions_sequence = ordered
    dirty
  end
  nil
end
239
+
240
+ protected :canonicalize_extensions
241
+
242
+ #--
243
+ # RFC 5646, Section 3.1.7
244
+ # For example, the tags "zh-yue-Hant-HK" and "yue-Hant-HK"
245
+ # are semantically equivalent and ought to be treated as
246
+ # if they were the same tag.
247
+ #++
248
+
249
# Compares the canonical forms of this tag and +other+ with ==.
# Neither object is modified, because #canonicalize works on copies.
#
def same?(other)
  mine = canonicalize
  theirs = other.canonicalize
  mine == theirs
end
252
+
253
# Non-destructive variant of #canonicalize!: duplicates the receiver,
# canonicalizes the duplicate and returns it.
#
def canonicalize
  dup.tap { |copy| copy.canonicalize! }
end
258
+
259
# Brings the tag into canonical form in place, following the steps
# listed in RFC 5646, Section 4.5.
#
# Returns whatever #recompose returns when the tag is a redundant one
# with a preferred value, and +nil+ otherwise.
#
def canonicalize!

  # 1. Extension sequences are ordered into case-insensitive ASCII
  #    order by singleton subtag.

  canonicalize_extensions

  # 2. Redundant or grandfathered tags are replaced by their
  #    'Preferred-Value', if there is one. (A redundant tag is a
  #    grandfathered registration whose subtags now all appear in the
  #    registry with the same meaning, e.g. "zh-Hant".)

  redundant = Subtags::Redundant(composition)
  preferred = redundant && redundant.preferred_value
  return recompose(preferred) if preferred

  # 3. Subtags are replaced by their 'Preferred-Value', if there is
  #    one. For extlangs, the original primary language subtag is also
  #    replaced if there is a primary language subtag in the
  #    'Preferred-Value'.

  canonicalize_language
  canonicalize_script
  canonicalize_region
  canonicalize_variants

  nil
end
293
+
294
+ alias :to_canonical_form! :canonicalize!
295
+ alias :to_canonical_form :canonicalize
296
+
297
+ #--
298
+ # RFC 5646, Section 4.5
299
+ # For example, "hak-CN" (Hakka, China) has the primary language
300
+ # subtag 'hak', which in turn has an 'extlang' record with a
301
+ # 'Prefix' 'zh' (Chinese). The extlang form is "zh-hak-CN"
302
+ # (Chinese, Hakka, China).
303
+ #++
304
+
305
# Brings the tag into extlang form in place: the tag is canonicalized
# first and then, when its language is also registered as an extlang,
# the extlang's prefix is put in front of it ("hak" becomes "zh-hak").
#
# A language without an extlang record has nothing to prepend, so the
# tag is left in canonical form instead of failing on the missing
# record. Errors raised by #canonicalize! propagate. Always returns
# +nil+.
#
def to_extlang_form!
  canonicalize!
  subtag = Subtags::Extlang(@language)
  return nil unless subtag

  @primary = subtag.prefix
  @extlang = @language
  @language = "#{@primary}#{HYPHEN}#{@extlang}"
  dirty
  nil
end
314
+
315
# Non-destructive variant of #to_extlang_form!: duplicates the
# receiver, converts the duplicate and returns it.
#
def to_extlang_form
  dup.tap { |copy| copy.to_extlang_form! }
end
320
+
321
+ #--
322
+ # RFC 5646, Section 4.1
323
+ # The script subtag SHOULD NOT be used to form language tags unless
324
+ # the script adds some distinguishing information to the tag.
325
+ # ...
326
+ # The field 'Suppress-Script' in the primary or extended language
327
+ # record in the registry indicates script subtags that do not add
328
+ # distinguishing information for most applications; this field
329
+ # defines when users SHOULD NOT include a script subtag with a
330
+ # particular primary language subtag.
331
+ #
332
+ # For example, if an implementation selects content using Basic
333
+ # Filtering [RFC4647] (originally described in Section 14.4 of
334
+ # [RFC2616]) and the user requested the language range "en-US",
335
+ # content labeled "en-Latn-US" will not match the request and thus
336
+ # not be selected. Therefore, it is important to know when script
337
+ # subtags will customarily be used and when they ought not be used.
338
+ #++
339
+
340
# Drops the script in place when the registry record of the primary
# language names it as the script to suppress.
#
# The script is compared as given and capitalized, so "latn" is
# suppressed just like "Latn" (the same tolerance #canonicalize_variants
# applies to the script). Nothing happens when script or language is
# missing or the primary language is a private use one. Raises Error
# when the primary language is not registered. Always returns +nil+.
#
def suppress_script!
  return unless @script && @language
  decompose_language unless @primary

  return if PRIVATE_LANGUAGE_REGEX === @primary

  subtag = Subtags::Language(@primary)
  raise Error, "Language #{@primary.inspect} is not registered." unless subtag

  suppressed = subtag.suppress_script
  if suppressed && (suppressed == @script || suppressed == @script.capitalize)
    @script = nil
    dirty
  #elsif @extlang
  #  subtag = Subtags::Extlang(@extlang)
  #  raise Error, "Extlang #{@extlang.inspect} is not registered." unless subtag
  #  if subtag.suppress_script && @script == subtag.suppress_script
  #    dirty
  #  end
  end
  nil
end
360
+
361
# Non-destructive variant of #suppress_script!: duplicates the
# receiver, applies the suppression to the duplicate and returns it.
#
def suppress_script
  dup.tap { |copy| copy.suppress_script! }
end
366
+
367
+ end
368
+
369
# Mixes the canonicalization methods into Langtag (presumably already
# defined by the 'lang/tag' require at the top of this file).
class Langtag; include Canonicalization; end
372
+
373
+ end
374
+ end
375
+
376
+ # EOF