bcp47_spec 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bcp47_spec/parser.rb +126 -124
- data/lib/bcp47_spec/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2039159ce3a6ad3a971a0fd8d0119c83ef6727447651cc4eacfe0a2cca9dab15
|
4
|
+
data.tar.gz: d3c6f3bf99c68cfe41612d26ca2ce50de63244cc2928c1ec4c6532368cdecb42
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 32ad050b2858a0300439ef971602706f387722e52ca8b64be443bc325fd0c8523e33eecbdc63a20c01b583b33da90578a15b16744574749b523813e2618cf4d8
|
7
|
+
data.tar.gz: 792e8721f1b5b894f0632e4b64a7734a93a8f8900cc00faae9a0cef4cdc862c6a7fe96014177a048804ab4ff4d1872b8b1c6313cf2392e78eacae72b7eb179ea
|
data/lib/bcp47_spec/parser.rb
CHANGED
@@ -1,135 +1,137 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
module BCP47
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
3
|
+
module BCP47
  # Regex-based parser for the `langtag` production of BCP 47 language tags.
  module Parser
    # https://tools.ietf.org/html/bcp47#section-2.1
    # Augmented BNF for Syntax Specifications: https://tools.ietf.org/html/rfc5234
    #
    # Language-Tag  = langtag             ; normal language tags
    #               / privateuse          ; private use tag
    #               / grandfathered       ; grandfathered tags
    #
    # langtag       = language
    #                 ["-" script]
    #                 ["-" region]
    #                 *("-" variant)
    #                 *("-" extension)
    #                 ["-" privateuse]
    #
    # language      = 2*3ALPHA            ; shortest ISO 639 code
    #                 ["-" extlang]       ; sometimes followed by
    #                                     ; extended language subtags
    #               / 4ALPHA              ; or reserved for future use
    #               / 5*8ALPHA            ; or registered language subtag
    #
    # extlang       = 3ALPHA              ; selected ISO 639 codes
    #                 *2("-" 3ALPHA)      ; permanently reserved
    #
    # script        = 4ALPHA              ; ISO 15924 code
    #
    # region        = 2ALPHA              ; ISO 3166-1 code
    #               / 3DIGIT              ; UN M.49 code
    #
    # variant       = 5*8alphanum         ; registered variants
    #               / (DIGIT 3alphanum)
    #
    # extension     = singleton 1*("-" (2*8alphanum))
    #
    #                                     ; Single alphanumerics
    #                                     ; "x" reserved for private use
    # singleton     = DIGIT               ; 0 - 9
    #               / %x41-57             ; A - W
    #               / %x59-5A             ; Y - Z
    #               / %x61-77             ; a - w
    #               / %x79-7A             ; y - z
    #
    # privateuse    = "x" 1*("-" (1*8alphanum))
    #
    # grandfathered = irregular           ; non-redundant tags registered
    #               / regular             ; during the RFC 3066 era
    #
    # irregular     = "en-GB-oed"         ; irregular tags do not match
    #               / "i-ami"             ; the 'langtag' production and
    #               / "i-bnn"             ; would not otherwise be
    #               / "i-default"         ; considered 'well-formed'
    #               / "i-enochian"        ; These tags are all valid,
    #               / "i-hak"             ; but most are deprecated
    #               / "i-klingon"         ; in favor of more modern
    #               / "i-lux"             ; subtags or subtag
    #               / "i-mingo"           ; combination
    #               / "i-navajo"
    #               / "i-pwn"
    #               / "i-tao"
    #               / "i-tay"
    #               / "i-tsu"
    #               / "sgn-BE-FR"
    #               / "sgn-BE-NL"
    #               / "sgn-CH-DE"
    #
    # regular       = "art-lojban"        ; these tags match the 'langtag'
    #               / "cel-gaulish"       ; production, but their subtags
    #               / "no-bok"            ; are not extended language
    #               / "no-nyn"            ; or variant subtags: their meaning
    #               / "zh-guoyu"          ; is defined by their registration
    #               / "zh-hakka"          ; and all of these are deprecated
    #               / "zh-min"            ; in favor of a more modern
    #               / "zh-min-nan"        ; subtag or sequence of subtags
    #               / "zh-xiang"
    #
    # alphanum      = (ALPHA / DIGIT)     ; letters and numbers

    # Simplified check. Not implementing high level privateuse / grandfathered.
    # Should replace with a proper check at some point.
    ALPHANUM = /[a-zA-Z\d]/
    SINGLETON = /[\dA-WY-Za-wy-z]/

    EXTLANG = /[a-zA-Z]{3}(-[a-zA-Z]{3}){0,2}/

    LANGUAGE = /([a-zA-Z]{2,3}(-#{EXTLANG})?|[a-zA-Z]{4}|[a-zA-Z]{5,8})/
    SCRIPT = /[a-zA-Z]{4}/
    REGION = /([a-zA-Z]{2}|\d{3})/
    VARIANT = /(#{ALPHANUM}{5,8}|\d#{ALPHANUM}{3})/
    # Extension subtags are 2*8alphanum, so digits must be accepted (e.g. "t-m0-ungegn").
    EXTENSION = /#{SINGLETON}(-#{ALPHANUM}{2,8})+/
    # ABNF string literals are case-insensitive, so both "x" and "X" introduce private use.
    PRIVATEUSE = /[xX](-#{ALPHANUM}{1,8})+/

    # A repeated capture group only retains its last iteration, so the repeatable
    # parts (variants, extensions) are each captured whole in a single named group
    # and broken down into multiple subtags separately afterwards.
    LANGTAG = %r{
      (?<language>#{LANGUAGE})
      (-(?<script>#{SCRIPT}))?
      (-(?<region>#{REGION}))?
      (?<variants>(-#{VARIANT})*)
      (?<extensions>(-#{EXTENSION})*)
      (-(?<private>#{PRIVATEUSE}))?
    }x

    LANGUAGE_TAG = /\A#{LANGTAG}\z/

    class << self
      # Parses a language tag against the `langtag` production.
      #
      # @param language_tag [#to_s] the tag to parse, e.g. "de-DE-u-co-phonebk"
      # @return [Hash{String => Object}, nil] nil when the tag does not match;
      #   otherwise a hash with the keys:
      #   - "language", "script", "region": matched String or nil
      #   - "variants": sorted Array of variant subtags
      #   - "extensions": Array of [singleton, rest] pairs, sorted by the extension text
      #   - "private": sorted Array of private-use subtags (without the "x" prefix)
      def parse(language_tag)
        # to_s makes nil input fall through to the "no match" path instead of raising.
        match = language_tag.to_s.match(LANGUAGE_TAG)
        return unless match

        named_captures(match).tap do |captures|
          captures['variants'] = split_subtags(captures['variants'])
          captures['extensions'] = split_extensions(captures['extensions'])
          captures['private'] = split_subtags(captures['private'].to_s.sub(/\A[xX]-/, ''))
        end
      end

      private

      # Returns the named captures of +match+ as a Hash keyed by group name.
      # Falls back to building the hash by hand when MatchData#named_captures
      # is not available.
      def named_captures(match)
        return match.named_captures if match.respond_to?(:named_captures)

        match.names.each_with_object({}) { |name, acc| acc[name] = match[name] }
      end

      # Splits a "-"-separated run of subtags (optionally with a leading "-")
      # into a sorted array; nil or empty input yields [].
      def split_subtags(subtags)
        subtags.to_s.split('-').reject(&:empty?).sort
      end

      # Splits the raw extensions capture (e.g. "-u-attr-co-phonebk-t-und-cyrl")
      # into [singleton, rest] pairs sorted by the extension text.
      def split_extensions(extensions)
        return [] if extensions.to_s.empty?

        # [["u-attr-co-phonebk"], ["t-und-cyrl"]]
        extensions = extensions.scan(/\b(?<ext>#{EXTENSION})\b/)
        # [["t", "und-cyrl"], ["u", "attr-co-phonebk"]]
        extensions.flatten.sort.map { |st| st.split('-', 2) }
      end
    end
  end
end
|
data/lib/bcp47_spec/version.rb
CHANGED